├── .ipynb_checkpoints
    ├── Summ_Gensim-checkpoint.ipynb
    ├── Summ_Pytease-checkpoint.ipynb
    ├── Summ_Sumy_lex-checkpoint.ipynb
    ├── Summ_Sumy_lsa-checkpoint.ipynb
    ├── Summ_Sumy_luhn-checkpoint.ipynb
    └── bleu_model-checkpoint.ipynb
├── README.md
├── Summ_Gensim.ipynb
├── Summ_Pytease.ipynb
├── Summ_Sumy_lex.ipynb
├── Summ_Sumy_lsa.ipynb
├── Summ_Sumy_luhn.ipynb
├── bleu_model.ipynb
└── pyrougetextrank.py


/.ipynb_checkpoints/Summ_Gensim-checkpoint.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 4,
 6 |    "metadata": {
 7 |     "collapsed": false
 8 |    },
 9 |    "outputs": [],
10 |    "source": [
11 |     "import os\n",
12 |     "from gensim.summarization import summarize"
13 |    ]
14 |   },
15 |   {
16 |    "cell_type": "code",
17 |    "execution_count": 5,
18 |    "metadata": {
19 |     "collapsed": false
20 |    },
21 |    "outputs": [],
22 |    "source": [
23 |     "def summ_it():\n",
24 |     "    total = 0\n",
25 |     "    for file in os.listdir('topics'):\n",
26 |     "        with open('topics/' + file, 'r') as f:\n",
27 |     "            summ = summarize(unicode(f.read().replace('\\n',' '), errors='ignore'), word_count=75)\n",
28 |     "            with open('summ_gensim/' + file.split('.')[0] + '.txt', 'w') as fw:\n",
29 |     "                fw.writelines(summ)\n",
30 |     "                total += len(summ)"
31 |    ]
32 |   },
33 |   {
34 |    "cell_type": "code",
35 |    "execution_count": 6,
36 |    "metadata": {
37 |     "collapsed": false
38 |    },
39 |    "outputs": [
40 |     {
41 |      "name": "stdout",
42 |      "output_type": "stream",
43 |      "text": [
44 |       "1 loop, best of 3: 57.7 s per loop\n"
45 |      ]
46 |     }
47 |    ],
48 |    "source": [
49 |     "summ_it()"
50 |    ]
51 |   },
52 |   {
53 |    "cell_type": "code",
54 |    "execution_count": null,
55 |    "metadata": {
56 |     "collapsed": true
57 |    },
58 |    "outputs": [],
59 |    "source": []
60 |   }
61 |  ],
62 |  "metadata": {
63 |   "anaconda-cloud": {},
64 |   "kernelspec": {
65 |    "display_name": "Python [default]",
66 |    "language": "python",
67 |    "name": "python2"
68 |   },
69 |   "language_info": {
70 |    "codemirror_mode": {
71 |     "name": "ipython",
72 |     "version": 2
73 |    },
74 |    "file_extension": ".py",
75 |    "mimetype": "text/x-python",
76 |    "name": "python",
77 |    "nbconvert_exporter": "python",
78 |    "pygments_lexer": "ipython2",
79 |    "version": "2.7.12"
80 |   }
81 |  },
82 |  "nbformat": 4,
83 |  "nbformat_minor": 0
84 | }
85 | 


--------------------------------------------------------------------------------
/.ipynb_checkpoints/Summ_Pytease-checkpoint.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": null,
 6 |    "metadata": {
 7 |     "collapsed": false
 8 |    },
 9 |    "outputs": [],
10 |    "source": [
11 |     "import os\n",
12 |     "from pyteaser import Summarize"
13 |    ]
14 |   },
15 |   {
16 |    "cell_type": "code",
17 |    "execution_count": null,
18 |    "metadata": {
19 |     "collapsed": false
20 |    },
21 |    "outputs": [],
22 |    "source": [
23 |     "for file in os.listdir('topics'):\n",
24 |     "    with open('topics/' + file, 'r') as f:\n",
25 |     "        #print file\n",
26 |     "        summ = Summarize('',unicode(f.read().replace('\\n',' '), errors='ignore'))\n",
27 |     "        with open('summ_pyteaser/' + file.split('.')[0] + '.txt', 'w') as fw:\n",
28 |     "            fw.writelines(summ)"
29 |    ]
30 |   },
31 |   {
32 |    "cell_type": "code",
33 |    "execution_count": null,
34 |    "metadata": {
35 |     "collapsed": true
36 |    },
37 |    "outputs": [],
38 |    "source": []
39 |   }
40 |  ],
41 |  "metadata": {
42 |   "anaconda-cloud": {},
43 |   "kernelspec": {
44 |    "display_name": "Python [default]",
45 |    "language": "python",
46 |    "name": "python2"
47 |   },
48 |   "language_info": {
49 |    "codemirror_mode": {
50 |     "name": "ipython",
51 |     "version": 2
52 |    },
53 |    "file_extension": ".py",
54 |    "mimetype": "text/x-python",
55 |    "name": "python",
56 |    "nbconvert_exporter": "python",
57 |    "pygments_lexer": "ipython2",
58 |    "version": "2.7.12"
59 |   }
60 |  },
61 |  "nbformat": 4,
62 |  "nbformat_minor": 0
63 | }
64 | 


--------------------------------------------------------------------------------
/.ipynb_checkpoints/Summ_Sumy_lex-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 5,
  6 |    "metadata": {
  7 |     "collapsed": false
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import os\n",
 12 |     "from sumy.summarizers.lex_rank import LexRankSummarizer\n",
 13 |     "from sumy.parsers.plaintext import PlaintextParser\n",
 14 |     "from sumy.nlp.tokenizers import Tokenizer\n",
 15 |     "from sumy.nlp.stemmers import Stemmer\n",
 16 |     "from sumy.utils import get_stop_words"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "code",
 21 |    "execution_count": 6,
 22 |    "metadata": {
 23 |     "collapsed": true
 24 |    },
 25 |    "outputs": [],
 26 |    "source": [
 27 |     "LANGUAGE = \"english\"\n",
 28 |     "SENTENCES_COUNT = 5"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "code",
 33 |    "execution_count": 7,
 34 |    "metadata": {
 35 |     "collapsed": true
 36 |    },
 37 |    "outputs": [],
 38 |    "source": [
 39 |     "def summ_it():\n",
 40 |     "    length = 0\n",
 41 |     "    total = 0\n",
 42 |     "    for file in os.listdir('topics'):\n",
 43 |     "        with open('topics/' + file, 'r') as f:\n",
 44 |     "            parser = PlaintextParser.from_string(unicode(f.read().replace('\\n',' '), errors='ignore'), Tokenizer(LANGUAGE))\n",
 45 |     "            stemmer = Stemmer(LANGUAGE)\n",
 46 |     "            summarizer = LexRankSummarizer(stemmer)\n",
 47 |     "            summarizer.stop_words = get_stop_words(LANGUAGE)\n",
 48 |     "            with open('summ_sumy_lex/' + file.split('.')[0] + '.txt', 'w') as fw:\n",
 49 |     "                for sentence in summarizer(parser.document, SENTENCES_COUNT):\n",
 50 |     "                    #print sentence\n",
 51 |     "                    fw.writelines(str(sentence))\n",
 52 |     "                    length += len(str(sentence))\n",
 53 |     "                total += length\n",
 54 |     "                length = 0"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "code",
 59 |    "execution_count": 8,
 60 |    "metadata": {
 61 |     "collapsed": false
 62 |    },
 63 |    "outputs": [
 64 |     {
 65 |      "name": "stdout",
 66 |      "output_type": "stream",
 67 |      "text": [
 68 |       "1 loop, best of 3: 41.2 s per loop\n"
 69 |      ]
 70 |     }
 71 |    ],
 72 |    "source": [
 73 |     "summ_it()"
 74 |    ]
 75 |   },
 76 |   {
 77 |    "cell_type": "code",
 78 |    "execution_count": null,
 79 |    "metadata": {
 80 |     "collapsed": true
 81 |    },
 82 |    "outputs": [],
 83 |    "source": []
 84 |   }
 85 |  ],
 86 |  "metadata": {
 87 |   "anaconda-cloud": {},
 88 |   "kernelspec": {
 89 |    "display_name": "Python [default]",
 90 |    "language": "python",
 91 |    "name": "python2"
 92 |   },
 93 |   "language_info": {
 94 |    "codemirror_mode": {
 95 |     "name": "ipython",
 96 |     "version": 2
 97 |    },
 98 |    "file_extension": ".py",
 99 |    "mimetype": "text/x-python",
100 |    "name": "python",
101 |    "nbconvert_exporter": "python",
102 |    "pygments_lexer": "ipython2",
103 |    "version": "2.7.12"
104 |   }
105 |  },
106 |  "nbformat": 4,
107 |  "nbformat_minor": 0
108 | }
109 | 


--------------------------------------------------------------------------------
/.ipynb_checkpoints/Summ_Sumy_lsa-checkpoint.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": null,
 6 |    "metadata": {
 7 |     "collapsed": false
 8 |    },
 9 |    "outputs": [],
10 |    "source": [
11 |     "import os\n",
12 |     "from sumy.summarizers import luhn\n",
13 |     "from sumy.parsers.plaintext import PlaintextParser\n",
14 |     "from sumy.nlp.tokenizers import Tokenizer\n",
15 |     "from sumy.nlp.stemmers import Stemmer\n",
16 |     "from sumy.utils import get_stop_words\n",
17 |     "from sumy.summarizers.lsa import LsaSummarizer as Summarizer"
18 |    ]
19 |   },
20 |   {
21 |    "cell_type": "code",
22 |    "execution_count": null,
23 |    "metadata": {
24 |     "collapsed": true
25 |    },
26 |    "outputs": [],
27 |    "source": [
28 |     "LANGUAGE = \"english\"\n",
29 |     "SENTENCES_COUNT = 2"
30 |    ]
31 |   },
32 |   {
33 |    "cell_type": "code",
34 |    "execution_count": null,
35 |    "metadata": {
36 |     "collapsed": false
37 |    },
38 |    "outputs": [],
39 |    "source": [
40 |     "for file in os.listdir('topics'):\n",
41 |     "    with open('topics/' + file, 'r') as f:\n",
42 |     "        parser = PlaintextParser.from_string(unicode(f.read().replace('\\n',' '), errors='ignore'), Tokenizer(LANGUAGE))\n",
43 |     "        stemmer = Stemmer(LANGUAGE)\n",
44 |     "        summarizer = Summarizer(stemmer)\n",
45 |     "        summarizer.stop_words = get_stop_words(LANGUAGE)\n",
46 |     "        with open('summ_sumy_lsa/' + file.split('.')[0] + '.txt', 'w') as fw:\n",
47 |     "            for sentence in summarizer(parser.document, SENTENCES_COUNT):\n",
48 |     "                #print sentence\n",
49 |     "                fw.writelines(str(sentence))"
50 |    ]
51 |   },
52 |   {
53 |    "cell_type": "code",
54 |    "execution_count": null,
55 |    "metadata": {
56 |     "collapsed": true
57 |    },
58 |    "outputs": [],
59 |    "source": []
60 |   }
61 |  ],
62 |  "metadata": {
63 |   "anaconda-cloud": {},
64 |   "kernelspec": {
65 |    "display_name": "Python [default]",
66 |    "language": "python",
67 |    "name": "python2"
68 |   },
69 |   "language_info": {
70 |    "codemirror_mode": {
71 |     "name": "ipython",
72 |     "version": 2
73 |    },
74 |    "file_extension": ".py",
75 |    "mimetype": "text/x-python",
76 |    "name": "python",
77 |    "nbconvert_exporter": "python",
78 |    "pygments_lexer": "ipython2",
79 |    "version": "2.7.12"
80 |   }
81 |  },
82 |  "nbformat": 4,
83 |  "nbformat_minor": 0
84 | }
85 | 


--------------------------------------------------------------------------------
/.ipynb_checkpoints/Summ_Sumy_luhn-checkpoint.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 4,
 6 |    "metadata": {
 7 |     "collapsed": false
 8 |    },
 9 |    "outputs": [],
10 |    "source": [
11 |     "import os\n",
12 |     "from sumy.summarizers.luhn import LuhnSummarizer \n",
13 |     "from sumy.parsers.plaintext import PlaintextParser\n",
14 |     "from sumy.nlp.tokenizers import Tokenizer\n",
15 |     "from sumy.nlp.stemmers import Stemmer\n",
16 |     "from sumy.utils import get_stop_words"
17 |    ]
18 |   },
19 |   {
20 |    "cell_type": "code",
21 |    "execution_count": 5,
22 |    "metadata": {
23 |     "collapsed": true
24 |    },
25 |    "outputs": [],
26 |    "source": [
27 |     "LANGUAGE = \"english\"\n",
28 |     "SENTENCES_COUNT = 1"
29 |    ]
30 |   },
31 |   {
32 |    "cell_type": "code",
33 |    "execution_count": 6,
34 |    "metadata": {
35 |     "collapsed": false
36 |    },
37 |    "outputs": [],
38 |    "source": [
39 |     "for file in os.listdir('topics'):\n",
40 |     "    with open('topics/' + file, 'r') as f:\n",
41 |     "        parser = PlaintextParser.from_string(unicode(f.read().replace('\\n',' '), errors='ignore'), Tokenizer(LANGUAGE))\n",
42 |     "        stemmer = Stemmer(LANGUAGE)\n",
43 |     "        summarizer = LuhnSummarizer(stemmer)\n",
44 |     "        summarizer.stop_words = get_stop_words(LANGUAGE)\n",
45 |     "        with open('summ_sumy_luhn/' + file.split('.')[0] + '.txt', 'w') as fw:\n",
46 |     "            for sentence in summarizer(parser.document, SENTENCES_COUNT):\n",
47 |     "                # Sentences are written back to back, with no separator between them.\n",
48 |     "                fw.writelines(str(sentence))"
49 |    ]
50 |   },
51 |   {
52 |    "cell_type": "markdown",
53 |    "metadata": {},
54 |    "source": [
55 |     "While summarizing a document, the summarizer's __call__() method takes two arguments: the document and SENTENCES_COUNT. The second argument doesn't seem to take effect: even with SENTENCES_COUNT = 1, the generated summaries contain more than one sentence."
56 |    ]
57 |   },
58 |   {
59 |    "cell_type": "code",
60 |    "execution_count": null,
61 |    "metadata": {
62 |     "collapsed": true
63 |    },
64 |    "outputs": [],
65 |    "source": []
66 |   }
67 |  ],
68 |  "metadata": {
69 |   "anaconda-cloud": {},
70 |   "kernelspec": {
71 |    "display_name": "Python [default]",
72 |    "language": "python",
73 |    "name": "python2"
74 |   },
75 |   "language_info": {
76 |    "codemirror_mode": {
77 |     "name": "ipython",
78 |     "version": 2
79 |    },
80 |    "file_extension": ".py",
81 |    "mimetype": "text/x-python",
82 |    "name": "python",
83 |    "nbconvert_exporter": "python",
84 |    "pygments_lexer": "ipython2",
85 |    "version": "2.7.12"
86 |   }
87 |  },
88 |  "nbformat": 4,
89 |  "nbformat_minor": 0
90 | }
91 | 


--------------------------------------------------------------------------------
/.ipynb_checkpoints/bleu_model-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import nltk\n",
 12 |     "import os\n",
 13 |     "import numpy as np"
 14 |    ]
 15 |   },
 16 |   {
 17 |    "cell_type": "code",
 18 |    "execution_count": null,
 19 |    "metadata": {
 20 |     "collapsed": true
 21 |    },
 22 |    "outputs": [],
 23 |    "source": [
 24 |     "path2model = 'path to model summary'\n",
 25 |     "path2gold = 'path to reference summary'"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": null,
 31 |    "metadata": {
 32 |     "collapsed": false
 33 |    },
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "files = os.listdir(path2model)"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": null,
 42 |    "metadata": {
 43 |     "collapsed": false
 44 |    },
 45 |    "outputs": [],
 46 |    "source": [
 47 |     "bleu_scores = []\n",
 48 |     "for i in files:\n",
 49 |     "    # References for model file NAME.txt are all files inside path2gold/NAME.\n",
 50 |     "    reference = []\n",
 51 |     "    gold_summ = os.listdir(path2gold+'/' + i.split('.txt')[0])\n",
 52 |     "    with open(path2model + '/' + i,'r') as fmodel:\n",
 53 |     "        textmodel = fmodel.read()\n",
 54 |     "    for j in gold_summ:\n",
 55 |     "        with open(path2gold + '/' + i.split('.txt')[0] + '/' + j,'r') as fgold:\n",
 56 |     "            textgold = fgold.read()\n",
 57 |     "        reference.append(textgold)\n",
 58 |     "\n",
 59 |     "    # NOTE(review): sentence_bleu is given raw strings, so n-grams are presumably\n",
 60 |     "    # built over characters rather than words -- confirm this is intended.\n",
 61 |     "    bleu_scores.append(nltk.translate.bleu_score.sentence_bleu(reference, textmodel, weights=[0.4,0.3,0.2]))"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": null,
 67 |    "metadata": {
 68 |     "collapsed": false
 69 |    },
 70 |    "outputs": [],
 71 |    "source": [
 72 |     "np.mean(bleu_scores)"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "code",
 77 |    "execution_count": null,
 78 |    "metadata": {
 79 |     "collapsed": false
 80 |    },
 81 |    "outputs": [],
 82 |    "source": [
 83 |     "np.std(bleu_scores)"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": null,
 89 |    "metadata": {
 90 |     "collapsed": true
 91 |    },
 92 |    "outputs": [],
 93 |    "source": []
 94 |   }
 95 |  ],
 96 |  "metadata": {
 97 |   "anaconda-cloud": {},
 98 |   "kernelspec": {
 99 |    "display_name": "Python [conda root]",
100 |    "language": "python",
101 |    "name": "conda-root-py"
102 |   },
103 |   "language_info": {
104 |    "codemirror_mode": {
105 |     "name": "ipython",
106 |     "version": 2
107 |    },
108 |    "file_extension": ".py",
109 |    "mimetype": "text/x-python",
110 |    "name": "python",
111 |    "nbconvert_exporter": "python",
112 |    "pygments_lexer": "ipython2",
113 |    "version": "2.7.12"
114 |   }
115 |  },
116 |  "nbformat": 4,
117 |  "nbformat_minor": 1
118 | }
119 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Scripts-for-extractive-summarization
 2 | Scripts for an upcoming blog "Extractive vs. Abstractive Summarization" for RaRe Technologies. 
 3 | 
 4 | Requirements:
 5 |   1. Gensim
 6 |   2. Pyteaser
 7 |   3. Sumy
 8 |   4. NLTK
 9 |   5. pythonrouge by tagucci
10 | 
11 | The following extractive summarization algorithms have been used:
12 |   1. Gensim's TextRank
13 |   2. Sumy's implementation of Luhn's algorithm
14 |   3. Sumy's LexRank implementation
15 |   4. Sumy's LSA implementation
16 |   5. Pyteaser: python implementation of TextTeaser.
17 |   
18 | Link to the blog: https://t.co/YidmkzqRVW
19 | 


--------------------------------------------------------------------------------
/Summ_Gensim.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 4,
 6 |    "metadata": {
 7 |     "collapsed": false
 8 |    },
 9 |    "outputs": [],
10 |    "source": [
11 |     "import os\n",
12 |     "from gensim.summarization import summarize"
13 |    ]
14 |   },
15 |   {
16 |    "cell_type": "code",
17 |    "execution_count": 5,
18 |    "metadata": {
19 |     "collapsed": false
20 |    },
21 |    "outputs": [],
22 |    "source": [
23 |     "def summ_it():\n",
24 |     "    total = 0\n",
25 |     "    for file in os.listdir('topics'):\n",
26 |     "        with open('topics/' + file, 'r') as f:\n",
27 |     "            summ = summarize(unicode(f.read().replace('\\n',' '), errors='ignore'), word_count=75)\n",
28 |     "            with open('summ_gensim/' + file.split('.')[0] + '.txt', 'w') as fw:\n",
29 |     "                fw.writelines(summ)\n",
30 |     "                total += len(summ)"
31 |    ]
32 |   },
33 |   {
34 |    "cell_type": "code",
35 |    "execution_count": 6,
36 |    "metadata": {
37 |     "collapsed": false
38 |    },
39 |    "outputs": [
40 |     {
41 |      "name": "stdout",
42 |      "output_type": "stream",
43 |      "text": [
44 |       "1 loop, best of 3: 57.7 s per loop\n"
45 |      ]
46 |     }
47 |    ],
48 |    "source": [
49 |     "summ_it()"
50 |    ]
51 |   },
52 |   {
53 |    "cell_type": "code",
54 |    "execution_count": null,
55 |    "metadata": {
56 |     "collapsed": true
57 |    },
58 |    "outputs": [],
59 |    "source": []
60 |   }
61 |  ],
62 |  "metadata": {
63 |   "anaconda-cloud": {},
64 |   "kernelspec": {
65 |    "display_name": "Python [default]",
66 |    "language": "python",
67 |    "name": "python2"
68 |   },
69 |   "language_info": {
70 |    "codemirror_mode": {
71 |     "name": "ipython",
72 |     "version": 2
73 |    },
74 |    "file_extension": ".py",
75 |    "mimetype": "text/x-python",
76 |    "name": "python",
77 |    "nbconvert_exporter": "python",
78 |    "pygments_lexer": "ipython2",
79 |    "version": "2.7.12"
80 |   }
81 |  },
82 |  "nbformat": 4,
83 |  "nbformat_minor": 0
84 | }
85 | 


--------------------------------------------------------------------------------
/Summ_Pytease.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": null,
 6 |    "metadata": {
 7 |     "collapsed": false
 8 |    },
 9 |    "outputs": [],
10 |    "source": [
11 |     "import os\n",
12 |     "from pyteaser import Summarize"
13 |    ]
14 |   },
15 |   {
16 |    "cell_type": "code",
17 |    "execution_count": null,
18 |    "metadata": {
19 |     "collapsed": false
20 |    },
21 |    "outputs": [],
22 |    "source": [
23 |     "for file in os.listdir('topics'):\n",
24 |     "    with open('topics/' + file, 'r') as f:\n",
25 |     "        #print file\n",
26 |     "        summ = Summarize('',unicode(f.read().replace('\\n',' '), errors='ignore'))\n",
27 |     "        with open('summ_pyteaser/' + file.split('.')[0] + '.txt', 'w') as fw:\n",
28 |     "            fw.writelines(summ)"
29 |    ]
30 |   },
31 |   {
32 |    "cell_type": "code",
33 |    "execution_count": null,
34 |    "metadata": {
35 |     "collapsed": true
36 |    },
37 |    "outputs": [],
38 |    "source": []
39 |   }
40 |  ],
41 |  "metadata": {
42 |   "anaconda-cloud": {},
43 |   "kernelspec": {
44 |    "display_name": "Python [default]",
45 |    "language": "python",
46 |    "name": "python2"
47 |   },
48 |   "language_info": {
49 |    "codemirror_mode": {
50 |     "name": "ipython",
51 |     "version": 2
52 |    },
53 |    "file_extension": ".py",
54 |    "mimetype": "text/x-python",
55 |    "name": "python",
56 |    "nbconvert_exporter": "python",
57 |    "pygments_lexer": "ipython2",
58 |    "version": "2.7.12"
59 |   }
60 |  },
61 |  "nbformat": 4,
62 |  "nbformat_minor": 0
63 | }
64 | 


--------------------------------------------------------------------------------
/Summ_Sumy_lex.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 5,
  6 |    "metadata": {
  7 |     "collapsed": false
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import os\n",
 12 |     "from sumy.summarizers.lex_rank import LexRankSummarizer\n",
 13 |     "from sumy.parsers.plaintext import PlaintextParser\n",
 14 |     "from sumy.nlp.tokenizers import Tokenizer\n",
 15 |     "from sumy.nlp.stemmers import Stemmer\n",
 16 |     "from sumy.utils import get_stop_words"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "code",
 21 |    "execution_count": 6,
 22 |    "metadata": {
 23 |     "collapsed": true
 24 |    },
 25 |    "outputs": [],
 26 |    "source": [
 27 |     "LANGUAGE = \"english\"\n",
 28 |     "SENTENCES_COUNT = 5"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "code",
 33 |    "execution_count": 7,
 34 |    "metadata": {
 35 |     "collapsed": true
 36 |    },
 37 |    "outputs": [],
 38 |    "source": [
 39 |     "def summ_it():\n",
 40 |     "    length = 0\n",
 41 |     "    total = 0\n",
 42 |     "    for file in os.listdir('topics'):\n",
 43 |     "        with open('topics/' + file, 'r') as f:\n",
 44 |     "            parser = PlaintextParser.from_string(unicode(f.read().replace('\\n',' '), errors='ignore'), Tokenizer(LANGUAGE))\n",
 45 |     "            stemmer = Stemmer(LANGUAGE)\n",
 46 |     "            summarizer = LexRankSummarizer(stemmer)\n",
 47 |     "            summarizer.stop_words = get_stop_words(LANGUAGE)\n",
 48 |     "            with open('summ_sumy_lex/' + file.split('.')[0] + '.txt', 'w') as fw:\n",
 49 |     "                for sentence in summarizer(parser.document, SENTENCES_COUNT):\n",
 50 |     "                    #print sentence\n",
 51 |     "                    fw.writelines(str(sentence))\n",
 52 |     "                    length += len(str(sentence))\n",
 53 |     "                total += length\n",
 54 |     "                length = 0"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "code",
 59 |    "execution_count": 8,
 60 |    "metadata": {
 61 |     "collapsed": false
 62 |    },
 63 |    "outputs": [
 64 |     {
 65 |      "name": "stdout",
 66 |      "output_type": "stream",
 67 |      "text": [
 68 |       "1 loop, best of 3: 41.2 s per loop\n"
 69 |      ]
 70 |     }
 71 |    ],
 72 |    "source": [
 73 |     "summ_it()"
 74 |    ]
 75 |   },
 76 |   {
 77 |    "cell_type": "code",
 78 |    "execution_count": null,
 79 |    "metadata": {
 80 |     "collapsed": true
 81 |    },
 82 |    "outputs": [],
 83 |    "source": []
 84 |   }
 85 |  ],
 86 |  "metadata": {
 87 |   "anaconda-cloud": {},
 88 |   "kernelspec": {
 89 |    "display_name": "Python [default]",
 90 |    "language": "python",
 91 |    "name": "python2"
 92 |   },
 93 |   "language_info": {
 94 |    "codemirror_mode": {
 95 |     "name": "ipython",
 96 |     "version": 2
 97 |    },
 98 |    "file_extension": ".py",
 99 |    "mimetype": "text/x-python",
100 |    "name": "python",
101 |    "nbconvert_exporter": "python",
102 |    "pygments_lexer": "ipython2",
103 |    "version": "2.7.12"
104 |   }
105 |  },
106 |  "nbformat": 4,
107 |  "nbformat_minor": 0
108 | }
109 | 


--------------------------------------------------------------------------------
/Summ_Sumy_lsa.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": null,
 6 |    "metadata": {
 7 |     "collapsed": false
 8 |    },
 9 |    "outputs": [],
10 |    "source": [
11 |     "import os\n",
12 |     "from sumy.summarizers import luhn\n",
13 |     "from sumy.parsers.plaintext import PlaintextParser\n",
14 |     "from sumy.nlp.tokenizers import Tokenizer\n",
15 |     "from sumy.nlp.stemmers import Stemmer\n",
16 |     "from sumy.utils import get_stop_words\n",
17 |     "from sumy.summarizers.lsa import LsaSummarizer as Summarizer"
18 |    ]
19 |   },
20 |   {
21 |    "cell_type": "code",
22 |    "execution_count": null,
23 |    "metadata": {
24 |     "collapsed": true
25 |    },
26 |    "outputs": [],
27 |    "source": [
28 |     "LANGUAGE = \"english\"\n",
29 |     "SENTENCES_COUNT = 2"
30 |    ]
31 |   },
32 |   {
33 |    "cell_type": "code",
34 |    "execution_count": null,
35 |    "metadata": {
36 |     "collapsed": false
37 |    },
38 |    "outputs": [],
39 |    "source": [
40 |     "for file in os.listdir('topics'):\n",
41 |     "    with open('topics/' + file, 'r') as f:\n",
42 |     "        parser = PlaintextParser.from_string(unicode(f.read().replace('\\n',' '), errors='ignore'), Tokenizer(LANGUAGE))\n",
43 |     "        stemmer = Stemmer(LANGUAGE)\n",
44 |     "        summarizer = Summarizer(stemmer)\n",
45 |     "        summarizer.stop_words = get_stop_words(LANGUAGE)\n",
46 |     "        with open('summ_sumy_lsa/' + file.split('.')[0] + '.txt', 'w') as fw:\n",
47 |     "            for sentence in summarizer(parser.document, SENTENCES_COUNT):\n",
48 |     "                #print sentence\n",
49 |     "                fw.writelines(str(sentence))"
50 |    ]
51 |   },
52 |   {
53 |    "cell_type": "code",
54 |    "execution_count": null,
55 |    "metadata": {
56 |     "collapsed": true
57 |    },
58 |    "outputs": [],
59 |    "source": []
60 |   }
61 |  ],
62 |  "metadata": {
63 |   "anaconda-cloud": {},
64 |   "kernelspec": {
65 |    "display_name": "Python [default]",
66 |    "language": "python",
67 |    "name": "python2"
68 |   },
69 |   "language_info": {
70 |    "codemirror_mode": {
71 |     "name": "ipython",
72 |     "version": 2
73 |    },
74 |    "file_extension": ".py",
75 |    "mimetype": "text/x-python",
76 |    "name": "python",
77 |    "nbconvert_exporter": "python",
78 |    "pygments_lexer": "ipython2",
79 |    "version": "2.7.12"
80 |   }
81 |  },
82 |  "nbformat": 4,
83 |  "nbformat_minor": 0
84 | }
85 | 


--------------------------------------------------------------------------------
/Summ_Sumy_luhn.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 4,
 6 |    "metadata": {
 7 |     "collapsed": false
 8 |    },
 9 |    "outputs": [],
10 |    "source": [
11 |     "import os\n",
12 |     "from sumy.summarizers.luhn import LuhnSummarizer \n",
13 |     "from sumy.parsers.plaintext import PlaintextParser\n",
14 |     "from sumy.nlp.tokenizers import Tokenizer\n",
15 |     "from sumy.nlp.stemmers import Stemmer\n",
16 |     "from sumy.utils import get_stop_words"
17 |    ]
18 |   },
19 |   {
20 |    "cell_type": "code",
21 |    "execution_count": 5,
22 |    "metadata": {
23 |     "collapsed": true
24 |    },
25 |    "outputs": [],
26 |    "source": [
27 |     "LANGUAGE = \"english\"\n",
28 |     "SENTENCES_COUNT = 1"
29 |    ]
30 |   },
31 |   {
32 |    "cell_type": "code",
33 |    "execution_count": 6,
34 |    "metadata": {
35 |     "collapsed": false
36 |    },
37 |    "outputs": [],
38 |    "source": [
39 |     "for file in os.listdir('topics'):\n",
40 |     "    with open('topics/' + file, 'r') as f:\n",
41 |     "        parser = PlaintextParser.from_string(unicode(f.read().replace('\\n',' '), errors='ignore'), Tokenizer(LANGUAGE))\n",
42 |     "        stemmer = Stemmer(LANGUAGE)\n",
43 |     "        summarizer = LuhnSummarizer(stemmer)\n",
44 |     "        summarizer.stop_words = get_stop_words(LANGUAGE)\n",
45 |     "        with open('summ_sumy_luhn/' + file.split('.')[0] + '.txt', 'w') as fw:\n",
46 |     "            for sentence in summarizer(parser.document, SENTENCES_COUNT):\n",
47 |     "                # Sentences are written back to back, with no separator between them.\n",
48 |     "                fw.writelines(str(sentence))"
49 |    ]
50 |   },
51 |   {
52 |    "cell_type": "markdown",
53 |    "metadata": {},
54 |    "source": [
55 |     "While summarizing a document, the summarizer's __call__() method takes two arguments: the document and SENTENCES_COUNT. The second argument doesn't seem to take effect: even with SENTENCES_COUNT = 1, the generated summaries contain more than one sentence."
56 |    ]
57 |   },
58 |   {
59 |    "cell_type": "code",
60 |    "execution_count": null,
61 |    "metadata": {
62 |     "collapsed": true
63 |    },
64 |    "outputs": [],
65 |    "source": []
66 |   }
67 |  ],
68 |  "metadata": {
69 |   "anaconda-cloud": {},
70 |   "kernelspec": {
71 |    "display_name": "Python [default]",
72 |    "language": "python",
73 |    "name": "python2"
74 |   },
75 |   "language_info": {
76 |    "codemirror_mode": {
77 |     "name": "ipython",
78 |     "version": 2
79 |    },
80 |    "file_extension": ".py",
81 |    "mimetype": "text/x-python",
82 |    "name": "python",
83 |    "nbconvert_exporter": "python",
84 |    "pygments_lexer": "ipython2",
85 |    "version": "2.7.12"
86 |   }
87 |  },
88 |  "nbformat": 4,
89 |  "nbformat_minor": 0
90 | }
91 | 


--------------------------------------------------------------------------------
/bleu_model.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import nltk\n",
 12 |     "import os\n",
 13 |     "import numpy as np"
 14 |    ]
 15 |   },
 16 |   {
 17 |    "cell_type": "code",
 18 |    "execution_count": null,
 19 |    "metadata": {
 20 |     "collapsed": true
 21 |    },
 22 |    "outputs": [],
 23 |    "source": [
 24 |     "path2model = 'path to model summary'\n",
 25 |     "path2gold = 'path to reference summary'"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": null,
 31 |    "metadata": {
 32 |     "collapsed": false
 33 |    },
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "files = os.listdir(path2model)"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": null,
 42 |    "metadata": {
 43 |     "collapsed": false
 44 |    },
 45 |    "outputs": [],
 46 |    "source": [
 47 |     "bleu_scores = []\n",
 48 |     "for i in files:\n",
 49 |     "    # References for model file NAME.txt are all files inside path2gold/NAME.\n",
 50 |     "    reference = []\n",
 51 |     "    gold_summ = os.listdir(path2gold+'/' + i.split('.txt')[0])\n",
 52 |     "    with open(path2model + '/' + i,'r') as fmodel:\n",
 53 |     "        textmodel = fmodel.read()\n",
 54 |     "    for j in gold_summ:\n",
 55 |     "        with open(path2gold + '/' + i.split('.txt')[0] + '/' + j,'r') as fgold:\n",
 56 |     "            textgold = fgold.read()\n",
 57 |     "        reference.append(textgold)\n",
 58 |     "\n",
 59 |     "    # NOTE(review): sentence_bleu is given raw strings, so n-grams are presumably\n",
 60 |     "    # built over characters rather than words -- confirm this is intended.\n",
 61 |     "    bleu_scores.append(nltk.translate.bleu_score.sentence_bleu(reference, textmodel, weights=[0.4,0.3,0.2]))"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": null,
 67 |    "metadata": {
 68 |     "collapsed": false
 69 |    },
 70 |    "outputs": [],
 71 |    "source": [
 72 |     "np.mean(bleu_scores)"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "code",
 77 |    "execution_count": null,
 78 |    "metadata": {
 79 |     "collapsed": false
 80 |    },
 81 |    "outputs": [],
 82 |    "source": [
 83 |     "np.std(bleu_scores)"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": null,
 89 |    "metadata": {
 90 |     "collapsed": true
 91 |    },
 92 |    "outputs": [],
 93 |    "source": []
 94 |   }
 95 |  ],
 96 |  "metadata": {
 97 |   "anaconda-cloud": {},
 98 |   "kernelspec": {
 99 |    "display_name": "Python [conda root]",
100 |    "language": "python",
101 |    "name": "conda-root-py"
102 |   },
103 |   "language_info": {
104 |    "codemirror_mode": {
105 |     "name": "ipython",
106 |     "version": 2
107 |    },
108 |    "file_extension": ".py",
109 |    "mimetype": "text/x-python",
110 |    "name": "python",
111 |    "nbconvert_exporter": "python",
112 |    "pygments_lexer": "ipython2",
113 |    "version": "2.7.12"
114 |   }
115 |  },
116 |  "nbformat": 4,
117 |  "nbformat_minor": 1
118 | }
119 | 


--------------------------------------------------------------------------------
/pyrougetextrank.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import numpy as np
 3 | from pythonrouge import pythonrouge
 4 | 
 5 | path2model = 'path to model summary'
 6 | path2gold = 'path to reference summaries'
 7 | files = os.listdir(path2model)
 8 | final_mean = []
 9 | for i in files:
10 |     gold_summ = os.listdir(path2gold+'/' + i.split('.txt')[0])
11 |     with open(path2model + '/' + i,'r') as fmodel:
12 |         textmodel = fmodel.read()
13 |         fmodel.close()
14 |     score = []
15 |     for j in gold_summ:
16 |         with open(path2gold + '/' + i.split('.txt')[0] + '/' + j,'r') as fgold:
17 |             textgold = fgold.read()
18 |             fmodel.close()
19 |         score.append(pythonrouge.pythonrouge(textmodel, textgold)['ROUGE-1'])
20 |     final_mean.append(np.max(score))
21 |     print np.max(score)
22 | 
23 | print np.mean(final_mean)
24 | print np.std(final_mean)
25 | 


--------------------------------------------------------------------------------