├── Amharic
    ├── 1_scrap
    │   ├── Amhari Selinum Scrapper.ipynb
    │   ├── NewsPlease.ipynb
    │   ├── addisadmassnews simple.ipynb
    │   ├── ecadforum simple.ipynb
    │   ├── ethiopianregistrar simple.ipynb
    │   ├── ethsat simple.ipynb
    │   ├── goolgule simple.ipynb
    │   └── zehabesha archive.ipynb
    ├── 2_Clean Amhari.ipynb
    ├── 3_Amhari Build Word2Vec.ipynb
    ├── 4_Amhari python3 Model Scheduled Sampling.ipynb
    └── Readme.md
├── Arabic
    ├── 1_Basic Cleaning
    │   ├── 1_Pointer Generator
    │   │   ├── Model 5 RL Arabic Policy Gradient.ipynb
    │   │   └── zaksum_result_Arabic_pretrain_pointergenerator_9_7_2019_11_53Pm.xml
    │   ├── 2_Scheduled Sampling
    │   │   ├── Model 5 CL Arabic Scheduled Sampling.ipynb
    │   │   └── zaksum_result_Arabic_CL_Scheduled Sampling_9_7_2019_11_44Pm.xml
    │   └── 3_RL Policy Gradient
    │   │   ├── Model 5 RL Arabic Policy Gradient.ipynb
    │   │   └── zaksum_result_Arabic_RL_policygradient_10_7_2019_1_37Am.xml
    ├── 2_Advanced Cleaning
    │   ├── 1_Pointer Generator
    │   │   ├── Model 5E RL Arabic Policy Gradient.ipynb
    │   │   └── zaksum_result_Arabic_pretrain_pointergenerator_19_7_2019_3_08Pm.xml
    │   ├── 2_Scheduled Sampling
    │   │   ├── Model 5E CL Arabic Scheduled Sampling.ipynb
    │   │   └── zaksum_result_Extreme_Arabic_CL_Scheduled Sampling_14_7_2019_8_58Pm.xml
    │   └── 3_RL Policy Gradient
    │   │   ├── Model 5E RL Arabic Policy Gradient.ipynb
    │   │   └── zaksum_result_Arabic_RL_policygradient_19_7_2019_5_47Pm.xml
    ├── Build Word2Vec.ipynb
    ├── README.md
    └── zaksum RL.ipynb
├── Hindi
    ├── 1_NewsCrawler (googel colab).ipynb
    ├── 2_process (local).py
    ├── 3_Build_Word2Vec_VocabDict (google colab).ipynb
    └── 4_Model_5_CL_CSV_py3_Scheduled_Sampling (google colab).ipynb
├── Implementation A (seq2seq with attention and feature rich representation)
    ├── .ipynb_checkpoints
    │   ├── Model_1-checkpoint.ipynb
    │   └── Model_3-checkpoint.ipynb
    ├── Model 2
    │   ├── .ipynb_checkpoints
    │   │   ├── Model_2-checkpoint.ipynb
    │   │   └── Model_2_features(tf_idf_,_pos_tags)-checkpoint.ipynb
    │   ├── Model_2.ipynb
    │   ├── Model_2_features(tf_idf_,_pos_tags).ipynb
    │   └── Results
    │   │   ├── (model 2 with tf-idf and pos) result_featurerich_15_11_2018_5_28pm.xml
    │   │   └── (model 2)result_valid_29_10_2018_5_28pm.xml
    ├── Model_1.ipynb
    ├── Model_3.ipynb
    └── README.md
├── Implementation B (Pointer Generator seq2seq network)
    ├── Model_4_generator_.ipynb
    ├── Model_4_generator_python3.ipynb
    ├── PreProcessData
    │   ├── README.md
    │   └── process_English.py
    ├── README.md
    ├── Results
    │   └── Pointer Generator
    │   │   ├── generator output.xml
    │   │   └── result generator 5_1_2019.xml
    └── zaksum_eval.ipynb
├── Implementation C (Reinforcement Learning with seq2seq)
    ├── Policy Gradient
    │   ├── Model_5_RL_Policy_Gradient.ipynb
    │   ├── README.md
    │   ├── Results
    │   │   ├── result_policy.xml
    │   │   └── result_pretrain.xml
    │   └── zaksum_eval.ipynb
    ├── README.md
    └── Scheduled Sampling with intradecoder
    │   ├── Model 5 RL.ipynb
    │   ├── README.md
    │   ├── Results
    │       └── result_Reinforcement learning.xml
    │   └── zaksum eval.ipynb
└── README.md


/Amharic/2_Clean Amhari.ipynb:
--------------------------------------------------------------------------------
1 | {"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"Clean Amhari.ipynb","provenance":[],"collapsed_sections":[],"authorship_tag":"ABX9TyMK5OHtWYiY41hN4dXtiQME"},"kernelspec":{"name":"python3","display_name":"Python 3"}},"cells":[{"cell_type":"code","metadata":{"id":"Jq49SquyhqQz","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":122},"outputId":"2ef46d81-a2dc-41e3-b084-c9b038d5ac6b","executionInfo":{"status":"ok","timestamp":1580472934481,"user_tz":-120,"elapsed":30307,"user":{"displayName":"amr zaki","photoUrl":"","userId":"09456039094530776333"}}},"source":["from google.colab import drive\n","drive.mount('/content/drive')"],"execution_count":1,"outputs":[{"output_type":"stream","text":["Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly\n","\n","Enter your authorization code:\n","··········\n","Mounted at /content/drive\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"UVTPWcC72o9F","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":204},"outputId":"e99ef0a7-6731-4f1d-d4a3-4ea21d066594","executionInfo":{"status":"ok","timestamp":1580478414096,"user_tz":-120,"elapsed":7764,"user":{"displayName":"amr zaki","photoUrl":"","userId":"09456039094530776333"}}},"source":["!pip install langdetect"],"execution_count":14,"outputs":[{"output_type":"stream","text":["Collecting langdetect\n","\u001b[?25l  Downloading https://files.pythonhosted.org/packages/59/59/4bc44158a767a6d66de18c4136c8aa90491d56cc951c10b74dd1e13213c9/langdetect-1.0.7.zip (998kB)\n","\r\u001b[K     |▎                               | 10kB 20.2MB/s eta 0:00:01\r\u001b[K     |▋                               | 20kB 3.1MB/s eta 0:00:01\r\u001b[K     |█                               | 30kB 4.2MB/s eta 0:00:01\r\u001b[K     |█▎                              | 40kB 2.9MB/s eta 0:00:01\r\u001b[K     |█▋                              | 51kB 3.2MB/s eta 0:00:01\r\u001b[K     |██                              | 61kB 3.8MB/s eta 0:00:01\r\u001b[K     |██▎                             | 71kB 4.2MB/s eta 0:00:01\r\u001b[K     |██▋                             | 81kB 4.5MB/s eta 0:00:01\r\u001b[K     |███                             | 92kB 5.0MB/s eta 0:00:01\r\u001b[K     |███▎                            | 102kB 4.7MB/s eta 0:00:01\r\u001b[K     |███▋                            | 112kB 4.7MB/s eta 0:00:01\r\u001b[K     |████                            | 122kB 4.7MB/s eta 0:00:01\r\u001b[K     |████▎                           | 133kB 4.7MB/s eta 0:00:01\r\u001b[K     |████▋                           | 143kB 4.7MB/s eta 0:00:01\r\u001b[K     |█████                           | 153kB 4.7MB/s eta 0:00:01\r\u001b[K     |█████▎                          | 163kB 4.7MB/s eta 0:00:01\r\u001b[K     |█████▋                          | 174kB 4.7MB/s eta 0:00:01\r\u001b[K     |██████                          | 184kB 4.7MB/s eta 0:00:01\r\u001b[K     |██████▎                         | 194kB 4.7MB/s eta 0:00:01\r\u001b[K     |██████▋                         | 204kB 4.7MB/s eta 0:00:01\r\u001b[K     |███████                         | 215kB 4.7MB/s eta 0:00:01\r\u001b[K     |███████▎                        | 225kB 4.7MB/s eta 0:00:01\r\u001b[K     |███████▌                        | 235kB 4.7MB/s eta 0:00:01\r\u001b[K     |███████▉                        | 245kB 4.7MB/s eta 0:00:01\r\u001b[K     |████████▏                       | 256kB 4.7MB/s eta 0:00:01\r\u001b[K     |████████▌                       | 266kB 4.7MB/s eta 0:00:01\r\u001b[K     |████████▉                       | 276kB 4.7MB/s eta 0:00:01\r\u001b[K     |█████████▏                      | 286kB 4.7MB/s eta 0:00:01\r\u001b[K     |█████████▌                      | 296kB 4.7MB/s eta 0:00:01\r\u001b[K     |█████████▉                      | 307kB 4.7MB/s eta 0:00:01\r\u001b[K     |██████████▏                     | 317kB 4.7MB/s eta 0:00:01\r\u001b[K     |██████████▌                     | 327kB 4.7MB/s eta 0:00:01\r\u001b[K     |██████████▉                     | 337kB 4.7MB/s eta 0:00:01\r\u001b[K     |███████████▏                    | 348kB 4.7MB/s eta 0:00:01\r\u001b[K     |███████████▌                    | 358kB 4.7MB/s eta 0:00:01\r\u001b[K     |███████████▉                    | 368kB 4.7MB/s eta 0:00:01\r\u001b[K     |████████████▏                   | 378kB 4.7MB/s eta 0:00:01\r\u001b[K     |████████████▌                   | 389kB 4.7MB/s eta 0:00:01\r\u001b[K     |████████████▉                   | 399kB 4.7MB/s eta 0:00:01\r\u001b[K     |█████████████▏                  | 409kB 4.7MB/s eta 0:00:01\r\u001b[K     |█████████████▌                  | 419kB 4.7MB/s eta 0:00:01\r\u001b[K     |█████████████▉                  | 430kB 4.7MB/s eta 0:00:01\r\u001b[K     |██████████████▏                 | 440kB 4.7MB/s eta 0:00:01\r\u001b[K     |██████████████▌                 | 450kB 4.7MB/s eta 0:00:01\r\u001b[K     |██████████████▊                 | 460kB 4.7MB/s eta 0:00:01\r\u001b[K     |███████████████                 | 471kB 4.7MB/s eta 0:00:01\r\u001b[K     |███████████████▍                | 481kB 4.7MB/s eta 0:00:01\r\u001b[K     |███████████████▊                | 491kB 4.7MB/s eta 0:00:01\r\u001b[K     |████████████████                | 501kB 4.7MB/s eta 0:00:01\r\u001b[K     |████████████████▍               | 512kB 4.7MB/s eta 0:00:01\r\u001b[K     |████████████████▊               | 522kB 4.7MB/s eta 0:00:01\r\u001b[K     |█████████████████               | 532kB 4.7MB/s eta 0:00:01\r\u001b[K     |█████████████████▍              | 542kB 4.7MB/s eta 0:00:01\r\u001b[K     |█████████████████▊              | 552kB 4.7MB/s eta 0:00:01\r\u001b[K     |██████████████████              | 563kB 4.7MB/s eta 0:00:01\r\u001b[K     |██████████████████▍             | 573kB 4.7MB/s eta 0:00:01\r\u001b[K     |██████████████████▊             | 583kB 4.7MB/s eta 0:00:01\r\u001b[K     |███████████████████             | 593kB 4.7MB/s eta 0:00:01\r\u001b[K     |███████████████████▍            | 604kB 4.7MB/s eta 0:00:01\r\u001b[K     |███████████████████▊            | 614kB 4.7MB/s eta 0:00:01\r\u001b[K     |████████████████████            | 624kB 4.7MB/s eta 0:00:01\r\u001b[K     |████████████████████▍           | 634kB 4.7MB/s eta 0:00:01\r\u001b[K     |████████████████████▊           | 645kB 4.7MB/s eta 0:00:01\r\u001b[K     |█████████████████████           | 655kB 4.7MB/s eta 0:00:01\r\u001b[K     |█████████████████████▍          | 665kB 4.7MB/s eta 0:00:01\r\u001b[K     |█████████████████████▊          | 675kB 4.7MB/s eta 0:00:01\r\u001b[K     |██████████████████████          | 686kB 4.7MB/s eta 0:00:01\r\u001b[K     |██████████████████████▎         | 696kB 4.7MB/s eta 0:00:01\r\u001b[K     |██████████████████████▋         | 706kB 4.7MB/s eta 0:00:01\r\u001b[K     |███████████████████████         | 716kB 4.7MB/s eta 0:00:01\r\u001b[K     |███████████████████████▎        | 727kB 4.7MB/s eta 0:00:01\r\u001b[K     |███████████████████████▋        | 737kB 4.7MB/s eta 0:00:01\r\u001b[K     |████████████████████████        | 747kB 4.7MB/s eta 0:00:01\r\u001b[K     |████████████████████████▎       | 757kB 4.7MB/s eta 0:00:01\r\u001b[K     |████████████████████████▋       | 768kB 4.7MB/s eta 0:00:01\r\u001b[K     |█████████████████████████       | 778kB 4.7MB/s eta 0:00:01\r\u001b[K     |█████████████████████████▎      | 788kB 4.7MB/s eta 0:00:01\r\u001b[K     |█████████████████████████▋      | 798kB 4.7MB/s eta 0:00:01\r\u001b[K     |██████████████████████████      | 808kB 4.7MB/s eta 0:00:01\r\u001b[K     |██████████████████████████▎     | 819kB 4.7MB/s eta 0:00:01\r\u001b[K     |██████████████████████████▋     | 829kB 4.7MB/s eta 0:00:01\r\u001b[K     |███████████████████████████     | 839kB 4.7MB/s eta 0:00:01\r\u001b[K     |███████████████████████████▎    | 849kB 4.7MB/s eta 0:00:01\r\u001b[K     |███████████████████████████▋    | 860kB 4.7MB/s eta 0:00:01\r\u001b[K     |████████████████████████████    | 870kB 4.7MB/s eta 0:00:01\r\u001b[K     |████████████████████████████▎   | 880kB 4.7MB/s eta 0:00:01\r\u001b[K     |████████████████████████████▋   | 890kB 4.7MB/s eta 0:00:01\r\u001b[K     |█████████████████████████████   | 901kB 4.7MB/s eta 0:00:01\r\u001b[K     |█████████████████████████████▏  | 911kB 4.7MB/s eta 0:00:01\r\u001b[K     |█████████████████████████████▌  | 921kB 4.7MB/s eta 0:00:01\r\u001b[K     |█████████████████████████████▉  | 931kB 4.7MB/s eta 0:00:01\r\u001b[K     |██████████████████████████████▏ | 942kB 4.7MB/s eta 0:00:01\r\u001b[K     |██████████████████████████████▌ | 952kB 4.7MB/s eta 0:00:01\r\u001b[K     |██████████████████████████████▉ | 962kB 4.7MB/s eta 0:00:01\r\u001b[K     |███████████████████████████████▏| 972kB 4.7MB/s eta 0:00:01\r\u001b[K     |███████████████████████████████▌| 983kB 4.7MB/s eta 0:00:01\r\u001b[K     |███████████████████████████████▉| 993kB 4.7MB/s eta 0:00:01\r\u001b[K     |████████████████████████████████| 1.0MB 4.7MB/s \n","\u001b[?25hRequirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from langdetect) (1.12.0)\n","Building wheels for collected packages: langdetect\n","  Building wheel for langdetect (setup.py) ... \u001b[?25l\u001b[?25hdone\n","  Created wheel for langdetect: filename=langdetect-1.0.7-cp36-none-any.whl size=993460 sha256=1bea9507e6d0cee774c8179482e3e1b34e2f4ae50a5a50f7c376c737eb6e146c\n","  Stored in directory: /root/.cache/pip/wheels/ec/0c/a9/1647275e7ef5014e7b83ff30105180e332867d65e7617ddafe\n","Successfully built langdetect\n","Installing collected packages: langdetect\n","Successfully installed langdetect-1.0.7\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"dJuk4ZzT4C1W","colab_type":"code","colab":{}},"source":["from __future__ import print_function\n","\n","\n","import json\n","import os\n","import pandas\n","import io\n","import sys\n","import re\n","\n","\n","class ProgressBar(object):\n","    DEFAULT = 'Progress: %(bar)s %(percent)3d%%'\n","    FULL = '%(bar)s %(current)d/%(total)d (%(percent)3d%%) %(remaining)d to go'\n","\n","    def __init__(self, total, width=40, fmt=DEFAULT, symbol='=',\n","                 output=sys.stderr):\n","        assert len(symbol) == 1\n","\n","        self.total = total\n","        self.width = width\n","        self.symbol = symbol\n","        self.output = output\n","        self.fmt = re.sub(r'(?P<name>%\\(.+?\\))d',\n","            r'\\g<name>%dd' % len(str(total)), fmt)\n","\n","        self.current = 0\n","\n","    def __call__(self):\n","        percent = self.current / float(self.total)\n","        size = int(self.width * percent)\n","        remaining = self.total - self.current\n","        bar = '[' + self.symbol * size + ' ' * (self.width - size) + ']'\n","\n","        args = {\n","            'total': self.total,\n","            'bar': bar,\n","            'current': self.current,\n","            'percent': percent * 100,\n","            'remaining': remaining\n","        }\n","        print('\\r' + self.fmt % args, file=self.output, end='')\n","\n","    def done(self):\n","        self.current = self.total\n","        self()\n","        print('', file=self.output)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"VWHIVSGthvZL","colab_type":"code","colab":{}},"source":["import pandas as pd \n","\n","df1 = pd.read_csv(\"/content/drive/My Drive/Amhari/news___goolgule.csv\") \n","df2 = pd.read_csv(\"/content/drive/My Drive/Amhari/news___amhari___ethiopianregistrar.csv\") \n","df3 = pd.read_csv(\"/content/drive/My Drive/Amhari/news_amhari_ethsat.csv\") \n","df4 = pd.read_csv(\"/content/drive/My Drive/Amhari/news_amhari_ecadforum___.csv\") \n","df5 = pd.read_csv(\"/content/drive/My Drive/Amhari/news_zehabesha_archive.csv\") \n","df6 = pd.read_csv(\"/content/drive/My Drive/Amhari/news_amhari_selinum.csv\") \n","df7 = pd.read_csv(\"/content/drive/My Drive/Amhari/news_amhari_addisadmassnews.csv\") \n","\n","frames = [df1, df2, df3 , df4 , df5 , df6 ,df7]\n","result = pd.concat(frames)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"Vdr7-IjpiT0Q","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":204},"outputId":"016c06f6-7a24-4bc8-8ccd-ebb0d524d500","executionInfo":{"status":"ok","timestamp":1580478780351,"user_tz":-120,"elapsed":814,"user":{"displayName":"amr zaki","photoUrl":"","userId":"09456039094530776333"}}},"source":["result.head()"],"execution_count":27,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>title</th>\n","      <th>content</th>\n","      <th>link</th>\n","      <th>id</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>Test post</td>\n","      <td>This is a test post for testing links and othe...</td>\n","      <td>http://www.goolgule.com/2012/7/page/1/</td>\n","      <td>652</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>title</td>\n","      <td>content</td>\n","      <td>link</td>\n","      <td>id</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>ሰንደቅ፡ዓላማችንን</td>\n","      <td>አረንጓዴ፡ብጫና፡ቀይ፡ሰንደቅ፡ዓላማችን፡ የአንድ፡ኢትዮጵያና፡ የነፃነት፡ ም...</td>\n","      <td>http://www.goolgule.com/2012/9/page/1/</td>\n","      <td>652</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>“ለዚህ (መንግሥት) ታላቅ ክብር አለኝ”</td>\n","      <td>“ለኢትዮጵያ የጥምር መንግስት ያስፈልጋታል” በማለት በ1997 ምርጫ አጥብ...</td>\n","      <td>http://www.goolgule.com/2012/9/page/1/</td>\n","      <td>652</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>“ግራውንድ ሲቀነስ አንድ”</td>\n","      <td>(ቀጭኑ ዘ-ቄራ )\\nለአዲስ አበባ ከተማ ክብር ያላችሁ እንስማማለን። አዲ...</td>\n","      <td>http://www.goolgule.com/2012/9/page/1/</td>\n","      <td>652</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>"],"text/plain":["                       title  ...   id\n","0                  Test post  ...  652\n","1                      title  ...   id\n","2                ሰንደቅ፡ዓላማችንን  ...  652\n","3  “ለዚህ (መንግሥት) ታላቅ ክብር አለኝ”  ...  652\n","4           “ግራውንድ ሲቀነስ አንድ”  ...  652\n","\n","[5 rows x 4 columns]"]},"metadata":{"tags":[]},"execution_count":27}]},{"cell_type":"code","metadata":{"id":"dLpTqPsZiVzu","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"45fcb4a6-8fca-4130-9eb7-5ce9b9c55c0d","executionInfo":{"status":"ok","timestamp":1580473094716,"user_tz":-120,"elapsed":750,"user":{"displayName":"amr zaki","photoUrl":"","userId":"09456039094530776333"}}},"source":["len(result[\"title\"])"],"execution_count":4,"outputs":[{"output_type":"execute_result","data":{"text/plain":["74776"]},"metadata":{"tags":[]},"execution_count":4}]},{"cell_type":"code","metadata":{"id":"BD1dnw70iYyz","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"7a4f3206-0676-4c0f-b46d-d4b367259d96","executionInfo":{"status":"ok","timestamp":1580473435828,"user_tz":-120,"elapsed":769,"user":{"displayName":"amr zaki","photoUrl":"","userId":"09456039094530776333"}}},"source":["len(result.title.unique())"],"execution_count":10,"outputs":[{"output_type":"execute_result","data":{"text/plain":["55335"]},"metadata":{"tags":[]},"execution_count":10}]},{"cell_type":"code","metadata":{"id":"Yd7AwBoN7Tsi","colab_type":"code","colab":{}},"source":["!wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz\n","!pip install fasttext\n","import fasttext\n","model = fasttext.load_model('lid.176.ftz')"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"dNZP6SqQ7W_K","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"fd82a1dd-5526-4db9-a3eb-690a98470117","executionInfo":{"status":"ok","timestamp":1580479782826,"user_tz":-120,"elapsed":678,"user":{"displayName":"amr zaki","photoUrl":"","userId":"09456039094530776333"}}},"source":["print(model.predict('ዓላማችንን apple apple ዓላማችንን', k=2))  # top 2 matching languages"],"execution_count":42,"outputs":[{"output_type":"stream","text":["(('__label__am', '__label__cv'), array([0.90513647, 0.04793829]))\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"gMzdp7bC7srx","colab_type":"code","colab":{}},"source":["aa = model.predict('ዓላማችንን apple apple ዓላማችንን', k=2)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"QFYb-AWZ_C4B","colab_type":"code","colab":{}},"source":["import string\n","\n","special_punctuations = '''`÷×؛<>_()*&^%][ـ،/:\"؟.,'{}~¦+|!”…“–ـ»«'''\n","english_punctuations = string.punctuation\n","punctuations_list = special_punctuations + english_punctuations\n","\n","def remove_punctuations(text):\n","    translator = str.maketrans('', '', punctuations_list)\n","    return text.translate(translator)\n","\n","def clean_text(text):\n","    #trim    \n","    text = text.strip()\n","    \n","    text = remove_punctuations(text)\n","    \n","    search =  ['\\n'  , '\\t', '&quot;' ,'?'  ,'؟'  ,'!'   ]\n","    replace = [' , ' , ' ' ,  ' '     ,' ? ',' ? ',' ! ']\n","        \n","    for i in range(0, len(search)):\n","        text = text.replace(search[i], replace[i])\n","           \n","    text = text.replace(\".\", \" , \")\n","\n","    # remove numbers\n","    text = ''.join([i for i in text if not i.isdigit()])\n","    return text"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"IZkuaGDM_yZV","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":51},"outputId":"6dbdfecf-bbb8-479a-ed84-16e8dc1a3edf","executionInfo":{"status":"ok","timestamp":1580480992002,"user_tz":-120,"elapsed":804,"user":{"displayName":"amr zaki","photoUrl":"","userId":"09456039094530776333"}}},"source":["print(clean_text('ለዚህ (መንግሥት) ታላቅ ክብር አለኝ'))\n","print(clean_text('ለዚህ (መንግሥት) ታላቅ 7987 ክብር አለኝ'))"],"execution_count":77,"outputs":[{"output_type":"stream","text":["ለዚህ መንግሥት ታላቅ ክብር አለኝ\n","ለዚህ መንግሥት ታላቅ  ክብር አለኝ\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"Q83sl33QjSb5","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"f9699549-40ed-4c47-f3ba-54206129b8a6","executionInfo":{"status":"ok","timestamp":1580481251627,"user_tz":-120,"elapsed":145924,"user":{"displayName":"amr zaki","photoUrl":"","userId":"09456039094530776333"}}},"source":["df_clean = [] \n","titles = [] \n","progress = ProgressBar(len(result[\"title\"]), fmt=ProgressBar.FULL)\n","for index, row in result.iterrows():\n","  row['title']   = clean_text(str(row['title']))\n","  row['content'] = clean_text(str(row['content']))\n","  if str(row['title']) != \"nan\" and str(row['title']) != \"\":\n","    row['title'] = row['title'].replace(\"\\n\" , \" , \")\n","    detect = model.predict(row['title'], k=2)\n","    if detect[0][0] == \"__label__am\" and row['title'] not in titles:\n","      titles.append(row['title'])\n","      r={}\n","      r[\"title\"] = row['title']\n","      r[\"content\"] = row['content']\n","      df_clean.append(r)\n","    progress.current += 1\n","    progress()\n","progress.done()"],"execution_count":78,"outputs":[{"output_type":"stream","text":["[========================================] 74776/74776 (100%)     0 to go\n"],"name":"stderr"}]},{"cell_type":"code","metadata":{"id":"xWC684tp9ay7","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"7b1dbad9-a7e8-4223-c969-293dc25fcd77","executionInfo":{"status":"ok","timestamp":1580481252698,"user_tz":-120,"elapsed":1043,"user":{"displayName":"amr zaki","photoUrl":"","userId":"09456039094530776333"}}},"source":["print(len(df_clean))"],"execution_count":79,"outputs":[{"output_type":"stream","text":["53953\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"udy5baTnxFXA","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":204},"outputId":"8111cce6-943f-41ce-91c9-e3f43e8e01fb","executionInfo":{"status":"ok","timestamp":1580481252699,"user_tz":-120,"elapsed":1026,"user":{"displayName":"amr zaki","photoUrl":"","userId":"09456039094530776333"}}},"source":["amhari_clean = pd.DataFrame(df_clean)\n","amhari_clean.head()"],"execution_count":80,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>title</th>\n","      <th>content</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>ሰንደቅ፡ዓላማችንን</td>\n","      <td>አረንጓዴ፡ብጫና፡ቀይ፡ሰንደቅ፡ዓላማችን፡ የአንድ፡ኢትዮጵያና፡ የነፃነት፡ ም...</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>ለዚህ መንግሥት ታላቅ ክብር አለኝ</td>\n","      <td>ለኢትዮጵያ የጥምር መንግስት ያስፈልጋታል በማለት በ ምርጫ አጥብቆ ሲከራከ...</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>ግራውንድ ሲቀነስ አንድ</td>\n","      <td>ቀጭኑ ዘቄራ  , ለአዲስ አበባ ከተማ ክብር ያላችሁ እንስማማለን። አዲስ ...</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>ዶላር አዘዋዋሪዎቹ ደላሎች</td>\n","      <td>ህወሓት የአጋር ፓርቲዎቹን የንግድ ተቋማትና ልማታዊ ባለሀብት እያለ የሚጠ...</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>ታላቅ ዕድል ነው</td>\n","      <td>ዛሬ አርብ ልክ ከጠዋቱ ፡ ላይ  የኢህአዴግ ወኪሎችና ኢህአዴግን እንደሚደ...</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>"],"text/plain":["                   title                                            content\n","0            ሰንደቅ፡ዓላማችንን  አረንጓዴ፡ብጫና፡ቀይ፡ሰንደቅ፡ዓላማችን፡ የአንድ፡ኢትዮጵያና፡ የነፃነት፡ ም...\n","1  ለዚህ መንግሥት ታላቅ ክብር አለኝ  ለኢትዮጵያ የጥምር መንግስት ያስፈልጋታል በማለት በ ምርጫ አጥብቆ ሲከራከ...\n","2         ግራውንድ ሲቀነስ አንድ  ቀጭኑ ዘቄራ  , ለአዲስ አበባ ከተማ ክብር ያላችሁ እንስማማለን። አዲስ ...\n","3       ዶላር አዘዋዋሪዎቹ ደላሎች  ህወሓት የአጋር ፓርቲዎቹን የንግድ ተቋማትና ልማታዊ ባለሀብት እያለ የሚጠ...\n","4             ታላቅ ዕድል ነው  ዛሬ አርብ ልክ ከጠዋቱ ፡ ላይ  የኢህአዴግ ወኪሎችና ኢህአዴግን እንደሚደ..."]},"metadata":{"tags":[]},"execution_count":80}]},{"cell_type":"code","metadata":{"id":"GbAmfL1277W7","colab_type":"code","colab":{}},"source":["amhari_clean.to_csv(\"/content/drive/My Drive/Amhari/amhari_cleaner.csv\", mode='w',index = False,encoding='utf-8')"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"v8iiIabR78Mi","colab_type":"code","colab":{}},"source":[""],"execution_count":0,"outputs":[]}]}


--------------------------------------------------------------------------------
/Amharic/Readme.md:
--------------------------------------------------------------------------------
1 | # Amharic Abstractive Text Summarization
2 | data, pre-trained models and results can be found [here](https://drive.google.com/open?id=1Vjv3mDuATtaljGKY9Ic3UGcQci99fiUD).
3 | 


--------------------------------------------------------------------------------
/Arabic/README.md:
--------------------------------------------------------------------------------
 1 | # Datasets and word-embedding
 2 | 
 3 | ## Basic Cleaning 
 4 | ### Dataset 
 5 | [Arabic News](https://drive.google.com/file/d/12Lqej0BcPelRQ81ewYrqkIl2xzfQald8/view?usp=sharing)
 6 | 
 7 | ### Word-embedding 
 8 | [model](https://drive.google.com/file/d/1yq4ewNJgib3j3p2KDffSFJOK8MwKch_L/view?usp=sharing)
 9 | 
10 | [.npy](https://drive.google.com/file/d/18keYjngW35nDBod75Qc2AKjIIrU84ubd/view?usp=sharing)
11 | 
12 | ------------------------------
13 | 
14 | ## Advanced Cleaning
15 | ### Dataset
16 | [Arabic News](https://drive.google.com/file/d/10vjPdkQqyFug95WY3lJE-b4yV7hSdjQe/view?usp=sharing)
17 | ### Word-embedding 
18 | [model](https://drive.google.com/file/d/1zFj_W0on8yD9ZF2i80SLjATvOOPfulQq/view?usp=sharing)
19 | 
20 | [.npy](https://drive.google.com/file/d/1RuI4NozNqQ7WLi9BtfKOnBltz5DyxCkM/view?usp=sharing)


--------------------------------------------------------------------------------
/Hindi/1_NewsCrawler (googel colab).ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "nbformat": 4,
  3 |   "nbformat_minor": 0,
  4 |   "metadata": {
  5 |     "colab": {
  6 |       "name": "Hindi NewsCrawler.ipynb",
  7 |       "provenance": [],
  8 |       "collapsed_sections": [
  9 |         "EqSXhVKvfXlA"
 10 |       ]
 11 |     },
 12 |     "kernelspec": {
 13 |       "name": "python3",
 14 |       "display_name": "Python 3"
 15 |     }
 16 |   },
 17 |   "cells": [
 18 |     {
 19 |       "cell_type": "code",
 20 |       "metadata": {
 21 |         "id": "2MMYj8iMQBU7",
 22 |         "colab_type": "code",
 23 |         "colab": {}
 24 |       },
 25 |       "source": [
 26 |         "from google.colab import drive\n",
 27 |         "drive.mount('/content/drive')"
 28 |       ],
 29 |       "execution_count": 0,
 30 |       "outputs": []
 31 |     },
 32 |     {
 33 |       "cell_type": "code",
 34 |       "metadata": {
 35 |         "id": "DZoPOVT5xRBR",
 36 |         "colab_type": "code",
 37 |         "colab": {}
 38 |       },
 39 |       "source": [
 40 |         "!pip3 install news-please   #https://github.com/fhamborg/news-please"
 41 |       ],
 42 |       "execution_count": 0,
 43 |       "outputs": []
 44 |     },
 45 |     {
 46 |       "cell_type": "markdown",
 47 |       "metadata": {
 48 |         "id": "FtiRZuhioVcs",
 49 |         "colab_type": "text"
 50 |       },
 51 |       "source": [
 52 |         "##Config\n",
 53 |         "\n",
 54 |         "save these configurations under /root/news-please-repo/config , in google colab when you open the files tab, go up one level, then create folder under the root directory name it \"news-please-repo\" then under it create folder named \"config\"\n"
 55 |       ]
 56 |     },
 57 |     {
 58 |       "cell_type": "markdown",
 59 |       "metadata": {
 60 |         "id": "EqSXhVKvfXlA",
 61 |         "colab_type": "text"
 62 |       },
 63 |       "source": [
 64 |         "### config.cfg\n",
 65 |         "\n",
 66 |         "save this cell to a file called /root/news-please-repo/config/config.cfg"
 67 |       ]
 68 |     },
 69 |     {
 70 |       "cell_type": "code",
 71 |       "metadata": {
 72 |         "id": "NIvXgX0IoXwR",
 73 |         "colab_type": "code",
 74 |         "colab": {}
 75 |       },
 76 |       "source": [
 77 |         "# IMPORTANT\n",
 78 |         "# All variables get parsed to the correct python-types (if not other declared)!\n",
 79 |         "# So bools have to be True or False (uppercase-first),\n",
 80 |         "# Floats need dots . (not comma)\n",
 81 |         "# Ints are just normal ints\n",
 82 |         "# dicts need to be like this { key: value }\n",
 83 |         "# arrays need to be like this [ value1, value2, value3 ]\n",
 84 |         "# All values in dicts and arrays will also be parsed.\n",
 85 |         "# Everything that does not match any of the above criteria will be parsed as string.\n",
 86 |         "\n",
 87 |         "\n",
 88 |         "\n",
 89 |         "[Crawler]\n",
 90 |         "\n",
 91 |         "# GENERAL\n",
 92 |         "# -------\n",
 93 |         "\n",
 94 |         "# Crawling heuristics\n",
 95 |         "# Default Crawlers:\n",
 96 |         "# Possibilities: RecursiveCrawler, RecursiveSitemapCrawler, RssCrawler, SitemapCrawler, Download (./newsplease/crawler/spiders/-dir)\n",
 97 |         "# default: SitemapCrawler\n",
 98 |         "default = SitemapCrawler\n",
 99 |         "\n",
100 |         "# default:\n",
101 |         "# fallbacks = {\n",
102 |         "#     \"RssCrawler\": None,\n",
103 |         "#     \"RecursiveSitemapCrawler\": \"RecursiveCrawler\",\n",
104 |         "#     \"SitemapCrawler\": \"RecursiveCrawler\",\n",
105 |         "#     \"RecursiveCrawler\": None,\n",
106 |         "#     \"Download\": None\n",
107 |         "#     }\n",
108 |         "fallbacks = {\n",
109 |         "    \"RssCrawler\": None,\n",
110 |         "    \"RecursiveSitemapCrawler\": \"RecursiveCrawler\",\n",
111 |         "    \"SitemapCrawler\": \"RecursiveCrawler\",\n",
112 |         "    \"RecursiveCrawler\": None,\n",
113 |         "    \"Download\": None\n",
114 |         "    }\n",
115 |         "\n",
116 |         "# Determines how many hours need to pass since the last download of a webpage\n",
117 |         "# to be downloaded again by the RssCrawler\n",
118 |         "# default: 6\n",
119 |         "hours_to_pass_for_redownload_by_rss_crawler = 6\n",
120 |         "\n",
121 |         "\n",
122 |         "\n",
123 |         "# PROCESSES\n",
124 |         "# ---------\n",
125 |         "\n",
126 |         "# Number of crawlers, that should crawl parallel\n",
127 |         "# not counting in daemonized crawlers\n",
128 |         "# default: 5\n",
129 |         "number_of_parallel_crawlers = 5\n",
130 |         "\n",
131 |         "# Number of daemons, will be added to daemons.\n",
132 |         "# default: 10\n",
133 |         "number_of_parallel_daemons = 10\n",
134 |         "\n",
135 |         "\n",
136 |         "\n",
137 |         "# SPECIAL CASES\n",
138 |         "# -------------\n",
139 |         "\n",
140 |         "# urls which end on any of the following file extensions are ignored for recursive crawling\n",
141 |         "# default: \"(pdf)|(docx?)|(xlsx?)|(pptx?)|(epub)|(jpe?g)|(png)|(bmp)|(gif)|(tiff)|(webp)|(avi)|(mpe?g)|(mov)|(qt)|(webm)|(ogg)|(midi)|(mid)|(mp3)|(wav)|(zip)|(rar)|(exe)|(apk)|(css)\"\n",
142 |         "ignore_file_extensions = \"(pdf)|(docx?)|(xlsx?)|(pptx?)|(epub)|(jpe?g)|(png)|(bmp)|(gif)|(tiff)|(webp)|(avi)|(mpe?g)|(mov)|(qt)|(webm)|(ogg)|(midi)|(mid)|(mp3)|(wav)|(zip)|(rar)|(exe)|(apk)|(css)\"\n",
143 |         "\n",
144 |         "# urls which match the following regex are ignored for recursive crawling\n",
145 |         "# default: \"\"\n",
146 |         "ignore_regex = \"\"\n",
147 |         "\n",
148 |         "# Crawl the sitemaps of subdomains (if sitemap is enabled)\n",
149 |         "# If True, any SitemapCrawler will try to crawl on the sitemap of the given domain including subdomains instead of a domain's main sitemap.\n",
150 |         "# e.g. if True, a SitemapCrawler to be started on https://blog.zeit.de will try to crawl on the sitemap listed in http://blog.zeit.de/robots.txt. If not found, it will fall back to the False setting.\n",
151 |         "#      if False, a SitemapCrawler to be started on https://blog.zeit.de will try to crawl on the sitemap listed in http://zeit.de/robots.txt\n",
152 |         "# default: True\n",
153 |         "sitemap_allow_subdomains = True\n",
154 |         "\n",
155 |         "\n",
156 |         "\n",
157 |         "[Heuristics]\n",
158 |         "\n",
159 |         "# Enabled heuristics,\n",
160 |         "# Currently:\n",
161 |         "#    - og_type\n",
162 |         "#    - linked_headlines\n",
163 |         "#    - self_linked_headlines\n",
164 |         "#    - is_not_from_subdomain (with this setting enabled, it can be assured that only pages that aren't from a subdomain are downloaded)\n",
165 |         "#    - meta_contains_article_keyword\n",
166 |         "#    - crawler_contains_only_article_alikes\n",
167 |         "# (maybe not up-to-date, see ./newsplease/helper_classes/heursitics.py:\n",
168 |         "#  Every method not starting with __ should be a heuristic, except is_article)\n",
169 |         "# These heuristics can be overwritten by sitelist.json for each site\n",
170 |         "# default: {\"og_type\": True, \"linked_headlines\": \"<=0.65\", \"self_linked_headlines\": \"<=0.56\"}\n",
171 |         "enabled_heuristics = {\"og_type\": True, \"linked_headlines\": \"<=0.65\", \"self_linked_headlines\": \"<=0.56\"}\n",
172 |         "\n",
173 |         "# Heuristics can be combined with others\n",
174 |         "# The heuristics need to have the same name as in enabled_heuristics\n",
175 |         "# Possible condition-characters / literals are: (, ), not, and, or\n",
176 |         "# All heuristics used here need to be enabled in enabled_heuristics as well!\n",
177 |         "# Examples:\n",
178 |         "#     \"og_type and (self_linked_headlines or linked_headlines)\"\n",
179 |         "#     \"og_type\"\n",
180 |         "# default: \"og_type and (linked_headlines or self_linked_headlines)\"\n",
181 |         "pass_heuristics_condition = \"og_type and (linked_headlines or self_linked_headlines)\"\n",
182 |         "\n",
183 |         "# The maximum ratio of headlines divided by linked_headlines in a file\n",
184 |         "\n",
185 |         "# The minimum number of headlines in a file to check for the ratio\n",
186 |         "# If less then this number are in the file, the file will pass the test.\n",
187 |         "# default: 5\n",
188 |         "min_headlines_for_linked_test = 5\n",
189 |         "\n",
190 |         "\n",
191 |         "\n",
192 |         "[Files]\n",
193 |         "\n",
194 |         "# GENERAL:\n",
195 |         "# -------\n",
196 |         "\n",
197 |         "# Paths:\n",
198 |         "# toggles relative paths to be relative to the start_processes.py script (True) or relative to this config file (False)\n",
199 |         "# This does not work for this config's 'Scrapy' section which is always relative to the dir the start_processes.py script is called from\n",
200 |         "# Default: True\n",
201 |         "relative_to_start_processes_file = True\n",
202 |         "\n",
203 |         "\n",
204 |         "\n",
205 |         "# INPUT:\n",
206 |         "# -----\n",
207 |         "\n",
208 |         "# Here you can specify the input JSON-Filename\n",
209 |         "# default: sitelist.hjson\n",
210 |         "url_input_file_name = sitelist.hjson\n",
211 |         "\n",
212 |         "\n",
213 |         "\n",
214 |         "# OUTPUT:\n",
215 |         "# ------\n",
216 |         "\n",
217 |         "# Toggles whether leading './' or '.\\' from above local_data_directory should be removed when saving the path into the Database\n",
218 |         "# True: ./data would become data\n",
219 |         "# default: True\n",
220 |         "working_path = \"/content/drive/My Drive/Hindi_News\"\n",
221 |         "\n",
222 |         "# Following Strings in the local_data_directory will be replaced: (md5 hashes have a standard length of 32 chars)\n",
223 |         "#\n",
224 |         "# %working_path                           = the path specified in OUTPUT[\"working_path\"]\n",
225 |         "# %time_download(<code>)                  = current time at download; will be replaced with strftime(<code>) where <code> is a string, explained further here: http://strftime.org/\n",
226 |         "# %time_execution(<code>)                 = current time at execution; will be replaced with strftime(<code>) where <code> is a string, explained further here: http://strftime.org/\n",
227 |         "# %timestamp_download                     = current time at download; unix-timestamp\n",
228 |         "# %timestamp_execution                    = current time at execution; unix-timestamp\n",
229 |         "# %domain(<size>)                         = first <size> chars of the domain of the crawled file (e.g. zeit.de)\n",
230 |         "# %appendmd5_domain(<size>)               = appends the md5 to %domain(<<size> - 32 (md5 length) - 1 (_ as separator)>) if domain is longer than <size>\n",
231 |         "# %md5_domain(<size>)                     = first <size> chars of md5 hash of %domain\n",
232 |         "# %full_domain(<size>)                    = first <size> chars of the domain including subdomains (e.g. panamapapers.sueddeutsche.de)\n",
233 |         "# %appendmd5_full_domain(<size>)          = appends the md5 to %full_domain(<<size> - 32 (md5 length) - 1 (_ as separator)>) if full_domain is longer than <size>\n",
234 |         "# %md5_full_domain(<size>)                = first <size> chars of md5 hash of %full_domain\n",
235 |         "# %subdomains(<size>)                     = first <size> chars of the domain's subdomains\n",
236 |         "# %appendmd5_subdomains(<size>)           = appends the md5 to %subdomains(<<size> - 32 (md5 length) - 1 (_ as separator)>) if subdomains is longer than <size>\n",
237 |         "# %md5_subdomains(<size>)                 = first <size> chars of md5 hash of %subdomains\n",
238 |         "# %url_directory_string(<size>)           = first <size> chars of the directories on the server (e.g. http://panamapapers.sueddeutsche.de/articles/56f2c00da1bb8d3c3495aa0a/ would evaluate to articles_56f2c00da1bb8d3c3495aa0a), no filename\n",
239 |         "# %appendmd5_url_directory_string(<size>) = appends the md5 to %url_directory_string(<<size> - 32 (md5 length) - 1 (_ as separator)>) if url_directory_string is longer than <size>\n",
240 |         "# %md5_url_directory_string(<size>)       = first <size> chars of md5 hash of %url_directory_string(<size>)\n",
241 |         "# %url_file_name(<size>)                  = first <size> chars of the file name (without type) on the server (e.g. http://www.spiegel.de/wirtschaft/soziales/ttip-dokumente-leak-koennte-ende-der-geheimhaltung-markieren-a-1090466.html would evaluate to ttip-dokumente-leak-koennte-ende-der-geheimhaltung-markieren-a-1090466, No filenames (indexes) will evaluate to index\n",
242 |         "# %md5_url_file_name(<size>)              = first <size> chars of md5 hash of %url_file_name\n",
243 |         "# %max_url_file_name                      = first x chars of %url_file_name, so the entire savepath has a length of the max possible length for a windows file system (260 characters - 1 <NUL>)\n",
244 |         "# %appendmd5_max_url_file_name            = appends the md5 to the first x - 32 (md5 length) - 1 (_ as separator) chars of %url_file_name if the entire savepath has a length longer than the max possible length for a windows file system (260 characters - 1 <NUL>)\n",
245 |         "#\n",
246 |         "# This path can be relative or absolute, though to be able to easily merge multiple data sets, it should be kept relative and consistent on all datasets.\n",
247 |         "# To be able to use cleanup commands, it should also start with a static folder name like 'data'.\n",
248 |         "#\n",
249 |         "# default: %working_path/data/%time_execution(%Y)/%time_execution(%m)/%time_execution(%d)/%appendmd5_full_domain(32)/%appendmd5_url_directory_string(60)_%appendmd5_max_url_file_name_%timestamp_download.html\n",
250 |         "local_data_directory = %working_path/data/%appendmd5_full_domain(32)/news%timestamp_download.html\n",
251 |         "\n",
252 |         "\n",
253 |         "# Toggles whether leading './' or '.\\' from above local_data_directory should be removed when saving the path into the Database\n",
254 |         "# True: ./data would become data\n",
255 |         "# default: True\n",
256 |         "format_relative_path = True\n",
257 |         "\n",
258 |         "\n",
259 |         "\n",
260 |         "[MySQL]\n",
261 |         "\n",
262 |         "# MySQL-Connection required for saving meta-informations\n",
263 |         "host = localhost\n",
264 |         "port = 3306\n",
265 |         "db = 'news-please'\n",
266 |         "username = 'root'\n",
267 |         "password = 'password'\n",
268 |         "\n",
269 |         "\n",
270 |         "\n",
271 |         "[Elasticsearch]\n",
272 |         "\n",
273 |         "# Elasticsearch-Connection required for saving detailed meta-information\n",
274 |         "host = localhost\n",
275 |         "port = 9200\n",
276 |         "index_current = 'news-please'\n",
277 |         "index_archive = 'news-please-archive'\n",
278 |         "\n",
279 |         "# Elasticsearch supports user authentication by CA certificates. If your database is protected by certificate\n",
280 |         "# fill in the following parameters, otherwise you can ignore them.\n",
281 |         "use_ca_certificates = False\n",
282 |         "ca_cert_path = /path/to/cacert.pem\n",
283 |         "client_cert_path = /path/to/client_cert.pem\n",
284 |         "client_key_path = /path/to/client_key.pem\n",
285 |         "username = 'root'\n",
286 |         "secret = 'password'\n",
287 |         "\n",
288 |         "# Properties of the document type used for storage.\n",
289 |         "mapping = {\"properties\": {\n",
290 |         "    \"url\": {\"type\": \"text\",\"fields\":{\"keyword\":{\"type\":\"keyword\"}}},\n",
291 |         "    \"source_domain\": {\"type\": \"text\",\"fields\":{\"keyword\":{\"type\":\"keyword\"}}},\n",
292 |         "    \"title_page\": {\"type\": \"text\",\"fields\":{\"keyword\":{\"type\":\"keyword\"}}},\n",
293 |         "    \"title_rss\": {\"type\": \"text\",\"fields\":{\"keyword\":{\"type\":\"keyword\"}}},\n",
294 |         "    \"localpath\": {\"type\": \"text\",\"fields\":{\"keyword\":{\"type\":\"keyword\"}}},\n",
295 |         "    \"filename\": {\"type\": \"keyword\"},\n",
296 |         "    \"ancestor\": {\"type\": \"keyword\"},\n",
297 |         "    \"descendant\": {\"type\": \"keyword\"},\n",
298 |         "    \"version\": {\"type\": \"long\"},\n",
299 |         "    \"date_download\": {\"type\": \"date\", \"format\":\"yyyy-MM-dd HH:mm:ss\"},\n",
300 |         "    \"date_modify\": {\"type\": \"date\", \"format\":\"yyyy-MM-dd HH:mm:ss\"},\n",
301 |         "    \"date_publish\": {\"type\": \"date\", \"format\":\"yyyy-MM-dd HH:mm:ss\"},\n",
302 |         "    \"title\": {\"type\": \"text\",\"fields\":{\"keyword\":{\"type\":\"keyword\"}}},\n",
303 |         "    \"description\":  {\"type\": \"text\",\"fields\":{\"keyword\":{\"type\":\"keyword\"}}},\n",
304 |         "    \"text\": {\"type\": \"text\"},\n",
305 |         "    \"authors\": {\"type\": \"text\",\"fields\":{\"keyword\":{\"type\":\"keyword\"}}},\n",
306 |         "    \"image_url\":  {\"type\": \"text\",\"fields\":{\"keyword\":{\"type\":\"keyword\"}}},\n",
307 |         "    \"language\": {\"type\": \"keyword\"}\n",
308 |         "    }}\n",
309 |         "\n",
310 |         "\n",
311 |         "\n",
312 |         "[ArticleMasterExtractor]\n",
313 |         "\n",
314 |         "# Choose which extractors you want to use.\n",
315 |         "#\n",
316 |         "# The Default is ['newspaper_extractor', 'readability_extractor', 'date_extractor', 'lang_detect_extractor'],\n",
317 |         "# which are all integrated extractors right now.\n",
318 |         "# Possibly extractors are 'newspaper_extractor' , 'readability_extractor' , 'date_extractor_extractor and 'lang_detect_extractor'\n",
319 |         "# Examples: -Only Newspaper and date_extractor: extractors = ['newspaper', 'date_extractor']\n",
320 |         "#           -Only Newspaper: extractors = ['newspaper']\n",
321 |         "extractors = ['newspaper_extractor', 'readability_extractor', 'date_extractor', 'lang_detect_extractor']\n",
322 |         "\n",
323 |         "\n",
324 |         "\n",
325 |         "[DateFilter]\n",
326 |         "\n",
327 |         "# If added to the pipeline, this module provides the means to filter the extracted articles based on the publishing date.\n",
328 |         "# Therefore this module has to be placed after the KM4 article extractor to access the publishing dates.\n",
329 |         "#\n",
330 |         "# All articles, with a publishing date outside of the given time interval are dropped. The dates used to specify the\n",
331 |         "# time interval are included and should follow this format: 'yyyy-mm-dd hh:mm:ss'.\n",
332 |         "#\n",
333 |         "# It is also possible to only define one date, assigning the other variable the value 'None' to create an half-bounded\n",
334 |         "# interval.\n",
335 |         "\n",
336 |         "start_date = '1999-01-01 00:00:00'\n",
337 |         "end_date = '2999-12-31 00:00:00'\n",
338 |         "\n",
339 |         "# If 'True' articles without a publishing date are dropped.\n",
340 |         "strict_mode = False\n",
341 |         "\n",
342 |         "\n",
343 |         "\n",
344 |         "[Scrapy]\n",
345 |         "\n",
346 |         "# Possible levels (must be UC-only): CRITICAL, ERROR, WARNING, INFO, DEBUG\n",
347 |         "# default: WARNING\n",
348 |         "LOG_LEVEL = INFO\n",
349 |         "\n",
350 |         "# logformat, see https://docs.python.org/2/library/logging.html#logrecord-attributes\n",
351 |         "# default: [%(name)s:%(lineno)d|%(levelname)s] %(message)s\n",
352 |         "LOG_FORMAT = [%(name)s:%(lineno)d|%(levelname)s] %(message)s\n",
353 |         "\n",
354 |         "# Can be a filename or None\n",
355 |         "# default: None\n",
356 |         "LOG_FILE = None\n",
357 |         "\n",
358 |         "LOG_DATEFORMAT = %Y-%m-%d %H:%M:%S\n",
359 |         "\n",
360 |         "LOG_STDOUT = False\n",
361 |         "\n",
362 |         "LOG_ENCODING = utf-8\n",
363 |         "\n",
364 |         "BOT_NAME = 'news-please'\n",
365 |         "\n",
366 |         "SPIDER_MODULES = ['newsplease.crawler.spiders']\n",
367 |         "NEWSPIDER_MODULE = 'newsplease.crawler.spiders'\n",
368 |         "\n",
369 |         "# Resume/Pause functionality activation\n",
370 |         "# default: .resume_jobdir\n",
371 |         "JOBDIRNAME = .resume_jobdir\n",
372 |         "\n",
373 |         "# Respect robots.txt activation\n",
374 |         "# default: True\n",
375 |         "ROBOTSTXT_OBEY=True\n",
376 |         "\n",
377 |         "# Maximum number of concurrent requests across all domains\n",
378 |         "# default: 16\n",
379 |         "# IMPORTANT: This setting does not work since each crawler has its own scrapy instance, but it might limit the concurrent_requests_per_domain if said setting has a higher number set than this one.\n",
380 |         "CONCURRENT_REQUESTS=16\n",
381 |         "\n",
382 |         "# Maximum number of active requests per domain\n",
383 |         "# default: 4\n",
384 |         "CONCURRENT_REQUESTS_PER_DOMAIN=4\n",
385 |         "\n",
386 |         "# User-agent activation\n",
387 |         "# default: 'news-please (+http://www.example.com/)'\n",
388 |         "USER_AGENT = 'news-please (+http://www.example.com/)'\n",
389 |         "\n",
390 |         "# Pipeline activation\n",
391 |         "# Syntax: '<relative location>.<Pipeline name>': <Order of execution from 0-1000>\n",
392 |         "# default: {'newsplease.pipeline.pipelines.ArticleMasterExtractor':100, 'newsplease.crawler.pipeline.HtmlFileStorage':200, 'newsplease.pipeline.pipelines.JsonFileStorage': 300}\n",
393 |         "# Further options: 'newsplease.pipeline.pipelines.ElasticsearchStorage': 350\n",
394 |         "ITEM_PIPELINES = {'newsplease.pipeline.pipelines.ArticleMasterExtractor':100,\n",
395 |         "                  ##'newsplease.pipeline.pipelines.HtmlFileStorage':200,\n",
396 |         "                  'newsplease.pipeline.pipelines.JsonFileStorage':300\n",
397 |         "                  }\n",
398 |         "\n",
399 |         "[Pandas]\n",
400 |         "file_name = \"PandasStorage\""
401 |       ],
402 |       "execution_count": 0,
403 |       "outputs": []
404 |     },
405 |     {
406 |       "cell_type": "markdown",
407 |       "metadata": {
408 |         "id": "_LYp4_Crfhij",
409 |         "colab_type": "text"
410 |       },
411 |       "source": [
412 |         "### sitelist.hjson\n",
413 |         "\n",
414 |         "save this cell to a file called /root/news-please-repo/config/sitelist.hjson"
415 |       ]
416 |     },
417 |     {
418 |       "cell_type": "markdown",
419 |       "metadata": {
420 |         "id": "eP4S5BoLlWVM",
421 |         "colab_type": "text"
422 |       },
423 |       "source": [
424 |         "here we scrap from 9 websites, to add new websites, these are good links that would prove helpful\n",
425 |         "\n",
426 |         "https://bharatdiscovery.org/india/%E0%A4%B8%E0%A4%AE%E0%A4%BE%E0%A4%9A%E0%A4%BE%E0%A4%B0_%E0%A4%AA%E0%A4%A4%E0%A5%8D%E0%A4%B0\n",
427 |         "https://bharatdiscovery.org/india/%E0%A4%A8%E0%A4%88%E0%A4%A6%E0%A5%81%E0%A4%A8%E0%A4%BF%E0%A4%AF%E0%A4%BE\n",
428 |         "https://bharatdiscovery.org/india/%E0%A4%86%E0%A4%9C\n",
429 |         "https://www.naidunia.com/latest-news"
430 |       ]
431 |     },
432 |     {
433 |       "cell_type": "code",
434 |       "metadata": {
435 |         "id": "j6xAbovLocSK",
436 |         "colab_type": "code",
437 |         "colab": {}
438 |       },
439 |       "source": [
440 |         "# This is a HJSON-File, so comments and so on can be used! See https://hjson.org/\n",
441 |         "# Furthermore this is first of all the actual config file, but as default just filled with examples.\n",
442 |         "{\n",
443 |         "  # Every URL has to be in an array-object in \"base_urls\".\n",
444 |         "  # The same URL in combination with the same crawler may only appear once in this array.\n",
445 |         "  \"base_urls\" : [\n",
446 |         "\t{\n",
447 |         "      # Start crawling from timesofindia\n",
448 |         "      \"url\": \"https://www.livehindustan.com/\",\n",
449 |         "\n",
450 |         "      # Overwrite the default crawler and use th RecursiveCrawler instead\n",
451 |         "      \"crawler\": \"RecursiveCrawler\",\n",
452 |         "\n",
453 |         "      # Because this site is weirt, use the\n",
454 |         "      # meta_contains_article_keyword-heuristic and disable all others because\n",
455 |         "      # overwrite will merge the defaults from \"newscrawler.cfg\" with\n",
456 |         "      # this\n",
457 |         "      \"overwrite_heuristics\": {\n",
458 |         "        \"meta_contains_article_keyword\": true,\n",
459 |         "        \"og_type\": false,\n",
460 |         "        \"linked_headlines\": false,\n",
461 |         "        \"self_linked_headlines\": false\n",
462 |         "      },\n",
463 |         "      # Also state that in the condition, all heuristics used in the condition\n",
464 |         "      # have to be activated in \"overwrite_heuristics\" (or default) as well.\n",
465 |         "      \"pass_heuristics_condition\": \"meta_contains_article_keyword\"\n",
466 |         "    },\n",
467 |         "\t{\n",
468 |         "      # Start crawling from timesofindia\n",
469 |         "      \"url\": \"https://www.jagran.com/\",\n",
470 |         "\n",
471 |         "      # Overwrite the default crawler and use th RecursiveCrawler instead\n",
472 |         "      \"crawler\": \"RecursiveCrawler\",\n",
473 |         "\n",
474 |         "      # Because this site is weirt, use the\n",
475 |         "      # meta_contains_article_keyword-heuristic and disable all others because\n",
476 |         "      # overwrite will merge the defaults from \"newscrawler.cfg\" with\n",
477 |         "      # this\n",
478 |         "      \"overwrite_heuristics\": {\n",
479 |         "        \"meta_contains_article_keyword\": true,\n",
480 |         "        \"og_type\": false,\n",
481 |         "        \"linked_headlines\": false,\n",
482 |         "        \"self_linked_headlines\": false\n",
483 |         "      },\n",
484 |         "      # Also state that in the condition, all heuristics used in the condition\n",
485 |         "      # have to be activated in \"overwrite_heuristics\" (or default) as well.\n",
486 |         "      \"pass_heuristics_condition\": \"meta_contains_article_keyword\"\n",
487 |         "    },\n",
488 |         "\t{\n",
489 |         "      # Start crawling from timesofindia\n",
490 |         "      \"url\": \"https://aajtak.intoday.in/\",\n",
491 |         "\n",
492 |         "      # Overwrite the default crawler and use th RecursiveCrawler instead\n",
493 |         "      \"crawler\": \"RecursiveCrawler\",\n",
494 |         "\n",
495 |         "      # Because this site is weirt, use the\n",
496 |         "      # meta_contains_article_keyword-heuristic and disable all others because\n",
497 |         "      # overwrite will merge the defaults from \"newscrawler.cfg\" with\n",
498 |         "      # this\n",
499 |         "      \"overwrite_heuristics\": {\n",
500 |         "        \"meta_contains_article_keyword\": true,\n",
501 |         "        \"og_type\": false,\n",
502 |         "        \"linked_headlines\": false,\n",
503 |         "        \"self_linked_headlines\": false\n",
504 |         "      },\n",
505 |         "      # Also state that in the condition, all heuristics used in the condition\n",
506 |         "      # have to be activated in \"overwrite_heuristics\" (or default) as well.\n",
507 |         "      \"pass_heuristics_condition\": \"meta_contains_article_keyword\"\n",
508 |         "    },\n",
509 |         "\t{\n",
510 |         "      # Start crawling from timesofindia\n",
511 |         "      \"url\": \"http://money.bhaskar.com/\",\n",
512 |         "\n",
513 |         "      # Overwrite the default crawler and use th RecursiveCrawler instead\n",
514 |         "      \"crawler\": \"RecursiveCrawler\",\n",
515 |         "\n",
516 |         "      # Because this site is weirt, use the\n",
517 |         "      # meta_contains_article_keyword-heuristic and disable all others because\n",
518 |         "      # overwrite will merge the defaults from \"newscrawler.cfg\" with\n",
519 |         "      # this\n",
520 |         "      \"overwrite_heuristics\": {\n",
521 |         "        \"meta_contains_article_keyword\": true,\n",
522 |         "        \"og_type\": false,\n",
523 |         "        \"linked_headlines\": false,\n",
524 |         "        \"self_linked_headlines\": false\n",
525 |         "      },\n",
526 |         "      # Also state that in the condition, all heuristics used in the condition\n",
527 |         "      # have to be activated in \"overwrite_heuristics\" (or default) as well.\n",
528 |         "      \"pass_heuristics_condition\": \"meta_contains_article_keyword\"\n",
529 |         "    },\n",
530 |         "\t{\n",
531 |         "      # Start crawling from timesofindia\n",
532 |         "      \"url\": \"http://bhaskar.com/\",\n",
533 |         "\n",
534 |         "      # Overwrite the default crawler and use th RecursiveCrawler instead\n",
535 |         "      \"crawler\": \"RecursiveCrawler\",\n",
536 |         "\n",
537 |         "      # Because this site is weirt, use the\n",
538 |         "      # meta_contains_article_keyword-heuristic and disable all others because\n",
539 |         "      # overwrite will merge the defaults from \"newscrawler.cfg\" with\n",
540 |         "      # this\n",
541 |         "      \"overwrite_heuristics\": {\n",
542 |         "        \"meta_contains_article_keyword\": true,\n",
543 |         "        \"og_type\": false,\n",
544 |         "        \"linked_headlines\": false,\n",
545 |         "        \"self_linked_headlines\": false\n",
546 |         "      },\n",
547 |         "      # Also state that in the condition, all heuristics used in the condition\n",
548 |         "      # have to be activated in \"overwrite_heuristics\" (or default) as well.\n",
549 |         "      \"pass_heuristics_condition\": \"meta_contains_article_keyword\"\n",
550 |         "    },\n",
551 |         "\t{\n",
552 |         "      # Start crawling from timesofindia\n",
553 |         "      \"url\": \"https://navbharattimes.indiatimes.com/\",\n",
554 |         "\n",
555 |         "      # Overwrite the default crawler and use th RecursiveCrawler instead\n",
556 |         "      \"crawler\": \"RecursiveCrawler\",\n",
557 |         "\n",
558 |         "      # Because this site is weirt, use the\n",
559 |         "      # meta_contains_article_keyword-heuristic and disable all others because\n",
560 |         "      # overwrite will merge the defaults from \"newscrawler.cfg\" with\n",
561 |         "      # this\n",
562 |         "      \"overwrite_heuristics\": {\n",
563 |         "        \"meta_contains_article_keyword\": true,\n",
564 |         "        \"og_type\": false,\n",
565 |         "        \"linked_headlines\": false,\n",
566 |         "        \"self_linked_headlines\": false\n",
567 |         "      },\n",
568 |         "      # Also state that in the condition, all heuristics used in the condition\n",
569 |         "      # have to be activated in \"overwrite_heuristics\" (or default) as well.\n",
570 |         "      \"pass_heuristics_condition\": \"meta_contains_article_keyword\"\n",
571 |         "    },\n",
572 |         "\t{\n",
573 |         "      # Start crawling from timesofindia\n",
574 |         "      \"url\": \"http://naidunia.com/\",\n",
575 |         "\n",
576 |         "      # Overwrite the default crawler and use th RecursiveCrawler instead\n",
577 |         "      \"crawler\": \"RecursiveCrawler\",\n",
578 |         "\n",
579 |         "      # Because this site is weirt, use the\n",
580 |         "      # meta_contains_article_keyword-heuristic and disable all others because\n",
581 |         "      # overwrite will merge the defaults from \"newscrawler.cfg\" with\n",
582 |         "      # this\n",
583 |         "      \"overwrite_heuristics\": {\n",
584 |         "        \"meta_contains_article_keyword\": true,\n",
585 |         "        \"og_type\": false,\n",
586 |         "        \"linked_headlines\": false,\n",
587 |         "        \"self_linked_headlines\": false\n",
588 |         "      },\n",
589 |         "      # Also state that in the condition, all heuristics used in the condition\n",
590 |         "      # have to be activated in \"overwrite_heuristics\" (or default) as well.\n",
591 |         "      \"pass_heuristics_condition\": \"meta_contains_article_keyword\"\n",
592 |         "    },\n",
593 |         "\t{\n",
594 |         "      # Start crawling from timesofindia\n",
595 |         "      \"url\": \"https://www.abplive.com/\",\n",
596 |         "\n",
597 |         "      # Overwrite the default crawler and use th RecursiveCrawler instead\n",
598 |         "      \"crawler\": \"RecursiveCrawler\",\n",
599 |         "\n",
600 |         "      # Because this site is weirt, use the\n",
601 |         "      # meta_contains_article_keyword-heuristic and disable all others because\n",
602 |         "      # overwrite will merge the defaults from \"newscrawler.cfg\" with\n",
603 |         "      # this\n",
604 |         "      \"overwrite_heuristics\": {\n",
605 |         "        \"meta_contains_article_keyword\": true,\n",
606 |         "        \"og_type\": false,\n",
607 |         "        \"linked_headlines\": false,\n",
608 |         "        \"self_linked_headlines\": false\n",
609 |         "      },\n",
610 |         "      # Also state that in the condition, all heuristics used in the condition\n",
611 |         "      # have to be activated in \"overwrite_heuristics\" (or default) as well.\n",
612 |         "      \"pass_heuristics_condition\": \"meta_contains_article_keyword\"\n",
613 |         "    },\n",
614 |         "\t{\n",
615 |         "      # Start crawling from timesofindia\n",
616 |         "      \"url\": \"https://hindi.indiatvnews.com/\",\n",
617 |         "\n",
618 |         "      # Overwrite the default crawler and use th RecursiveCrawler instead\n",
619 |         "      \"crawler\": \"RecursiveCrawler\",\n",
620 |         "\n",
621 |         "      # Because this site is weirt, use the\n",
622 |         "      # meta_contains_article_keyword-heuristic and disable all others because\n",
623 |         "      # overwrite will merge the defaults from \"newscrawler.cfg\" with\n",
624 |         "      # this\n",
625 |         "      \"overwrite_heuristics\": {\n",
626 |         "        \"meta_contains_article_keyword\": true,\n",
627 |         "        \"og_type\": false,\n",
628 |         "        \"linked_headlines\": false,\n",
629 |         "        \"self_linked_headlines\": false\n",
630 |         "      },\n",
631 |         "      # Also state that in the condition, all heuristics used in the condition\n",
632 |         "      # have to be activated in \"overwrite_heuristics\" (or default) as well.\n",
633 |         "      \"pass_heuristics_condition\": \"meta_contains_article_keyword\"\n",
634 |         "    }\n",
635 |         "\t]\n",
636 |         "}\n"
637 |       ],
638 |       "execution_count": 0,
639 |       "outputs": []
640 |     },
641 |     {
642 |       "cell_type": "markdown",
643 |       "metadata": {
644 |         "id": "9yEXZmnWoeV3",
645 |         "colab_type": "text"
646 |       },
647 |       "source": [
648 |         "##Run"
649 |       ]
650 |     },
651 |     {
652 |       "cell_type": "code",
653 |       "metadata": {
654 |         "id": "8glo9SaJxdax",
655 |         "colab_type": "code",
656 |         "colab": {}
657 |       },
658 |       "source": [
659 |         "!news-please"
660 |       ],
661 |       "execution_count": 0,
662 |       "outputs": []
663 |     },
664 |     {
665 |       "cell_type": "code",
666 |       "metadata": {
667 |         "id": "KAGw0ugDA0im",
668 |         "colab_type": "code",
669 |         "colab": {}
670 |       },
671 |       "source": [
672 |         ""
673 |       ],
674 |       "execution_count": 0,
675 |       "outputs": []
676 |     }
677 |   ]
678 | }


--------------------------------------------------------------------------------
/Hindi/2_process (local).py:
--------------------------------------------------------------------------------
 1 | #pip install langdetect
 2 | import json
 3 | import os
 4 | import pandas
 5 | from langdetect import detect
 6 | import pickle
 7 | 
 8 | from __future__ import print_function
 9 | import io
10 | import sys
11 | import re
12 | 
13 | class ProgressBar(object):
14 |     DEFAULT = 'Progress: %(bar)s %(percent)3d%%'
15 |     FULL = '%(bar)s %(current)d/%(total)d (%(percent)3d%%) %(remaining)d to go'
16 | 
17 |     def __init__(self, total, width=40, fmt=DEFAULT, symbol='=',
18 |                  output=sys.stderr):
19 |         assert len(symbol) == 1
20 | 
21 |         self.total = total
22 |         self.width = width
23 |         self.symbol = symbol
24 |         self.output = output
25 |         self.fmt = re.sub(r'(?P<name>%\(.+?\))d',
26 |             r'\g<name>%dd' % len(str(total)), fmt)
27 | 
28 |         self.current = 0
29 | 
30 |     def __call__(self):
31 |         percent = self.current / float(self.total)
32 |         size = int(self.width * percent)
33 |         remaining = self.total - self.current
34 |         bar = '[' + self.symbol * size + ' ' * (self.width - size) + ']'
35 | 
36 |         args = {
37 |             'total': self.total,
38 |             'bar': bar,
39 |             'current': self.current,
40 |             'percent': percent * 100,
41 |             'remaining': remaining
42 |         }
43 |         print('\r' + self.fmt % args, file=self.output, end='')
44 | 
45 |     def done(self):
46 |         self.current = self.total
47 |         self()
48 |         print('', file=self.output)
49 | 
50 | 
51 | 
52 | full_path = r"E:\Projects\Python\Hindi_News\data\data"
53 | dirs = os.listdir(full_path)
54 | resultdict = []
55 | 
56 | #with open('hindi.pkl', 'rb') as handle:
57 | #	b = pickle.load(handle)
58 | 
59 | for d in dirs:
60 | 	list_files = [file_json for file_json in os.listdir(full_path+"\\"+d) if file_json.endswith('.json')]
61 | 	if len(list_files)==0:
62 | 		continue
63 | 	progress = ProgressBar(len(list_files), fmt=ProgressBar.FULL)
64 | 	for fili in list_files:
65 | 		with open(os.path.join(full_path+"\\"+d, fili), encoding="utf-8") as inputjson:
66 | 			objj = json.load(inputjson) 
67 | 			row = {}
68 | 			row["title"] = objj["title"]
69 | 			row["text"] = objj["text"]
70 | 			try:
71 | 				if detect(objj["text"]) == "hi" and objj['text'] != "":
72 | 					resultdict.append( row )
73 | 			except:
74 | 				resultdict.append( row )
75 | 			progress.current += 1
76 | 			progress()
77 | 	progress.done()
78 | 
79 | with open('hindi.pkl', 'wb') as handle:
80 | 	pickle.dump(resultdict, handle, protocol=pickle.HIGHEST_PROTOCOL)
81 | 
82 | print("converting to dataframe ..")
83 | dataframe = pandas.DataFrame(resultdict)
84 | 
85 | print("saving ..")
86 | with open("HindiNewsBook.csv", "a" ,encoding='utf-8') as csvout:
87 | 	dataframe.to_csv(csvout, encoding='utf-8', index=False)
88 | 
89 | print("done el7")	
90 | 


--------------------------------------------------------------------------------
/Implementation A (seq2seq with attention and feature rich representation)/Model 2/Results/(model 2 with tf-idf and pos) result_featurerich_15_11_2018_5_28pm.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" ?>
  2 | <ZakSum bleu="9.675500840282693" rouge_1="0.38276723276723273" rouge_2="0.1434033744033744" rouge_L="0.36038339438339434" rouge_be="0.10304761904761905">
  3 |   <!--Generated by Amr Zaki-->
  4 |   <example>
  5 |     <article>five-time world champion michelle kwan withdrew from the # us figure skating championships on wednesday , but will petition us skating officials for the chance to compete at the # turin olympics #</article>
  6 |     <reference>injury leaves kwan 's olympic hopes in limbo</reference>
  7 |     <summary>nba olympic championship results</summary>
  8 |     <eval>
  9 |       <BLEU score="0.5255967942291402"/>
 10 |       <ROUGE_1 score="0.2"/>
 11 |       <ROUGE_2 score="0"/>
 12 |       <ROUGE_l score="0.2"/>
 13 |       <ROUGE_be score="0"/>
 14 |     </eval>
 15 |   </example>
 16 |   <example>
 17 |     <article>us business leaders lashed out wednesday at legislation that would penalize companies for employing illegal immigrants #</article>
 18 |     <reference>us business attacks tough immigration law</reference>
 19 |     <summary>business leaders lash out at tougher law on illegal immigrants</summary>
 20 |     <eval>
 21 |       <BLEU score="0.4463236137853328"/>
 22 |       <ROUGE_1 score="0.3333333333333333"/>
 23 |       <ROUGE_2 score="0"/>
 24 |       <ROUGE_l score="0.3333333333333333"/>
 25 |       <ROUGE_be score="0"/>
 26 |     </eval>
 27 |   </example>
 28 |   <example>
 29 |     <article>general motors corp# said wednesday its us sales fell # percent in december and four percent in # with the biggest losses coming from passenger car sales #</article>
 30 |     <reference>gm december sales fall # percent</reference>
 31 |     <summary>gm sales down # percent in december</summary>
 32 |     <eval>
 33 |       <BLEU score="2.777619034011791"/>
 34 |       <ROUGE_1 score="0.888888888888889"/>
 35 |       <ROUGE_2 score="0"/>
 36 |       <ROUGE_l score="0.6666666666666665"/>
 37 |       <ROUGE_be score="0"/>
 38 |     </eval>
 39 |   </example>
 40 |   <example>
 41 |     <article>several thousand people gathered on wednesday evening on the main square in zagreb for a public draw and an open air party to celebrate the croatian capital 's second chance to host the women 's slalom world cup #</article>
 42 |     <reference>thousands of croatians celebrate before world cup slalom</reference>
 43 |     <summary>croatia 's world cup ski results</summary>
 44 |     <eval>
 45 |       <BLEU score="1.9562170797817717"/>
 46 |       <ROUGE_1 score="0.3636363636363636"/>
 47 |       <ROUGE_2 score="0.22222222222222224"/>
 48 |       <ROUGE_l score="0.3636363636363636"/>
 49 |       <ROUGE_be score="0"/>
 50 |     </eval>
 51 |   </example>
 52 |   <example>
 53 |     <article>us first lady laura bush and us secretary of state condoleezza rice will represent the united states later this month at the inauguration of liberia 's president-elect ellen johnson sirleaf , the white house said wednesday #</article>
 54 |     <reference>laura bush &lt;unk&gt; rice to attend sirleaf 's inauguration in liberia</reference>
 55 |     <summary>bush cheney to visit liberia</summary>
 56 |     <eval>
 57 |       <BLEU score="0.253871431063062"/>
 58 |       <ROUGE_1 score="0.3333333333333333"/>
 59 |       <ROUGE_2 score="0"/>
 60 |       <ROUGE_l score="0.3333333333333333"/>
 61 |       <ROUGE_be score="0"/>
 62 |     </eval>
 63 |   </example>
 64 |   <example>
 65 |     <article>jack abramoff , a former lobbyist at the center of a mushrooming political scandal in washington , on wednesday pleaded guilty in a us court in miami to defrauding lenders in a florida gambling boat deal #</article>
 66 |     <reference>top republican lobbyist pleads guilty to florida fraud</reference>
 67 |     <summary>leading &lt; unk &gt; director pleads guilty in us court</summary>
 68 |     <eval>
 69 |       <BLEU score="1.4113991930789775"/>
 70 |       <ROUGE_1 score="0.30769230769230765"/>
 71 |       <ROUGE_2 score="0.1818181818181818"/>
 72 |       <ROUGE_l score="0.30769230769230765"/>
 73 |       <ROUGE_be score="0"/>
 74 |     </eval>
 75 |   </example>
 76 |   <example>
 77 |     <article>somalia 's feuding president and parliament speaker have agreed a compromise in a bitter row over the appropriate seat for their fledgling transitional government , a yemeni official said wednesday #</article>
 78 |     <reference>somalia rivals in compromise on seat of government</reference>
 79 |     <summary>new government to lead somali</summary>
 80 |     <eval>
 81 |       <BLEU score="0.5243581219839234"/>
 82 |       <ROUGE_1 score="0.25"/>
 83 |       <ROUGE_2 score="0"/>
 84 |       <ROUGE_l score="0.25"/>
 85 |       <ROUGE_be score="0"/>
 86 |     </eval>
 87 |   </example>
 88 |   <example>
 89 |     <article>a #-hour strike by airport workers in portugal planned for friday over job security issues could lead to flight cancellations , union and airport officials said #</article>
 90 |     <reference>portuguese airport workers strike could ground flights on friday</reference>
 91 |     <summary>portugal transport strike could lead flights</summary>
 92 |     <eval>
 93 |       <BLEU score="1.83255681299832"/>
 94 |       <ROUGE_1 score="0.3333333333333333"/>
 95 |       <ROUGE_2 score="0"/>
 96 |       <ROUGE_l score="0.3333333333333333"/>
 97 |       <ROUGE_be score="0"/>
 98 |     </eval>
 99 |   </example>
100 |   <example>
101 |     <article>jose mourinho renewed his partnership with portuguese international maniche on wednesday when he completed the loan signing of the #-year-old midfielder from dynamo moscow #</article>
102 |     <reference>maniche renews partnership with mourinho</reference>
103 |     <summary>afp americas news summary</summary>
104 |     <eval>
105 |       <BLEU score="0.3518629739981188"/>
106 |       <ROUGE_1 score="0"/>
107 |       <ROUGE_2 score="0"/>
108 |       <ROUGE_l score="0"/>
109 |       <ROUGE_be score="0"/>
110 |     </eval>
111 |   </example>
112 |   <example>
113 |     <article>hollywood is planning a new sequel to adventure flick `` ocean 's eleven , '' with star george clooney set to reprise his role as a charismatic thief in `` ocean 's thirteen , '' the entertainment press said wednesday #</article>
114 |     <reference>hollywood shores up support for ocean 's thirteen</reference>
115 |     <summary>hollywood to launch new &lt; unk &gt; one-man style</summary>
116 |     <eval>
117 |       <BLEU score="0.42643667978180183"/>
118 |       <ROUGE_1 score="0.20000000000000004"/>
119 |       <ROUGE_2 score="0"/>
120 |       <ROUGE_l score="0.20000000000000004"/>
121 |       <ROUGE_be score="0"/>
122 |     </eval>
123 |   </example>
124 |   <example>
125 |     <article>canada advised its nationals wednesday to avoid non-essential travel to nepal , following the resumption of a maoist rebel insurgency in the himalayan kingdom #</article>
126 |     <reference>canada recommends avoiding travel to nepal</reference>
127 |     <summary>canada advises nationals to avoid nepal travel</summary>
128 |     <eval>
129 |       <BLEU score="0.8307018474412793"/>
130 |       <ROUGE_1 score="0.5454545454545454"/>
131 |       <ROUGE_2 score="0"/>
132 |       <ROUGE_l score="0.3636363636363636"/>
133 |       <ROUGE_be score="0.3333333333333333"/>
134 |     </eval>
135 |   </example>
136 |   <example>
137 |     <article>us auto sales will likely be weaker in # , a senior executive at ford motor company said wednesday #</article>
138 |     <reference>ford executive sees weaker us auto sales in #</reference>
139 |     <summary>ford likely to be weaker in #</summary>
140 |     <eval>
141 |       <BLEU score="1.9740631366145514"/>
142 |       <ROUGE_1 score="0.5"/>
143 |       <ROUGE_2 score="0"/>
144 |       <ROUGE_l score="0.5"/>
145 |       <ROUGE_be score="0"/>
146 |     </eval>
147 |   </example>
148 |   <example>
149 |     <article>the united states on wednesday welcomed beijing 's early release of a chinese journalist jailed for exposing top-level graft but said it remain concerned over the lack of freedom in the world 's most populous nation #</article>
150 |     <reference>us welcomes chinese journalist 's release highlights freedom concerns</reference>
151 |     <summary>us welcomes early release of chinese journalist</summary>
152 |     <eval>
153 |       <BLEU score="2.4822529802838886"/>
154 |       <ROUGE_1 score="0.6666666666666666"/>
155 |       <ROUGE_2 score="0.2"/>
156 |       <ROUGE_l score="0.5"/>
157 |       <ROUGE_be score="0.28571428571428575"/>
158 |     </eval>
159 |   </example>
160 |   <example>
161 |     <article>two bank workers on wednesday admitted stealing almost #,# pounds -lrb- #,# euros -rrb- from the accounts of three former manchester city footballers #</article>
162 |     <reference>two bankers admit theft from city players</reference>
163 |     <summary>two bank workers admit murder of three man</summary>
164 |     <eval>
165 |       <BLEU score="0.5873949094699213"/>
166 |       <ROUGE_1 score="0.20000000000000004"/>
167 |       <ROUGE_2 score="0"/>
168 |       <ROUGE_l score="0.20000000000000004"/>
169 |       <ROUGE_be score="0"/>
170 |     </eval>
171 |   </example>
172 |   <example>
173 |     <article>israeli prime minister ariel sharon was admitted to hospital early for a planned heart procedure after feeling unwell wednesday evening , his office said #</article>
174 |     <reference>unwell sharon admitted to hospital early</reference>
175 |     <summary>&lt; unk &gt; sharon admitted to hospital</summary>
176 |     <eval>
177 |       <BLEU score="41.11336169005198"/>
178 |       <ROUGE_1 score="0.6666666666666665"/>
179 |       <ROUGE_2 score="0.5714285714285715"/>
180 |       <ROUGE_l score="0.6666666666666665"/>
181 |       <ROUGE_be score="0.6666666666666666"/>
182 |     </eval>
183 |   </example>
184 |   <example>
185 |     <article>sales of hybrid &lt;unk&gt; vehicles in the united states should more than triple over the next seven years , a leading research firm said wednesday #</article>
186 |     <reference>us hybrid vehicle sales expect to more than triple in seven years</reference>
187 |     <summary>us hopes of hybrid sales in should more than double</summary>
188 |     <eval>
189 |       <BLEU score="1.5207971224097845"/>
190 |       <ROUGE_1 score="0.4"/>
191 |       <ROUGE_2 score="0"/>
192 |       <ROUGE_l score="0.4"/>
193 |       <ROUGE_be score="0"/>
194 |     </eval>
195 |   </example>
196 |   <example>
197 |     <article>the los angeles dodgers acquired south korean right-hander jae seo from the new york mets on wednesday in a four-player swap #</article>
198 |     <reference>korea 's seo headed to dodgers from mets</reference>
199 |     <summary>south american 's &lt; unk &gt; gets world cup deal</summary>
200 |     <eval>
201 |       <BLEU score="0.37531192687516973"/>
202 |       <ROUGE_1 score="0"/>
203 |       <ROUGE_2 score="0"/>
204 |       <ROUGE_l score="0"/>
205 |       <ROUGE_be score="0"/>
206 |     </eval>
207 |   </example>
208 |   <example>
209 |     <article>teenage hollywood starlet lindsay lohan , who was rushed to hospital this week after suffering an asthma attack , has admitted fighting a fierce battle with the eating disorder bulimia #</article>
210 |     <reference>hollywood starlet lindsay lohan admits bulimia battle</reference>
211 |     <summary>australian teen wages revolt in suffers fierce battle</summary>
212 |     <eval>
213 |       <BLEU score="0.4939382737115371"/>
214 |       <ROUGE_1 score="0.14285714285714285"/>
215 |       <ROUGE_2 score="0"/>
216 |       <ROUGE_l score="0.14285714285714285"/>
217 |       <ROUGE_be score="0"/>
218 |     </eval>
219 |   </example>
220 |   <example>
221 |     <article>the white house vigorously rejected wednesday suggestions that us soldiers in iraq do not hesitate to fire on civilians , after a bombing killed eight iraqi non-combatants , including two children #</article>
222 |     <reference>us insists soldiers act with restraint to protect civilians</reference>
223 |     <summary>us does not oppose use of iraqi civilians</summary>
224 |     <eval>
225 |       <BLEU score="0.5183741882011644"/>
226 |       <ROUGE_1 score="0.2222222222222222"/>
227 |       <ROUGE_2 score="0"/>
228 |       <ROUGE_l score="0.2222222222222222"/>
229 |       <ROUGE_be score="0"/>
230 |     </eval>
231 |   </example>
232 |   <example>
233 |     <article>the us special envoy to multilateral talks aimed at ending north korea 's nuclear weapons drive has quit , the state department said wednesday amid reported divisions within the administration over the nuclear issue #</article>
234 |     <reference>us special envoy to korean nuclear talks quits</reference>
235 |     <summary>us envoy says quits north korea</summary>
236 |     <eval>
237 |       <BLEU score="0.6846046760764756"/>
238 |       <ROUGE_1 score="0.4"/>
239 |       <ROUGE_2 score="0"/>
240 |       <ROUGE_l score="0.4"/>
241 |       <ROUGE_be score="0"/>
242 |     </eval>
243 |   </example>
244 |   <example>
245 |     <article>at least two people have tested positive for the bird flu virus in eastern turkey , health minister recep akdag told a news conference wednesday #</article>
246 |     <reference>two test positive for bird flu virus in turkey</reference>
247 |     <summary>at least two tests positive for bird flu virus</summary>
248 |     <eval>
249 |       <BLEU score="46.713797772819994"/>
250 |       <ROUGE_1 score="0.7272727272727272"/>
251 |       <ROUGE_2 score="0.6666666666666665"/>
252 |       <ROUGE_l score="0.7272727272727272"/>
253 |       <ROUGE_be score="0"/>
254 |     </eval>
255 |   </example>
256 |   <example>
257 |     <article>britain 's un envoy on wednesday urged stronger international support , including greater eu funding , for the african union -lrb- au -rrb- peacekeeping mission in sudan 's troubled darfur region to improve security on the ground #</article>
258 |     <reference>britain urges stronger international support for au in darfur</reference>
259 |     <summary>britain urges support for african peacekeeping mission in darfur</summary>
260 |     <eval>
261 |       <BLEU score="2.7776190340117917"/>
262 |       <ROUGE_1 score="0.5714285714285714"/>
263 |       <ROUGE_2 score="0.16666666666666666"/>
264 |       <ROUGE_l score="0.5714285714285714"/>
265 |       <ROUGE_be score="0"/>
266 |     </eval>
267 |   </example>
268 |   <example>
269 |     <article>australian foreign minister alexander downer called wednesday for the reform of the un security council and expressed support for brazil , india , japan and an african country to join the council #</article>
270 |     <reference>australia backs brazil others for un security council</reference>
271 |     <summary>australia fm calls for reform of un security council</summary>
272 |     <eval>
273 |       <BLEU score="7.583235673413396"/>
274 |       <ROUGE_1 score="0.5454545454545454"/>
275 |       <ROUGE_2 score="0.22222222222222224"/>
276 |       <ROUGE_l score="0.5454545454545454"/>
277 |       <ROUGE_be score="0"/>
278 |     </eval>
279 |   </example>
280 |   <example>
281 |     <article>two egyptian border guards were killed wednesday in clashes with palestinian militants near the rafah crossing on the border with gaza , a medical source said #</article>
282 |     <reference>two egyptian guards killed on border with gaza</reference>
283 |     <summary>two border guards killed in clashes with palestinians near gaza</summary>
284 |     <eval>
285 |       <BLEU score="1.85750579991336"/>
286 |       <ROUGE_1 score="0.7272727272727272"/>
287 |       <ROUGE_2 score="0.22222222222222224"/>
288 |       <ROUGE_l score="0.5454545454545454"/>
289 |       <ROUGE_be score="0"/>
290 |     </eval>
291 |   </example>
292 |   <example>
293 |     <article>stephen harper and his conservative party pulled ahead of prime minister paul martin 's liberals wednesday , weeks before canada 's federal election on january # , according to the latest poll #</article>
294 |     <reference>conservatives gain momentum ahead of ruling liberals</reference>
295 |     <summary>michael ignatieff ahead of canadian election</summary>
296 |     <eval>
297 |       <BLEU score="2.310997417025822"/>
298 |       <ROUGE_1 score="0.1818181818181818"/>
299 |       <ROUGE_2 score="0"/>
300 |       <ROUGE_l score="0.1818181818181818"/>
301 |       <ROUGE_be score="0"/>
302 |     </eval>
303 |   </example>
304 |   <example>
305 |     <article>dutch bank abn amro was hit wednesday with its second us fine in two months after settling a government probe into its mortgage lending in the industrial state of michigan #</article>
306 |     <reference>abn amro hit with second us fine</reference>
307 |     <summary>dutch bank account in us hit with second fine two years</summary>
308 |     <eval>
309 |       <BLEU score="5.9609942732680965"/>
310 |       <ROUGE_1 score="0.4"/>
311 |       <ROUGE_2 score="0.25"/>
312 |       <ROUGE_l score="0.4"/>
313 |       <ROUGE_be score="0"/>
314 |     </eval>
315 |   </example>
316 |   <example>
317 |     <article>the buffalo bills sacked tom donahoe as president and general manager on wednesday , fulfilling expectations of a shake-up after another failure to make the national football league playoffs #</article>
318 |     <reference>nfl 's bills shake up front office</reference>
319 |     <summary>new &lt; unk &gt; in general manager</summary>
320 |     <eval>
321 |       <BLEU score="0.18575057999133598"/>
322 |       <ROUGE_1 score="0"/>
323 |       <ROUGE_2 score="0"/>
324 |       <ROUGE_l score="0"/>
325 |       <ROUGE_be score="0"/>
326 |     </eval>
327 |   </example>
328 |   <example>
329 |     <article>tottenham sealed a miserable festive period for a toothless manchester city here wednesday with a #-# win thanks to goals from their confident striking duo of ahmed mido and robbie keane #</article>
330 |     <reference>spurs compound manchester city woes</reference>
331 |     <summary>spurs swoop on say keegan</summary>
332 |     <eval>
333 |       <BLEU score="0.9554427922043668"/>
334 |       <ROUGE_1 score="0.25"/>
335 |       <ROUGE_2 score="0"/>
336 |       <ROUGE_l score="0.25"/>
337 |       <ROUGE_be score="0"/>
338 |     </eval>
339 |   </example>
340 |   <example>
341 |     <article>the dollar fell to a two-month low against the euro wednesday on expectations that us interest rates will soon stop going up , depriving the greenback of its biggest investment appeal #</article>
342 |     <reference>us rate outlook sends dollar to two-month low against euro</reference>
343 |     <summary>dollar drops to two-month low</summary>
344 |     <eval>
345 |       <BLEU score="5.911295955652279"/>
346 |       <ROUGE_1 score="0.5454545454545454"/>
347 |       <ROUGE_2 score="0.2222222222222222"/>
348 |       <ROUGE_l score="0.5454545454545454"/>
349 |       <ROUGE_be score="0"/>
350 |     </eval>
351 |   </example>
352 |   <example>
353 |     <article>two top us movie groups picked their # awards nominees wednesday , with gay drama `` brokeback mountain , '' george clooney 's `` good night and good '' luck and `` crash '' taking early leads in the oscars race #</article>
354 |     <reference>top us movie groups pick awards nominees as oscars loom</reference>
355 |     <summary>us film groups among # oscars nominees</summary>
356 |     <eval>
357 |       <BLEU score="0.5411516285864537"/>
358 |       <ROUGE_1 score="0.5"/>
359 |       <ROUGE_2 score="0"/>
360 |       <ROUGE_l score="0.3333333333333333"/>
361 |       <ROUGE_be score="0"/>
362 |     </eval>
363 |   </example>
364 |   <example>
365 |     <article>right fielder jeromy burnitz inked a one-year , # million-dollar contract with the pittsburgh pirates on wednesday , a week after he was reported as close to a deal with baltimore #</article>
366 |     <reference>pirates ink outfielder burnitz to one-year deal</reference>
367 |     <summary>&lt; unk &gt; signs deal with pirates</summary>
368 |     <eval>
369 |       <BLEU score="0.6985342056580097"/>
370 |       <ROUGE_1 score="0.4"/>
371 |       <ROUGE_2 score="0"/>
372 |       <ROUGE_l score="0.2"/>
373 |       <ROUGE_be score="0"/>
374 |     </eval>
375 |   </example>
376 |   <example>
377 |     <article>us president george w# bush said late wednesday that he and the first lady shared `` the concerns of the israeli people about prime minister ariel sharon 's health '' and were praying for his recovery #</article>
378 |     <reference>bush says he shares israelis concern over sharon</reference>
379 |     <summary>bush sees sharon 's illness</summary>
380 |     <eval>
381 |       <BLEU score="0.6235704094727464"/>
382 |       <ROUGE_1 score="0.4444444444444445"/>
383 |       <ROUGE_2 score="0"/>
384 |       <ROUGE_l score="0.4444444444444445"/>
385 |       <ROUGE_be score="0"/>
386 |     </eval>
387 |   </example>
388 |   <example>
389 |     <article>the prime minister of burkina faso , &lt;unk&gt; ernest yonli , has handed in his resignation to president blaise compaore , the president 's office announced wednesday #</article>
390 |     <reference>burkina faso 's prime minister resigns</reference>
391 |     <summary>burkina faso pm resigns over president 's resignation</summary>
392 |     <eval>
393 |       <BLEU score="2.208959113415788"/>
394 |       <ROUGE_1 score="0.5454545454545454"/>
395 |       <ROUGE_2 score="0.22222222222222224"/>
396 |       <ROUGE_l score="0.5454545454545454"/>
397 |       <ROUGE_be score="0"/>
398 |     </eval>
399 |   </example>
400 |   <example>
401 |     <article>a top hollywood event promoter is seeking a court order to stop socialite and hotel heiress paris hilton from allegedly harassing him and threatening his life , court documents showed wednesday #</article>
402 |     <reference>man seeks stay-away order against paris hilton</reference>
403 |     <summary>top hollywood judge seeks to cancel paris hilton</summary>
404 |     <eval>
405 |       <BLEU score="2.0556680845025985"/>
406 |       <ROUGE_1 score="0.4615384615384615"/>
407 |       <ROUGE_2 score="0.1818181818181818"/>
408 |       <ROUGE_l score="0.4615384615384615"/>
409 |       <ROUGE_be score="0"/>
410 |     </eval>
411 |   </example>
412 |   <example>
413 |     <article>the un security council will hold a ministerial session on the situation in africa 's great lakes region late this month , &lt;unk&gt; 's un envoy augustine &lt;unk&gt; said wednesday #</article>
414 |     <reference>security council to hold ministerial session on africa 's great lakes</reference>
415 |     <summary>un to hold ministerial session on africa 's great lakes region</summary>
416 |     <eval>
417 |       <BLEU score="78.60753021519781"/>
418 |       <ROUGE_1 score="0.7999999999999999"/>
419 |       <ROUGE_2 score="0.7692307692307692"/>
420 |       <ROUGE_l score="0.7999999999999999"/>
421 |       <ROUGE_be score="0.8"/>
422 |     </eval>
423 |   </example>
424 |   <example>
425 |     <article>political leaders in israel united in prayers for ariel sharon on thursday as the prime minister underwent emergency surgery after suffering a massive stroke #</article>
426 |     <reference>israeli leaders unite in prayer for ailing sharon</reference>
427 |     <summary>new israeli political leaders in mourning for sharon undergoes surgery</summary>
428 |     <eval>
429 |       <BLEU score="0.5612222324305729"/>
430 |       <ROUGE_1 score="0.4615384615384615"/>
431 |       <ROUGE_2 score="0"/>
432 |       <ROUGE_l score="0.4615384615384615"/>
433 |       <ROUGE_be score="0"/>
434 |     </eval>
435 |   </example>
436 |   <example>
437 |     <article>israeli prime minister ariel sharon was undergoing an emergency operation thursday after suffering a massive stroke #</article>
438 |     <reference>timeline of sharon era</reference>
439 |     <summary>israeli pm undergoes civilian operation</summary>
440 |     <eval>
441 |       <BLEU score="0.3021375397356768"/>
442 |       <ROUGE_1 score="0"/>
443 |       <ROUGE_2 score="0"/>
444 |       <ROUGE_l score="0"/>
445 |       <ROUGE_be score="0"/>
446 |     </eval>
447 |   </example>
448 |   <example>
449 |     <article>human trafficking victims could be spared deportation from britain and get automatic permission to stay under government proposals revealed thursday #</article>
450 |     <reference>human trafficking victims could get right to remain in britain</reference>
451 |     <summary>human trafficking victims could be spares</summary>
452 |     <eval>
453 |       <BLEU score="26.088405164365604"/>
454 |       <ROUGE_1 score="0.6666666666666665"/>
455 |       <ROUGE_2 score="0.5714285714285715"/>
456 |       <ROUGE_l score="0.6666666666666665"/>
457 |       <ROUGE_be score="0"/>
458 |     </eval>
459 |   </example>
460 |   <example>
461 |     <article>world number two rafael nadal is in doubt for this month 's australian open after withdrawing on thursday from next week 's lead-up sydney international tournament #</article>
462 |     <reference>nadal pulls out of sydney international</reference>
463 |     <summary>world number two for australian open</summary>
464 |     <eval>
465 |       <BLEU score="0.2295748846661433"/>
466 |       <ROUGE_1 score="0"/>
467 |       <ROUGE_2 score="0"/>
468 |       <ROUGE_l score="0"/>
469 |       <ROUGE_be score="0"/>
470 |     </eval>
471 |   </example>
472 |   <example>
473 |     <article>four-time all-star alexander mogilny was waived by new jersey on wednesday , one day after the devils welcomed back czech forward patrik elias #</article>
474 |     <reference>mogilny odd man out as devils welcome back elias</reference>
475 |     <summary>&lt; unk &gt; gets last jersey</summary>
476 |     <eval>
477 |       <BLEU score="0.1392442062500076"/>
478 |       <ROUGE_1 score="0"/>
479 |       <ROUGE_2 score="0"/>
480 |       <ROUGE_l score="0"/>
481 |       <ROUGE_be score="0"/>
482 |     </eval>
483 |   </example>
484 |   <example>
485 |     <article>finance minister ehud olmert will chair an emergency meeting of the israeli cabinet on thursday after prime minister ariel sharon suffered a massive stroke , public television reported #</article>
486 |     <reference>olmert to chair emergency israel cabinet meet</reference>
487 |     <summary>israeli finance minister to meet thursday</summary>
488 |     <eval>
489 |       <BLEU score="0.7308015504567585"/>
490 |       <ROUGE_1 score="0.1818181818181818"/>
491 |       <ROUGE_2 score="0"/>
492 |       <ROUGE_l score="0.1818181818181818"/>
493 |       <ROUGE_be score="0"/>
494 |     </eval>
495 |   </example>
496 |   <example>
497 |     <article>hollywood actor nick nolte has successfully completed three years of probation imposed for driving under the influence of drugs , a judge ruled on wednesday #</article>
498 |     <reference>us actor nick nolte ends &lt;unk&gt;</reference>
499 |     <summary>actor ashton has four years probation</summary>
500 |     <eval>
501 |       <BLEU score="0.5201870634468553"/>
502 |       <ROUGE_1 score="0.22222222222222224"/>
503 |       <ROUGE_2 score="0"/>
504 |       <ROUGE_l score="0.22222222222222224"/>
505 |       <ROUGE_be score="0"/>
506 |     </eval>
507 |   </example>
508 |   <example>
509 |     <article>south korea 's consumer confidence topped the benchmark #-point level for the first time in eight months in december , reflecting the country 's solid economic recovery , official data showed thursday #</article>
510 |     <reference>skorea 's consumer confidence tops benchmark #</reference>
511 |     <summary>south korea 's confidence dominates</summary>
512 |     <eval>
513 |       <BLEU score="0.7616306180370791"/>
514 |       <ROUGE_1 score="0.22222222222222224"/>
515 |       <ROUGE_2 score="0"/>
516 |       <ROUGE_l score="0.22222222222222224"/>
517 |       <ROUGE_be score="0"/>
518 |     </eval>
519 |   </example>
520 |   <example>
521 |     <article>hollywood star charlie sheen and actress denise richards have scrapped efforts to reconcile their differences and have decided to push ahead with a divorce , court documents showed wednesday #</article>
522 |     <reference>denise richards charlie sheen push ahead with divorce</reference>
523 |     <summary>&lt; unk &gt; conductor richards complete reconciliation efforts</summary>
524 |     <eval>
525 |       <BLEU score="0.4939382737115371"/>
526 |       <ROUGE_1 score="0.15384615384615383"/>
527 |       <ROUGE_2 score="0"/>
528 |       <ROUGE_l score="0.15384615384615383"/>
529 |       <ROUGE_be score="0"/>
530 |     </eval>
531 |   </example>
532 |   <example>
533 |     <article>hong kong gold prices opened higher thursday at #-# # us dollars an ounce , compared to wednesday 's close of #-# # dollars an ounce #</article>
534 |     <reference>hong kong gold opens higher</reference>
535 |     <summary>hong kong gold opens higher</summary>
536 |     <eval>
537 |       <BLEU score="100.00000000000004"/>
538 |       <ROUGE_1 score="1.0"/>
539 |       <ROUGE_2 score="1.0"/>
540 |       <ROUGE_l score="1.0"/>
541 |       <ROUGE_be score="1.0"/>
542 |     </eval>
543 |   </example>
544 |   <example>
545 |     <article>japanese share prices rose # percent in morning trade thursday to hit the highest level in more than five years as fresh gains on wall street fanned upbeat investor sentiment here , dealers said #</article>
546 |     <reference>tokyo shares rise # percent in morning trade</reference>
547 |     <summary>tokyo shares rise # percent in morning trade</summary>
548 |     <eval>
549 |       <BLEU score="100.00000000000004"/>
550 |       <ROUGE_1 score="1.0"/>
551 |       <ROUGE_2 score="1.0"/>
552 |       <ROUGE_l score="1.0"/>
553 |       <ROUGE_be score="1.0"/>
554 |     </eval>
555 |   </example>
556 |   <example>
557 |     <article>a top police officer appealed thursday to supermodel kate moss to return to britain and face arrest amid allegations of &lt;unk&gt; #</article>
558 |     <reference>british police seek to arrest moss amid cocaine inquiry</reference>
559 |     <summary>police arrest supermodel kate moss to britain</summary>
560 |     <eval>
561 |       <BLEU score="0.6242535756678117"/>
562 |       <ROUGE_1 score="0.4615384615384615"/>
563 |       <ROUGE_2 score="0"/>
564 |       <ROUGE_l score="0.4615384615384615"/>
565 |       <ROUGE_be score="0.4"/>
566 |     </eval>
567 |   </example>
568 |   <example>
569 |     <article>the cerebral hemorrhage suffered by israeli prime minister ariel sharon occurs when a defective artery in the brain bursts : it is among the deadliest of the major types of stroke #</article>
570 |     <reference>key facts about hemorrhagic stroke</reference>
571 |     <summary>&lt; unk &gt; dini voted out of electric decline</summary>
572 |     <eval>
573 |       <BLEU score="0.13485111859503685"/>
574 |       <ROUGE_1 score="0"/>
575 |       <ROUGE_2 score="0"/>
576 |       <ROUGE_l score="0"/>
577 |       <ROUGE_be score="0"/>
578 |     </eval>
579 |   </example>
580 |   <example>
581 |     <article>hong kong share prices opened # percent higher thursday on follow-through interest in properties after wednesday 's sharp gains on abating interest rate worries , dealers said #</article>
582 |     <reference>hong kong shares open higher as rate worries ease</reference>
583 |     <summary>hong kong shares open # percent higher</summary>
584 |     <eval>
585 |       <BLEU score="32.66828640925501"/>
586 |       <ROUGE_1 score="0.7142857142857143"/>
587 |       <ROUGE_2 score="0.5"/>
588 |       <ROUGE_l score="0.7142857142857143"/>
589 |       <ROUGE_be score="0.6666666666666666"/>
590 |     </eval>
591 |   </example>
592 |   <example>
593 |     <article>trade between south korea and its communist neighbor north korea doubled last year , totalling more than one billion dollars for the first time , official data showed #</article>
594 |     <reference>inter-korean trade doubles to one billion dollars in #</reference>
595 |     <summary>afp sports schedule for monday march #</summary>
596 |     <eval>
597 |       <BLEU score="0.4414139365346591"/>
598 |       <ROUGE_1 score="0"/>
599 |       <ROUGE_2 score="0"/>
600 |       <ROUGE_l score="0"/>
601 |       <ROUGE_be score="0"/>
602 |     </eval>
603 |   </example>
604 | </ZakSum>
605 | 
606 | last training
607 | -------------
608 | step 47000: loss = 8.033841133117676
609 | step 48000: loss = 9.481734275817871
610 | step 49000: loss = 7.188093662261963
611 | step 50000: loss = 14.354914665222168
612 |  Epoch 16: Model is saved. Elapsed: 02:19:23.39 


--------------------------------------------------------------------------------
/Implementation A (seq2seq with attention and feature rich representation)/Model 2/Results/(model 2)result_valid_29_10_2018_5_28pm.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" ?>
  2 | <ZakSum bleu="8.255320332262427" rouge_1="0.3242562992562992" rouge_2="0.12368686868686868" rouge_L="0.3062702852702852" rouge_be="0.06999999999999999">
  3 |   <!--Generated by Amr Zaki-->
  4 |   <example>
  5 |     <article>five-time world champion michelle kwan withdrew from the # us figure skating championships on wednesday , but will petition us skating officials for the chance to compete at the # turin olympics #</article>
  6 |     <reference>injury leaves kwan 's olympic hopes in limbo</reference>
  7 |     <summary>world figure skating championships results</summary>
  8 |     <eval>
  9 |       <BLEU score="0.1658165975077607"/>
 10 |       <ROUGE_1 score="0"/>
 11 |       <ROUGE_2 score="0"/>
 12 |       <ROUGE_l score="0"/>
 13 |       <ROUGE_be score="0"/>
 14 |     </eval>
 15 |   </example>
 16 |   <example>
 17 |     <article>us business leaders lashed out wednesday at legislation that would penalize companies for employing illegal immigrants #</article>
 18 |     <reference>us business attacks tough immigration law</reference>
 19 |     <summary>pentagon leaders condemn illegal use of immigrants</summary>
 20 |     <eval>
 21 |       <BLEU score="0.18575057999133598"/>
 22 |       <ROUGE_1 score="0"/>
 23 |       <ROUGE_2 score="0"/>
 24 |       <ROUGE_l score="0"/>
 25 |       <ROUGE_be score="0"/>
 26 |     </eval>
 27 |   </example>
 28 |   <example>
 29 |     <article>general motors corp# said wednesday its us sales fell # percent in december and four percent in # with the biggest losses coming from passenger car sales #</article>
 30 |     <reference>gm december sales fall # percent</reference>
 31 |     <summary>gm sales down # percent in second quarter</summary>
 32 |     <eval>
 33 |       <BLEU score="2.208959113415788"/>
 34 |       <ROUGE_1 score="0.6666666666666665"/>
 35 |       <ROUGE_2 score="0"/>
 36 |       <ROUGE_l score="0.6666666666666665"/>
 37 |       <ROUGE_be score="0"/>
 38 |     </eval>
 39 |   </example>
 40 |   <example>
 41 |     <article>several thousand people gathered on wednesday evening on the main square in zagreb for a public draw and an open air party to celebrate the croatian capital 's second chance to host the women 's slalom world cup #</article>
 42 |     <reference>thousands of croatians celebrate before world cup slalom</reference>
 43 |     <summary>zagreb prepare for world cup final</summary>
 44 |     <eval>
 45 |       <BLEU score="1.9562170797817717"/>
 46 |       <ROUGE_1 score="0.3636363636363636"/>
 47 |       <ROUGE_2 score="0.22222222222222224"/>
 48 |       <ROUGE_l score="0.3636363636363636"/>
 49 |       <ROUGE_be score="0"/>
 50 |     </eval>
 51 |   </example>
 52 |   <example>
 53 |     <article>us first lady laura bush and us secretary of state condoleezza rice will represent the united states later this month at the inauguration of liberia 's president-elect ellen johnson sirleaf , the white house said wednesday #</article>
 54 |     <reference>laura bush &lt;unk&gt; rice to attend sirleaf 's inauguration in liberia</reference>
 55 |     <summary>bush to attend liberia 's presidential inauguration</summary>
 56 |     <eval>
 57 |       <BLEU score="1.2337170820562469"/>
 58 |       <ROUGE_1 score="0.6153846153846154"/>
 59 |       <ROUGE_2 score="0"/>
 60 |       <ROUGE_l score="0.4615384615384615"/>
 61 |       <ROUGE_be score="0.5"/>
 62 |     </eval>
 63 |   </example>
 64 |   <example>
 65 |     <article>jack abramoff , a former lobbyist at the center of a mushrooming political scandal in washington , on wednesday pleaded guilty in a us court in miami to defrauding lenders in a florida gambling boat deal #</article>
 66 |     <reference>top republican lobbyist pleads guilty to florida fraud</reference>
 67 |     <summary>us egg lawmaker pleads guilty to financial fraud</summary>
 68 |     <eval>
 69 |       <BLEU score="8.307018474412793"/>
 70 |       <ROUGE_1 score="0.4615384615384615"/>
 71 |       <ROUGE_2 score="0.1818181818181818"/>
 72 |       <ROUGE_l score="0.4615384615384615"/>
 73 |       <ROUGE_be score="0"/>
 74 |     </eval>
 75 |   </example>
 76 |   <example>
 77 |     <article>somalia 's feuding president and parliament speaker have agreed a compromise in a bitter row over the appropriate seat for their fledgling transitional government , a yemeni official said wednesday #</article>
 78 |     <reference>somalia rivals in compromise on seat of government</reference>
 79 |     <summary>somali leaders agree compromise over cabinet</summary>
 80 |     <eval>
 81 |       <BLEU score="0.5201870634468553"/>
 82 |       <ROUGE_1 score="0.20000000000000004"/>
 83 |       <ROUGE_2 score="0"/>
 84 |       <ROUGE_l score="0.20000000000000004"/>
 85 |       <ROUGE_be score="0"/>
 86 |     </eval>
 87 |   </example>
 88 |   <example>
 89 |     <article>a #-hour strike by airport workers in portugal planned for friday over job security issues could lead to flight cancellations , union and airport officials said #</article>
 90 |     <reference>portuguese airport workers strike could ground flights on friday</reference>
 91 |     <summary>portuguese airport workers walk for air travel strike</summary>
 92 |     <eval>
 93 |       <BLEU score="7.330918073382302"/>
 94 |       <ROUGE_1 score="0.5714285714285714"/>
 95 |       <ROUGE_2 score="0.3333333333333333"/>
 96 |       <ROUGE_l score="0.5714285714285714"/>
 97 |       <ROUGE_be score="0"/>
 98 |     </eval>
 99 |   </example>
100 |   <example>
101 |     <article>jose mourinho renewed his partnership with portuguese international maniche on wednesday when he completed the loan signing of the #-year-old midfielder from dynamo moscow #</article>
102 |     <reference>maniche renews partnership with mourinho</reference>
103 |     <summary>mourinho signs for united loan</summary>
104 |     <eval>
105 |       <BLEU score="0.9554427922043668"/>
106 |       <ROUGE_1 score="0.25"/>
107 |       <ROUGE_2 score="0"/>
108 |       <ROUGE_l score="0.25"/>
109 |       <ROUGE_be score="0"/>
110 |     </eval>
111 |   </example>
112 |   <example>
113 |     <article>hollywood is planning a new sequel to adventure flick `` ocean 's eleven , '' with star george clooney set to reprise his role as a charismatic thief in `` ocean 's thirteen , '' the entertainment press said wednesday #</article>
114 |     <reference>hollywood shores up support for ocean 's thirteen</reference>
115 |     <summary>hollywood make new sequel to oscars</summary>
116 |     <eval>
117 |       <BLEU score="0.5201870634468553"/>
118 |       <ROUGE_1 score="0.22222222222222224"/>
119 |       <ROUGE_2 score="0"/>
120 |       <ROUGE_l score="0.22222222222222224"/>
121 |       <ROUGE_be score="0"/>
122 |     </eval>
123 |   </example>
124 |   <example>
125 |     <article>canada advised its nationals wednesday to avoid non-essential travel to nepal , following the resumption of a maoist rebel insurgency in the himalayan kingdom #</article>
126 |     <reference>canada recommends avoiding travel to nepal</reference>
127 |     <summary>canada advises nationals to avoid nepal</summary>
128 |     <eval>
129 |       <BLEU score="0.9554427922043669"/>
130 |       <ROUGE_1 score="0.4000000000000001"/>
131 |       <ROUGE_2 score="0"/>
132 |       <ROUGE_l score="0.4000000000000001"/>
133 |       <ROUGE_be score="0"/>
134 |     </eval>
135 |   </example>
136 |   <example>
137 |     <article>us auto sales will likely be weaker in # , a senior executive at ford motor company said wednesday #</article>
138 |     <reference>ford executive sees weaker us auto sales in #</reference>
139 |     <summary>ford us auto sector seen weaker next year</summary>
140 |     <eval>
141 |       <BLEU score="1.949399575525447"/>
142 |       <ROUGE_1 score="0.5454545454545454"/>
143 |       <ROUGE_2 score="0"/>
144 |       <ROUGE_l score="0.3636363636363636"/>
145 |       <ROUGE_be score="0"/>
146 |     </eval>
147 |   </example>
148 |   <example>
149 |     <article>the united states on wednesday welcomed beijing 's early release of a chinese journalist jailed for exposing top-level graft but said it remain concerned over the lack of freedom in the world 's most populous nation #</article>
150 |     <reference>us welcomes chinese journalist 's release highlights freedom concerns</reference>
151 |     <summary>us welcomes release of chinese reporter</summary>
152 |     <eval>
153 |       <BLEU score="1.9692104496063725"/>
154 |       <ROUGE_1 score="0.5454545454545454"/>
155 |       <ROUGE_2 score="0"/>
156 |       <ROUGE_l score="0.36363636363636365"/>
157 |       <ROUGE_be score="0.3333333333333333"/>
158 |     </eval>
159 |   </example>
160 |   <example>
161 |     <article>two bank workers on wednesday admitted stealing almost #,# pounds -lrb- #,# euros -rrb- from the accounts of three former manchester city footballers #</article>
162 |     <reference>two bankers admit theft from city players</reference>
163 |     <summary>two arrested city players admits scam</summary>
164 |     <eval>
165 |       <BLEU score="2.5575390578966206"/>
166 |       <ROUGE_1 score="0.4000000000000001"/>
167 |       <ROUGE_2 score="0.25"/>
168 |       <ROUGE_l score="0.4000000000000001"/>
169 |       <ROUGE_be score="0"/>
170 |     </eval>
171 |   </example>
172 |   <example>
173 |     <article>israeli prime minister ariel sharon was admitted to hospital early for a planned heart procedure after feeling unwell wednesday evening , his office said #</article>
174 |     <reference>unwell sharon admitted to hospital early</reference>
175 |     <summary>hurt israeli pm sharon en route to hospital</summary>
176 |     <eval>
177 |       <BLEU score="2.0556680845025985"/>
178 |       <ROUGE_1 score="0.3333333333333333"/>
179 |       <ROUGE_2 score="0"/>
180 |       <ROUGE_l score="0.3333333333333333"/>
181 |       <ROUGE_be score="0"/>
182 |     </eval>
183 |   </example>
184 |   <example>
185 |     <article>sales of hybrid &lt;unk&gt; vehicles in the united states should more than triple over the next seven years , a leading research firm said wednesday #</article>
186 |     <reference>us hybrid vehicle sales expect to more than triple in seven years</reference>
187 |     <summary>us companies favor triple sales in</summary>
188 |     <eval>
189 |       <BLEU score="0.37769817880229023"/>
190 |       <ROUGE_1 score="0.4"/>
191 |       <ROUGE_2 score="0"/>
192 |       <ROUGE_l score="0.2"/>
193 |       <ROUGE_be score="0"/>
194 |     </eval>
195 |   </example>
196 |   <example>
197 |     <article>the los angeles dodgers acquired south korean right-hander jae seo from the new york mets on wednesday in a four-player swap #</article>
198 |     <reference>korea 's seo headed to dodgers from mets</reference>
199 |     <summary>yankees get south korea</summary>
200 |     <eval>
201 |       <BLEU score="0.5255967942291402"/>
202 |       <ROUGE_1 score="0.25"/>
203 |       <ROUGE_2 score="0"/>
204 |       <ROUGE_l score="0.25"/>
205 |       <ROUGE_be score="0"/>
206 |     </eval>
207 |   </example>
208 |   <example>
209 |     <article>teenage hollywood starlet lindsay lohan , who was rushed to hospital this week after suffering an asthma attack , has admitted fighting a fierce battle with the eating disorder bulimia #</article>
210 |     <reference>hollywood starlet lindsay lohan admits bulimia battle</reference>
211 |     <summary>celebrity star ca n't fight me aussie tells journalists</summary>
212 |     <eval>
213 |       <BLEU score="0.13485111859503685"/>
214 |       <ROUGE_1 score="0"/>
215 |       <ROUGE_2 score="0"/>
216 |       <ROUGE_l score="0"/>
217 |       <ROUGE_be score="0"/>
218 |     </eval>
219 |   </example>
220 |   <example>
221 |     <article>the white house vigorously rejected wednesday suggestions that us soldiers in iraq do not hesitate to fire on civilians , after a bombing killed eight iraqi non-combatants , including two children #</article>
222 |     <reference>us insists soldiers act with restraint to protect civilians</reference>
223 |     <summary>white house vows to fired civilians after bombing</summary>
224 |     <eval>
225 |       <BLEU score="0.5183741882011644"/>
226 |       <ROUGE_1 score="0.16666666666666666"/>
227 |       <ROUGE_2 score="0"/>
228 |       <ROUGE_l score="0.16666666666666666"/>
229 |       <ROUGE_be score="0"/>
230 |     </eval>
231 |   </example>
232 |   <example>
233 |     <article>the us special envoy to multilateral talks aimed at ending north korea 's nuclear weapons drive has quit , the state department said wednesday amid reported divisions within the administration over the nuclear issue #</article>
234 |     <reference>us special envoy to korean nuclear talks quits</reference>
235 |     <summary>us envoy in seoul to resolve north korea 's nuclear program</summary>
236 |     <eval>
237 |       <BLEU score="0.4740604259878137"/>
238 |       <ROUGE_1 score="0.30769230769230765"/>
239 |       <ROUGE_2 score="0"/>
240 |       <ROUGE_l score="0.30769230769230765"/>
241 |       <ROUGE_be score="0"/>
242 |     </eval>
243 |   </example>
244 |   <example>
245 |     <article>at least two people have tested positive for the bird flu virus in eastern turkey , health minister recep akdag told a news conference wednesday #</article>
246 |     <reference>two test positive for bird flu virus in turkey</reference>
247 |     <summary>afp world news summary</summary>
248 |     <eval>
249 |       <BLEU score="0.12944315424334968"/>
250 |       <ROUGE_1 score="0"/>
251 |       <ROUGE_2 score="0"/>
252 |       <ROUGE_l score="0"/>
253 |       <ROUGE_be score="0"/>
254 |     </eval>
255 |   </example>
256 |   <example>
257 |     <article>britain 's un envoy on wednesday urged stronger international support , including greater eu funding , for the african union -lrb- au -rrb- peacekeeping mission in sudan 's troubled darfur region to improve security on the ground #</article>
258 |     <reference>britain urges stronger international support for au in darfur</reference>
259 |     <summary>british envoy urges peacekeeping mission for darfur security</summary>
260 |     <eval>
261 |       <BLEU score="0.5736753417215605"/>
262 |       <ROUGE_1 score="0.2857142857142857"/>
263 |       <ROUGE_2 score="0"/>
264 |       <ROUGE_l score="0.2857142857142857"/>
265 |       <ROUGE_be score="0"/>
266 |     </eval>
267 |   </example>
268 |   <example>
269 |     <article>australian foreign minister alexander downer called wednesday for the reform of the un security council and expressed support for brazil , india , japan and an african country to join the council #</article>
270 |     <reference>australia backs brazil others for un security council</reference>
271 |     <summary>australian fm urges reform of un security council</summary>
272 |     <eval>
273 |       <BLEU score="7.7305517569394535"/>
274 |       <ROUGE_1 score="0.3636363636363636"/>
275 |       <ROUGE_2 score="0.22222222222222224"/>
276 |       <ROUGE_l score="0.3636363636363636"/>
277 |       <ROUGE_be score="0"/>
278 |     </eval>
279 |   </example>
280 |   <example>
281 |     <article>two egyptian border guards were killed wednesday in clashes with palestinian militants near the rafah crossing on the border with gaza , a medical source said #</article>
282 |     <reference>two egyptian guards killed on border with gaza</reference>
283 |     <summary>two egyptian border guards killed in clashes with palestinian</summary>
284 |     <eval>
285 |       <BLEU score="2.509862124397896"/>
286 |       <ROUGE_1 score="0.7272727272727272"/>
287 |       <ROUGE_2 score="0.22222222222222224"/>
288 |       <ROUGE_l score="0.5454545454545454"/>
289 |       <ROUGE_be score="0"/>
290 |     </eval>
291 |   </example>
292 |   <example>
293 |     <article>stephen harper and his conservative party pulled ahead of prime minister paul martin 's liberals wednesday , weeks before canada 's federal election on january # , according to the latest poll #</article>
294 |     <reference>conservatives gain momentum ahead of ruling liberals</reference>
295 |     <summary>coalition partner ahead of canada 's liberal</summary>
296 |     <eval>
297 |       <BLEU score="2.208959113415788"/>
298 |       <ROUGE_1 score="0.1818181818181818"/>
299 |       <ROUGE_2 score="0"/>
300 |       <ROUGE_l score="0.1818181818181818"/>
301 |       <ROUGE_be score="0"/>
302 |     </eval>
303 |   </example>
304 |   <example>
305 |     <article>dutch bank abn amro was hit wednesday with its second us fine in two months after settling a government probe into its mortgage lending in the industrial state of michigan #</article>
306 |     <reference>abn amro hit with second us fine</reference>
307 |     <summary>dutch bank abn amro suffers second month in two months</summary>
308 |     <eval>
309 |       <BLEU score="1.561969968460128"/>
310 |       <ROUGE_1 score="0.36363636363636365"/>
311 |       <ROUGE_2 score="0.2222222222222222"/>
312 |       <ROUGE_l score="0.36363636363636365"/>
313 |       <ROUGE_be score="0"/>
314 |     </eval>
315 |   </example>
316 |   <example>
317 |     <article>the buffalo bills sacked tom donahoe as president and general manager on wednesday , fulfilling expectations of a shake-up after another failure to make the national football league playoffs #</article>
318 |     <reference>nfl 's bills shake up front office</reference>
319 |     <summary>bills sacked as head of fifa shake-up</summary>
320 |     <eval>
321 |       <BLEU score="0.5873949094699213"/>
322 |       <ROUGE_1 score="0.4000000000000001"/>
323 |       <ROUGE_2 score="0"/>
324 |       <ROUGE_l score="0.4000000000000001"/>
325 |       <ROUGE_be score="0"/>
326 |     </eval>
327 |   </example>
328 |   <example>
329 |     <article>tottenham sealed a miserable festive period for a toothless manchester city here wednesday with a #-# win thanks to goals from their confident striking duo of ahmed mido and robbie keane #</article>
330 |     <reference>spurs compound manchester city woes</reference>
331 |     <summary>spurs get a boost for</summary>
332 |     <eval>
333 |       <BLEU score="0.9554427922043668"/>
334 |       <ROUGE_1 score="0.28571428571428575"/>
335 |       <ROUGE_2 score="0"/>
336 |       <ROUGE_l score="0.28571428571428575"/>
337 |       <ROUGE_be score="0"/>
338 |     </eval>
339 |   </example>
340 |   <example>
341 |     <article>the dollar fell to a two-month low against the euro wednesday on expectations that us interest rates will soon stop going up , depriving the greenback of its biggest investment appeal #</article>
342 |     <reference>us rate outlook sends dollar to two-month low against euro</reference>
343 |     <summary>dollar gets two-month low on expectations</summary>
344 |     <eval>
345 |       <BLEU score="1.5512258520268645"/>
346 |       <ROUGE_1 score="0.5454545454545454"/>
347 |       <ROUGE_2 score="0.4444444444444444"/>
348 |       <ROUGE_l score="0.5454545454545454"/>
349 |       <ROUGE_be score="0"/>
350 |     </eval>
351 |   </example>
352 |   <example>
353 |     <article>two top us movie groups picked their # awards nominees wednesday , with gay drama `` brokeback mountain , '' george clooney 's `` good night and good '' luck and `` crash '' taking early leads in the oscars race #</article>
354 |     <reference>top us movie groups pick awards nominees as oscars loom</reference>
355 |     <summary>us oscar winner son 's # golden</summary>
356 |     <eval>
357 |       <BLEU score="0.3826519862236253"/>
358 |       <ROUGE_1 score="0"/>
359 |       <ROUGE_2 score="0"/>
360 |       <ROUGE_l score="0"/>
361 |       <ROUGE_be score="0"/>
362 |     </eval>
363 |   </example>
364 |   <example>
365 |     <article>right fielder jeromy burnitz inked a one-year , # million-dollar contract with the pittsburgh pirates on wednesday , a week after he was reported as close to a deal with baltimore #</article>
366 |     <reference>pirates ink outfielder burnitz to one-year deal</reference>
367 |     <summary>san angeles sign news summary</summary>
368 |     <eval>
369 |       <BLEU score="0.20252884954471367"/>
370 |       <ROUGE_1 score="0"/>
371 |       <ROUGE_2 score="0"/>
372 |       <ROUGE_l score="0"/>
373 |       <ROUGE_be score="0"/>
374 |     </eval>
375 |   </example>
376 |   <example>
377 |     <article>us president george w# bush said late wednesday that he and the first lady shared `` the concerns of the israeli people about prime minister ariel sharon 's health '' and were praying for his recovery #</article>
378 |     <reference>bush says he shares israelis concern over sharon</reference>
379 |     <summary>bush revives israeli health concerns</summary>
380 |     <eval>
381 |       <BLEU score="0.5243581219839234"/>
382 |       <ROUGE_1 score="0.20000000000000004"/>
383 |       <ROUGE_2 score="0"/>
384 |       <ROUGE_l score="0.20000000000000004"/>
385 |       <ROUGE_be score="0"/>
386 |     </eval>
387 |   </example>
388 |   <example>
389 |     <article>the prime minister of burkina faso , &lt;unk&gt; ernest yonli , has handed in his resignation to president blaise compaore , the president 's office announced wednesday #</article>
390 |     <reference>burkina faso 's prime minister resigns</reference>
391 |     <summary>burkina faso 's prime minister resigns</summary>
392 |     <eval>
393 |       <BLEU score="100.00000000000004"/>
394 |       <ROUGE_1 score="1.0"/>
395 |       <ROUGE_2 score="1.0"/>
396 |       <ROUGE_l score="1.0"/>
397 |       <ROUGE_be score="0"/>
398 |     </eval>
399 |   </example>
400 |   <example>
401 |     <article>a top hollywood event promoter is seeking a court order to stop socialite and hotel heiress paris hilton from allegedly harassing him and threatening his life , court documents showed wednesday #</article>
402 |     <reference>man seeks stay-away order against paris hilton</reference>
403 |     <summary>top hollywood event sues for violating theft</summary>
404 |     <eval>
405 |       <BLEU score="0.18575057999133598"/>
406 |       <ROUGE_1 score="0"/>
407 |       <ROUGE_2 score="0"/>
408 |       <ROUGE_l score="0"/>
409 |       <ROUGE_be score="0"/>
410 |     </eval>
411 |   </example>
412 |   <example>
413 |     <article>the un security council will hold a ministerial session on the situation in africa 's great lakes region late this month , &lt;unk&gt; 's un envoy augustine &lt;unk&gt; said wednesday #</article>
414 |     <reference>security council to hold ministerial session on africa 's great lakes</reference>
415 |     <summary>un security council to meet on africa situation</summary>
416 |     <eval>
417 |       <BLEU score="6.680901420728027"/>
418 |       <ROUGE_1 score="0.4615384615384615"/>
419 |       <ROUGE_2 score="0.18181818181818182"/>
420 |       <ROUGE_l score="0.4615384615384615"/>
421 |       <ROUGE_be score="0"/>
422 |     </eval>
423 |   </example>
424 |   <example>
425 |     <article>political leaders in israel united in prayers for ariel sharon on thursday as the prime minister underwent emergency surgery after suffering a massive stroke #</article>
426 |     <reference>israeli leaders unite in prayer for ailing sharon</reference>
427 |     <summary>israeli political leaders in emergency for sharon</summary>
428 |     <eval>
429 |       <BLEU score="0.7614310991217833"/>
430 |       <ROUGE_1 score="0.5454545454545454"/>
431 |       <ROUGE_2 score="0"/>
432 |       <ROUGE_l score="0.5454545454545454"/>
433 |       <ROUGE_be score="0"/>
434 |     </eval>
435 |   </example>
436 |   <example>
437 |     <article>israeli prime minister ariel sharon was undergoing an emergency operation thursday after suffering a massive stroke #</article>
438 |     <reference>timeline of sharon era</reference>
439 |     <summary>israeli pm eyes emergency operation after stroke</summary>
440 |     <eval>
441 |       <BLEU score="0.18575057999133598"/>
442 |       <ROUGE_1 score="0"/>
443 |       <ROUGE_2 score="0"/>
444 |       <ROUGE_l score="0"/>
445 |       <ROUGE_be score="0"/>
446 |     </eval>
447 |   </example>
448 |   <example>
449 |     <article>human trafficking victims could be spared deportation from britain and get automatic permission to stay under government proposals revealed thursday #</article>
450 |     <reference>human trafficking victims could get right to remain in britain</reference>
451 |     <summary>britain would allow loggers deportation from</summary>
452 |     <eval>
453 |       <BLEU score="0.372730318315106"/>
454 |       <ROUGE_1 score="0.25"/>
455 |       <ROUGE_2 score="0"/>
456 |       <ROUGE_l score="0.25"/>
457 |       <ROUGE_be score="0"/>
458 |     </eval>
459 |   </example>
460 |   <example>
461 |     <article>world number two rafael nadal is in doubt for this month 's australian open after withdrawing on thursday from next week 's lead-up sydney international tournament #</article>
462 |     <reference>nadal pulls out of sydney international</reference>
463 |     <summary>corretja in doubt for australian open</summary>
464 |     <eval>
465 |       <BLEU score="0.2295748846661433"/>
466 |       <ROUGE_1 score="0"/>
467 |       <ROUGE_2 score="0"/>
468 |       <ROUGE_l score="0"/>
469 |       <ROUGE_be score="0"/>
470 |     </eval>
471 |   </example>
472 |   <example>
473 |     <article>four-time all-star alexander mogilny was waived by new jersey on wednesday , one day after the devils welcomed back czech forward patrik elias #</article>
474 |     <reference>mogilny odd man out as devils welcome back elias</reference>
475 |     <summary>&lt; unk &gt; stuttgart # for new jersey</summary>
476 |     <eval>
477 |       <BLEU score="0.13784336590962212"/>
478 |       <ROUGE_1 score="0"/>
479 |       <ROUGE_2 score="0"/>
480 |       <ROUGE_l score="0"/>
481 |       <ROUGE_be score="0"/>
482 |     </eval>
483 |   </example>
484 |   <example>
485 |     <article>finance minister ehud olmert will chair an emergency meeting of the israeli cabinet on thursday after prime minister ariel sharon suffered a massive stroke , public television reported #</article>
486 |     <reference>olmert to chair emergency israel cabinet meet</reference>
487 |     <summary>olmert to chair israeli cabinet session after massive stroke</summary>
488 |     <eval>
489 |       <BLEU score="7.171781507224268"/>
490 |       <ROUGE_1 score="0.4615384615384615"/>
491 |       <ROUGE_2 score="0.1818181818181818"/>
492 |       <ROUGE_l score="0.4615384615384615"/>
493 |       <ROUGE_be score="0"/>
494 |     </eval>
495 |   </example>
496 |   <example>
497 |     <article>hollywood actor nick nolte has successfully completed three years of probation imposed for driving under the influence of drugs , a judge ruled on wednesday #</article>
498 |     <reference>us actor nick nolte ends &lt;unk&gt;</reference>
499 |     <summary>alvarez rules to be illegal cancer cases</summary>
500 |     <eval>
501 |       <BLEU score="0.16102307266026747"/>
502 |       <ROUGE_1 score="0"/>
503 |       <ROUGE_2 score="0"/>
504 |       <ROUGE_l score="0"/>
505 |       <ROUGE_be score="0"/>
506 |     </eval>
507 |   </example>
508 |   <example>
509 |     <article>south korea 's consumer confidence topped the benchmark #-point level for the first time in eight months in december , reflecting the country 's solid economic recovery , official data showed thursday #</article>
510 |     <reference>skorea 's consumer confidence tops benchmark #</reference>
511 |     <summary>skorea 's economic confidence hits key recovery</summary>
512 |     <eval>
513 |       <BLEU score="2.4446151121745046"/>
514 |       <ROUGE_1 score="0.3636363636363636"/>
515 |       <ROUGE_2 score="0"/>
516 |       <ROUGE_l score="0.3636363636363636"/>
517 |       <ROUGE_be score="0"/>
518 |     </eval>
519 |   </example>
520 |   <example>
521 |     <article>hollywood star charlie sheen and actress denise richards have scrapped efforts to reconcile their differences and have decided to push ahead with a divorce , court documents showed wednesday #</article>
522 |     <reference>denise richards charlie sheen push ahead with divorce</reference>
523 |     <summary>hollywood star &lt; unk &gt; stop reviving reconciliation</summary>
524 |     <eval>
525 |       <BLEU score="0.1561969968460128"/>
526 |       <ROUGE_1 score="0"/>
527 |       <ROUGE_2 score="0"/>
528 |       <ROUGE_l score="0"/>
529 |       <ROUGE_be score="0"/>
530 |     </eval>
531 |   </example>
532 |   <example>
533 |     <article>hong kong gold prices opened higher thursday at #-# # us dollars an ounce , compared to wednesday 's close of #-# # dollars an ounce #</article>
534 |     <reference>hong kong gold opens higher</reference>
535 |     <summary>hong kong gold opens higher</summary>
536 |     <eval>
537 |       <BLEU score="100.00000000000004"/>
538 |       <ROUGE_1 score="1.0"/>
539 |       <ROUGE_2 score="1.0"/>
540 |       <ROUGE_l score="1.0"/>
541 |       <ROUGE_be score="1.0"/>
542 |     </eval>
543 |   </example>
544 |   <example>
545 |     <article>japanese share prices rose # percent in morning trade thursday to hit the highest level in more than five years as fresh gains on wall street fanned upbeat investor sentiment here , dealers said #</article>
546 |     <reference>tokyo shares rise # percent in morning trade</reference>
547 |     <summary>tokyo shares rise # percent in morning trade</summary>
548 |     <eval>
549 |       <BLEU score="100.00000000000004"/>
550 |       <ROUGE_1 score="1.0"/>
551 |       <ROUGE_2 score="1.0"/>
552 |       <ROUGE_l score="1.0"/>
553 |       <ROUGE_be score="1.0"/>
554 |     </eval>
555 |   </example>
556 |   <example>
557 |     <article>a top police officer appealed thursday to supermodel kate moss to return to britain and face arrest amid allegations of &lt;unk&gt; #</article>
558 |     <reference>british police seek to arrest moss amid cocaine inquiry</reference>
559 |     <summary>supermodel deputy tries to return britain face charge of &lt; unk &gt;</summary>
560 |     <eval>
561 |       <BLEU score="0.30289764018096393"/>
562 |       <ROUGE_1 score="0"/>
563 |       <ROUGE_2 score="0"/>
564 |       <ROUGE_l score="0"/>
565 |       <ROUGE_be score="0"/>
566 |     </eval>
567 |   </example>
568 |   <example>
569 |     <article>the cerebral hemorrhage suffered by israeli prime minister ariel sharon occurs when a defective artery in the brain bursts : it is among the deadliest of the major types of stroke #</article>
570 |     <reference>key facts about hemorrhagic stroke</reference>
571 |     <summary>housing &lt; unk &gt; cause heart of government</summary>
572 |     <eval>
573 |       <BLEU score="0.1561969968460128"/>
574 |       <ROUGE_1 score="0"/>
575 |       <ROUGE_2 score="0"/>
576 |       <ROUGE_l score="0"/>
577 |       <ROUGE_be score="0"/>
578 |     </eval>
579 |   </example>
580 |   <example>
581 |     <article>hong kong share prices opened # percent higher thursday on follow-through interest in properties after wednesday 's sharp gains on abating interest rate worries , dealers said #</article>
582 |     <reference>hong kong shares open higher as rate worries ease</reference>
583 |     <summary>hong kong shares open # percent higher</summary>
584 |     <eval>
585 |       <BLEU score="32.66828640925501"/>
586 |       <ROUGE_1 score="0.7142857142857143"/>
587 |       <ROUGE_2 score="0.5"/>
588 |       <ROUGE_l score="0.7142857142857143"/>
589 |       <ROUGE_be score="0.6666666666666666"/>
590 |     </eval>
591 |   </example>
592 |   <example>
593 |     <article>trade between south korea and its communist neighbor north korea doubled last year , totalling more than one billion dollars for the first time , official data showed #</article>
594 |     <reference>inter-korean trade doubles to one billion dollars in #</reference>
595 |     <summary>skorea chalks up # billion dollars in first</summary>
596 |     <eval>
597 |       <BLEU score="7.330918073382302"/>
598 |       <ROUGE_1 score="0.3636363636363636"/>
599 |       <ROUGE_2 score="0.22222222222222224"/>
600 |       <ROUGE_l score="0.3636363636363636"/>
601 |       <ROUGE_be score="0"/>
602 |     </eval>
603 |   </example>
604 | </ZakSum>
605 | 
606 | 
607 | last training
608 | -------------
609 | step 35000: loss = 7.746098518371582
610 | step 36000: loss = 7.245748519897461
611 | step 37000: loss = 5.104197025299072
612 |  Epoch 12: Model is saved. Elapsed: 02:48:51.43 
613 | 
614 | 


--------------------------------------------------------------------------------
/Implementation A (seq2seq with attention and feature rich representation)/README.md:
--------------------------------------------------------------------------------
 1 | # Text Summurization models
 2 | 
 3 | This repo is built to collect multiple implementations for abstractive approaches to address text summurization , 
 4 | - it is built to simply run on google colab , in one notebook and to simply connect to your drive , so you would only need an internet connection to run these examples without the need to have a powerful machine , so all the code examples would be in a jupiter format .
 5 | 
 6 | ---------------------------------------------------------------------------------
 7 | 
 8 | # Model_1.ipynb
 9 |   is a modification on of David Currie's https://github.com/Currie32/Text-Summarization-with-Amazon-Reviews seq2seq , modifications made were , 
10 |   #### the model
11 |   - uses an encoder with multi layer RNN with LSTM
12 |   - Decoder is built using bahadau attention model
13 |   - Inference was cstom built by hand
14 |   #### Data is 
15 |   - Amazon Reviews
16 |   #### Word2Vec
17 |   - Conceptnet Numberbatch's (CN) embeddings, similar to GloVe, but probably better 
18 |   (https://github.com/commonsense/conceptnet-numberbatch)
19 |    ####  My Modifications
20 |    - enable the model to run on tf 1.11
21 |    - save data onto google drive , and connect the notebook to the drive
22 |    
23 | ---------------------------------------------------------------------------------
24 | 
25 | # Model_2 Folder
26 | ## Files:
27 | ### 1- Model_2/Model_2.ipynb
28 | a modification to https://github.com/dongjun-Lee/text-summarization-tensorflow 
29 |   #### the model
30 |   - uses an encoder with multi layer RNN with LSTM
31 |   - Decoder is built using bahadau attention model
32 |   - BeamSearchDecoder for inference
33 |   #### Data is 
34 |   - Dataset is available at harvardnlp/sent-summary (https://github.com/harvardnlp/sent-summary). Locate the summary.tar.gz file in project root directory. which is a collection of news and their titles
35 |   #### Word2Vec
36 |   - Used Glove pre-trained vectors to initialize word embedding
37 |    ####  My Modifications
38 |    - collected all of different modules in one notebook
39 |    - made it compatible with jupiter notebook
40 |    - save data onto google drive , and connect the notebook to the drive
41 |    - Add bleu and Rouge evaluation
42 |    - Save results with evaluations in xml format
43 | 
44 | ### 2- Model_2/Model 2 features(tf-idf , pos tags).ipynb
45 | a modification to Model 2.ipynb by using concepts from http://www.aclweb.org/anthology/K16-1028
46 | 
47 | It is a modification to the embedding vector , increasing its size from 300 to 320 by using additional features
48 |     - TF-IDF (term freq - inverse document freq)
49 |     - Parts of speach tags (POS Tags)
50 | 
51 | ## Results
52 | A folder contains the results of both the 2 models , from validation text samples 
53 | in a zaksum format , which is combineing all of 
54 |     - bleu
55 |     - rouge_1
56 |     - rouge_2
57 |     - rouge_L
58 |     - rouge_be
59 | for each sentence , and average of all of them
60 | 
61 | we can see that there is an increase el7 in the results when using additional features of tf-idf and pos tags ,
62 | , you can export your own zaksum format by using code from the notebooks of model 2
63 | ---------------------------------------------------------------------------------
64 | 
65 | # Model_3.ipynb
66 | a modification to https://github.com/thomasschmied/Text_Summarization_with_Tensorflow/blob/master/summarizer_amazon_reviews.ipynb 
67 |    ####  My Modifications
68 |    - made it compatible with jupiter notebook
69 |    - save data onto google drive , and connect the notebook to the drive
70 | 


--------------------------------------------------------------------------------
/Implementation B (Pointer Generator seq2seq network)/PreProcessData/README.md:
--------------------------------------------------------------------------------
 1 | # Preproccessing Data for Model 4 & Model 5
 2 | 
 3 | this implementation is a continuation of the amazing work done by
 4 | - abisee's https://github.com/abisee/cnn-dailymail
 5 | that is used in both 
 6 |     - https://github.com/abisee/pointer-generator **([Model 4](https://github.com/theamrzaki/text_summurization_abstractive_methods/tree/master/Implementation%20B%20(Pointer%20Generator%20seq2seq%20network)))**
 7 |     - https://github.com/yaserkl/RLSeq2Seq **([Model 5](https://github.com/theamrzaki/text_summurization_abstractive_methods/blob/master/Implementation%20C%20(Reinforcement%20Learning%20with%20seq2seq)/Model%205%20RL.ipynb))**
 8 | 
 9 | -------------------------------------------------
10 | ## My modification has been
11 | 1. input data has been simplified to be in a csv format 
12 | 
13 | | content        | title         
14 | | -------------  |:-------------:
15 | | tex1           | summary1
16 | | tex2           | summary2      
17 | | tex3           | asummary3   
18 | 
19 | 2. replacing Stanford CoreNLP with nltk tokenizer to make it easy to process data without the need to download java files
20 | 
21 | -------------------------------------------------
22 | ## How to Run 
23 | the requirements are , having
24 | 1. csv of your dataset must have 2 columbs 
25 | 
26 | | content        | title         
27 | | -------------  |:-------------:
28 | 
29 | 2.  modify the variable `cnn_stories_dir`
30 |      to point to your main directory 
31 | 
32 | 3.  replace `reviews_csv`
33 |     with your csv path
34 |  
35 | -------------------------------------------------
36 | 
37 | ## Output 
38 | 1. folder (cnn_stories_tokenized) used internally here 
39 | 2. **finished files** (the folder that we would use)
40 |     |--> **(folder) chunks** ==> (used in upload)
41 |     |--> test.bin  |--> not used in upload
42 |     |--> train.bin |--> not used in upload
43 |     |--> val.bin  |--> not used in upload
44 |     |--> **vocab**  ==> (used in upload)
45 | 
46 | 
47 | then 
48 | put both 
49 |   >>|--> **(folder) chunks** ==> (used in upload)
50 |   >>|--> **vocab**  ==> (used in upload)
51 |   in a zip and upload online
52 | 
53 | 
54 | 
55 | -------------------------------------------------
56 | 
57 | 
58 | ## Use In model 4
59 |  in the last code cell modify both
60 | `FLAGS.data_path`  : to point to the chunked files
61 | `FLAGS.vocab_path` : to point to the vocab file
62 | 


--------------------------------------------------------------------------------
/Implementation B (Pointer Generator seq2seq network)/PreProcessData/process_English.py:
--------------------------------------------------------------------------------
  1 | ﻿import ProgressBar
  2 | 
  3 | import sys
  4 | import os
  5 | import hashlib
  6 | import struct
  7 | import subprocess
  8 | import collections
  9 | import tensorflow as tf
 10 | from tensorflow.core.example import example_pb2
 11 | import nltk
 12 | import pandas as pd
 13 | 
 14 | #for cleaning text 
 15 | def clean_text(text, remove_stopwords = True):
 16 |     '''Remove unwanted characters, stopwords, and format the text to create fewer nulls word embeddings'''
 17 |     
 18 |     # Convert words to lower case
 19 |     text = text.lower()
 20 |     
 21 |     # Replace contractions with their longer forms 
 22 |     if True:
 23 |         text = text.split()
 24 |         new_text = []
 25 |         for word in text:
 26 |             if word in contractions:
 27 |                 new_text.append(contractions[word])
 28 |             else:
 29 |                 new_text.append(word)
 30 |         text = " ".join(new_text)
 31 |     
 32 |     # Format words and remove unwanted characters
 33 |     text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
 34 |     text = re.sub(r'\<a href', ' ', text)
 35 |     text = re.sub(r'&amp;', '', text) 
 36 |     text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
 37 |     text = re.sub(r'<br />', ' ', text)
 38 |     text = re.sub(r'\'', ' ', text)
 39 |     
 40 |     # Optionally, remove stop words
 41 |     if remove_stopwords:
 42 |         text = text.split()
 43 |         stops = set(stopwords.words("english"))
 44 |         text = [w for w in text if not w in stops]
 45 |         text = " ".join(text)
 46 | 
 47 |     return text
 48 | 
 49 | 
 50 | dm_single_close_quote = u'\u2019' # unicode
 51 | dm_double_close_quote = u'\u201d'
 52 | END_TOKENS = ['.', '!', '?', '...', "'", "`", '"', dm_single_close_quote, dm_double_close_quote, ")"] # acceptable ways to end a sentence
 53 | 
 54 | # We use these to separate the summary sentences in the .bin datafiles
 55 | SENTENCE_START = '<s>'
 56 | SENTENCE_END = '</s>'
 57 | 
 58 | all_train_urls = ""
 59 | all_val_urls = ""
 60 | all_test_urls = ""
 61 | 
 62 | cnn_tokenized_stories_dir = "cnn_stories_tokenized" #location of folder to tokenize text
 63 | dm_tokenized_stories_dir = "dm_stories_tokenized" #not used
 64 | finished_files_dir = "arabic_finished_files" #final ouput
 65 | chunks_dir = os.path.join(finished_files_dir, "chunked")
 66 | 
 67 | 
 68 | 
 69 | VOCAB_SIZE = 200000
 70 | CHUNK_SIZE = 1000 # num examples per chunk, for the chunked data
 71 | 
 72 | 
 73 | def chunk_file(set_name):
 74 |   in_file = finished_files_dir + '/%s.bin' % set_name
 75 |   reader = open(in_file, "rb")
 76 |   chunk = 0
 77 |   finished = False
 78 |   while not finished:
 79 |     chunk_fname = os.path.join(chunks_dir, '%s_%03d.bin' % (set_name, chunk)) # new chunk
 80 |     with open(chunk_fname, 'wb') as writer:
 81 |       for _ in range(CHUNK_SIZE):
 82 |         len_bytes = reader.read(8)
 83 |         if not len_bytes:
 84 |           finished = True
 85 |           break
 86 |         str_len = struct.unpack('q', len_bytes)[0]
 87 |         example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]
 88 |         writer.write(struct.pack('q', str_len))
 89 |         writer.write(struct.pack('%ds' % str_len, example_str))
 90 |       chunk += 1
 91 | 
 92 | 
 93 | def chunk_all():
 94 |   # Make a dir to hold the chunks
 95 |   if not os.path.isdir(chunks_dir):
 96 |     os.mkdir(chunks_dir)
 97 |   # Chunk the data
 98 |   for set_name in ['train', 'val', 'test']:
 99 |     print ("Splitting %s data into chunks..." % set_name)
100 |     chunk_file(set_name)
101 |   print ("Saved chunked data in %s" % chunks_dir)
102 | 
103 | 
104 | def tokenize_stories(reviews, tokenized_stories_dir):
105 |   """Maps a whole directory of .story files to a tokenized version using Stanford CoreNLP Tokenizer"""
106 |   progress = ProgressBar.ProgressBar(len(reviews), fmt=ProgressBar.ProgressBar.FULL)
107 | 
108 |   for i, row in reviews.iterrows():
109 |         #if i==20:
110 |         #    break
111 |         filename = str(i) + '.tok'
112 |         with open(os.path.join(tokenized_stories_dir, filename), 'w', encoding="utf-8") as temp_file:
113 |             text = row["content"]
114 |             text = clean_text(text , remove_stopwords = True)
115 |             tok = nltk.word_tokenize(text)
116 |             tok.append("@highlight")
117 |             Summary = row["title"]
118 |             Summary = clean_text(Summary ,remove_stopwords = False)
119 |             tok.extend(nltk.word_tokenize(Summary))
120 |             list = tok.copy()
121 | 
122 |             for i in tok:
123 |                 if(i=='``' or i=="''" ):
124 |                     list.remove(i)
125 |             tok_string = "\n".join(str(x) for x in list)
126 |             temp_file.write(tok_string)
127 | 
128 |         progress.current += 1
129 |         progress()
130 |   print ("Successfully finished tokenizing to %s .\n" % (tokenized_stories_dir))
131 | 
132 | 
133 | def fix_missing_period(line):
134 |   """Adds a period to a line that is missing a period"""
135 |   if "@highlight" in line: return line
136 |   if line=="": return line
137 |   if line[-1] in END_TOKENS: return line
138 |   # print line[-1]
139 |   return line + " ."
140 | 
141 | def read_text_file(text_file):
142 |   lines = []
143 |   with open(text_file, "r", encoding="utf-8") as f:
144 |     for line in f:
145 |       lines.append(line.strip())
146 |   return lines
147 | 
148 | def get_art_abs(story_file):
149 |   lines = read_text_file(story_file)
150 | 
151 |   # Lowercase everything
152 |   lines = [line.lower() for line in lines]
153 | 
154 |   # Put periods on the ends of lines that are missing them (this is a problem in the dataset because many image captions don't end in periods; consequently they end up in the body of the article as run-on sentences)
155 |   lines = [fix_missing_period(line) for line in lines]
156 | 
157 |   # Separate out article and abstract sentences
158 |   article_lines = []
159 |   highlights = []
160 |   next_is_highlight = False
161 |   for idx,line in enumerate(lines):
162 |     if line == "":
163 |       continue # empty line
164 |     elif line.startswith("@highlight"):
165 |       next_is_highlight = True
166 |     elif next_is_highlight:
167 |       highlights.append(line)
168 |     else:
169 |       article_lines.append(line)
170 | 
171 |   # Make article into a single string
172 |   article = ' '.join(article_lines)
173 | 
174 |   # Make abstract into a signle string, putting <s> and </s> tags around the sentences
175 |   abstract = ' '.join(["%s %s %s" % (SENTENCE_START, sent, SENTENCE_END) for sent in highlights])
176 | 
177 |   return article, abstract
178 | 
179 | 
180 | def write_to_bin(file_names, out_file, makevocab=False):
181 |   """Reads the tokenized .story files corresponding to the urls listed in the url_file and writes them to a out_file."""
182 |  
183 |   story_fnames = [str(s)+".tok" for s in file_names]
184 |   num_stories = len(story_fnames)
185 | 
186 |   if makevocab:
187 |     vocab_counter = collections.Counter()
188 | 
189 |   with open(out_file, 'wb') as writer:
190 |     for idx,s in enumerate(story_fnames):
191 |       if idx % 1000 == 0:
192 |         print( "Writing story %i of %i; %.2f percent done" % (idx, num_stories, float(idx)*100.0/float(num_stories)))
193 | 
194 |       # Look in the tokenized story dirs to find the .story file corresponding to this url
195 |       if os.path.isfile(os.path.join(cnn_tokenized_stories_dir, s)):
196 |         story_file = os.path.join(cnn_tokenized_stories_dir, s)
197 |       elif os.path.isfile(os.path.join(dm_tokenized_stories_dir, s)):
198 |         story_file = os.path.join(dm_tokenized_stories_dir, s)
199 |       else:
200 |         print ("Error: Couldn't find tokenized story file %s in either tokenized story directories %s and %s. Was there an error during tokenization?" % (s, cnn_tokenized_stories_dir, dm_tokenized_stories_dir))
201 |         # Check again if tokenized stories directories contain correct number of files
202 |         print ("Checking that the tokenized stories directories %s and %s contain correct number of files..." % (cnn_tokenized_stories_dir, dm_tokenized_stories_dir))
203 |         #check_num_stories(cnn_tokenized_stories_dir, num_expected_cnn_stories)
204 |         #check_num_stories(dm_tokenized_stories_dir, num_expected_dm_stories)
205 |         #raise Exception("Tokenized stories directories %s and %s contain correct number of files but story file %s found in neither." % (cnn_tokenized_stories_dir, dm_tokenized_stories_dir, s))
206 |         
207 |       # Get the strings to write to .bin file
208 |       article, abstract = get_art_abs(story_file)
209 | 
210 |       
211 |       # Write to tf.Example
212 |       tf_example = example_pb2.Example()
213 |       tf_example.features.feature['article'].bytes_list.value.extend([article.encode('utf-8')])
214 |       tf_example.features.feature['abstract'].bytes_list.value.extend([abstract.encode('utf-8')])
215 |       tf_example_str = tf_example.SerializeToString()
216 |       str_len = len(tf_example_str)
217 |       writer.write(struct.pack('q', str_len))
218 |       writer.write(struct.pack('%ds' % str_len, tf_example_str))
219 |    
220 | 
221 |       # Write the vocab to file, if applicable
222 |       if makevocab:
223 |         art_tokens = article.split(' ')
224 |         abs_tokens = abstract.split(' ')
225 |         abs_tokens = [t for t in abs_tokens if t not in [SENTENCE_START, SENTENCE_END]] # remove these tags from vocab
226 |         tokens = art_tokens + abs_tokens
227 |         tokens = [t.strip() for t in tokens] # strip
228 |         tokens = [t for t in tokens if t!=""] # remove empty
229 |         vocab_counter.update(tokens)
230 | 
231 |   print ("Finished writing file %s\n" % out_file)
232 | 
233 |   # write vocab to file
234 |   if makevocab:
235 |     print ("Writing vocab file...")
236 |     with open(os.path.join(finished_files_dir, "vocab"), 'w', encoding="utf-8") as writer:
237 |       for word, count in vocab_counter.most_common(VOCAB_SIZE):
238 |         writer.write(word + ' ' + str(count) + '\n')
239 |     print ("Finished writing vocab file")
240 | 
241 | 
242 | def check_num_stories(stories_dir, num_expected):
243 |   num_stories = len(os.listdir(stories_dir))
244 |   if num_stories != num_expected:
245 |     raise Exception("stories directory %s contains %i files but should contain %i" % (stories_dir, num_stories, num_expected))
246 | 
247 | 
248 | 
249 | 
250 | 
251 | """
252 | the requirements are , having
253 |     1- csv of your data set having 2 columbs 
254 |         content(text) |  summary 
255 |         by modifying 
256 |         cnn_stories_dir to pointtto your main directory 
257 |         and then replacing \ArabicBook00.csv
258 |         with your csv
259 | 
260 | 
261 | output would be 
262 |     1- folder (cnn_stories_tokenized) used internally here 
263 |     2- finished files (the folder that we would use)
264 |         |--> (folder) chunks ==> (used in upload)
265 |         |--> test.bin  |
266 |         |--> train.bin |--> not used in upload
267 |         |--> val.bin   |
268 |         |--> vocab  ==> (used in upload)
269 | 
270 |     then 
271 |     put both 
272 |       |--> (folder) chunks ==> (used in upload)
273 |       |--> vocab  ==> (used in upload)
274 |       in a zip and upload online
275 | """
276 | 
277 | 
278 | 
279 | if __name__ == '__main__':
280 |   #main directory
281 |   cnn_stories_dir =  r"E:\Handasa\Majester\thesis\python\DataProcessing\DataProcessing\arhelpers"
282 | 
283 |   # Create some new directories
284 |   if not os.path.exists(cnn_tokenized_stories_dir): os.makedirs(cnn_tokenized_stories_dir)
285 |   if not os.path.exists(finished_files_dir): os.makedirs(finished_files_dir)
286 | 
287 |   #data needed is in a csv format
288 |   #containg 2 columbs (content , title)
289 |   reviews_csv =cnn_stories_dir + "\ArabicBook00.csv"
290 |   reviews = pd.read_csv(reviews_csv)
291 |   reviews = reviews.filter(['content', 'title'])
292 |   reviews = reviews.dropna()
293 |   reviews = reviews.reset_index(drop=True)
294 |   reviews.head()
295 | 
296 |   # Run nltk tokenizer on both text and summary , outputting to tokenized stories directories
297 |   tokenize_stories(reviews, cnn_tokenized_stories_dir)
298 | 
299 |   #to get the length of your dataset
300 |   num_expected_cnn_stories =reviews.shape[0]
301 | 
302 |   #testing len = 2000
303 |   #validation lenght = 2000
304 |   all_train_urls = range(0,num_expected_cnn_stories-2000)
305 |   all_val_urls = range(num_expected_cnn_stories-2000,num_expected_cnn_stories-1000)
306 |   all_test_urls = range(num_expected_cnn_stories-1000,num_expected_cnn_stories)
307 | 
308 |   #for testing
309 |   ##############all_train_urls= range(0,80)
310 |   ##############all_val_urls = range(80,90)
311 |   ##############all_test_urls = range(90,100)
312 | 
313 |   # Read the tokenized stories, do a little postprocessing then write to bin files
314 |   write_to_bin(all_test_urls, os.path.join(finished_files_dir, "test.bin"))
315 |   write_to_bin(all_val_urls, os.path.join(finished_files_dir, "val.bin"))
316 |   write_to_bin(all_train_urls, os.path.join(finished_files_dir, "train.bin"), makevocab=True)
317 | 
318 |   # Chunk the data. This splits each of train.bin, val.bin and test.bin into smaller chunks, each containing e.g. 1000 examples, and saves them in finished_files/chunks
319 |   chunk_all()
320 | 
321 | 


--------------------------------------------------------------------------------
/Implementation B (Pointer Generator seq2seq network)/README.md:
--------------------------------------------------------------------------------
 1 | # Implementation B (Pointer Generator seq2seq network)
 2 | it is a continuation of the amazing work of
 3 | 	https://github.com/abisee/pointer-generator
 4 | 	https://arxiv.org/abs/1704.04368
 5 | this implementation uses the concept of having a pointer generator network to diminish some problems that appears with the normal 
 6 | seq2seq network
 7 | 
 8 | Know more about how this model was built , and about pointer genrator through this tutorial 
 9 | generator 
10 | - **Tutorial 7** [Pointer generator for combination of Abstractive & Extractive methods for Text Summarization](http://bit.ly/2EhcRIZ)
11 | 
12 | Try out this pointer genrator model through [this website (eazymind)](http://bit.ly/2VxhPqU) ,
13 | ![eazymind](https://scontent.fcai3-1.fna.fbcdn.net/v/t1.0-9/60785552_445522029607880_7282873905209933824_o.jpg?_nc_cat=101&_nc_ht=scontent.fcai3-1.fna&oh=927d1fae6521813b3d6e7a7d7a5b01aa&oe=5D5C3AD5) which enables you to summarize your text through
14 | - curl call
15 | ```
16 | curl -X POST 
17 | http://eazymind.herokuapp.com/arabic_sum/eazysum
18 | -H 'cache-control: no-cache' 
19 | -H 'content-type: application/x-www-form-urlencoded' 
20 | -d "eazykey={eazymind api key}&sentence={your sentence to be summarized}"
21 | ```
22 | - python package ([pip install eazymind](http://bit.ly/2Ef5XnS))
23 | 	```pip install eazymind```
24 | 	
25 | ```
26 | from nlp.eazysum import Summarizer
27 | 
28 | #---key from eazymind website---
29 | key = "xxxxxxxxxxxxxxxxxxxxx"
30 | 
31 | #---sentence to be summarized---
32 | sentence = """(CNN)The White House has instructed former
33 |     White House Counsel Don McGahn not to comply with a subpoena
34 |     for documents from House Judiciary Chairman Jerry Nadler, 
35 |     teeing up the latest in a series of escalating oversight 
36 |     showdowns between the Trump administration and congressional Democrats."""
37 |     
38 | summarizer = Summarizer(key)
39 | print(summarizer.run(sentence))
40 | ```
41 | 
42 | 
43 | 
44 | 
45 | ### Model_4_generator_.ipynb
46 | uses a pointer generator with seq2seq with attention 
47 | it is built using python2.7
48 | ### zaksum_eval.ipynb
49 | built by python3 for evaluation
50 | ### Results/Pointer Generator
51 | - output from generator (article / reference / summary) used as input to the zaksum_eval.ipynb
52 | - result from zaksum_eval
53 | 	
54 | 	
55 | i will still work on their implementation of coverage mechanism , so much work is yet to come if God wills it isA
56 | 


--------------------------------------------------------------------------------
/Implementation B (Pointer Generator seq2seq network)/zaksum_eval.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "nbformat": 4,
  3 |   "nbformat_minor": 0,
  4 |   "metadata": {
  5 |     "colab": {
  6 |       "name": "zaksum eval.ipynb",
  7 |       "version": "0.3.2",
  8 |       "provenance": [],
  9 |       "collapsed_sections": [],
 10 |       "include_colab_link": true
 11 |     },
 12 |     "kernelspec": {
 13 |       "name": "python3",
 14 |       "display_name": "Python 3"
 15 |     }
 16 |   },
 17 |   "cells": [
 18 |     {
 19 |       "cell_type": "markdown",
 20 |       "metadata": {
 21 |         "id": "view-in-github",
 22 |         "colab_type": "text"
 23 |       },
 24 |       "source": [
 25 |         "<a href=\"https://colab.research.google.com/github/theamrzaki/text_summurization_abstractive_methods/blob/master/Implementation%20B%20(Pointer%20Generator%20seq2seq%20network)/zaksum_eval.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
 26 |       ]
 27 |     },
 28 |     {
 29 |       "metadata": {
 30 |         "id": "HY3btgJWEqBP",
 31 |         "colab_type": "text"
 32 |       },
 33 |       "cell_type": "markdown",
 34 |       "source": [
 35 |         "### Google Drive"
 36 |       ]
 37 |     },
 38 |     {
 39 |       "metadata": {
 40 |         "id": "tdcKjPVpEjk8",
 41 |         "colab_type": "code",
 42 |         "colab": {
 43 |           "base_uri": "https://localhost:8080/",
 44 |           "height": 222
 45 |         },
 46 |         "outputId": "14ffae67-4fc4-4dfe-cd75-96505c64e06e"
 47 |       },
 48 |       "cell_type": "code",
 49 |       "source": [
 50 |         "#https://stackoverflow.com/questions/47744131/colaboratory-can-i-access-to-my-google-drive-folder-and-file\n",
 51 |         "\n",
 52 |         "!apt-get install -y -qq software-properties-common python-software-properties module-init-tools\n",
 53 |         "!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null\n",
 54 |         "!apt-get update -qq 2>&1 > /dev/null\n",
 55 |         "!apt-get -y install -qq google-drive-ocamlfuse fuse\n",
 56 |         "from google.colab import auth\n",
 57 |         "auth.authenticate_user()\n",
 58 |         "from oauth2client.client import GoogleCredentials\n",
 59 |         "creds = GoogleCredentials.get_application_default()\n",
 60 |         "import getpass\n",
 61 |         "!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL\n",
 62 |         "vcode = getpass.getpass()\n",
 63 |         "!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}\n",
 64 |         "\n",
 65 |         "!mkdir -p drive\n",
 66 |         "!google-drive-ocamlfuse drive"
 67 |       ],
 68 |       "execution_count": 2,
 69 |       "outputs": [
 70 |         {
 71 |           "output_type": "stream",
 72 |           "text": [
 73 |             "E: Package 'python-software-properties' has no installation candidate\n",
 74 |             "Selecting previously unselected package google-drive-ocamlfuse.\n",
 75 |             "(Reading database ... 110845 files and directories currently installed.)\n",
 76 |             "Preparing to unpack .../google-drive-ocamlfuse_0.7.1-0ubuntu3~ubuntu18.04.1_amd64.deb ...\n",
 77 |             "Unpacking google-drive-ocamlfuse (0.7.1-0ubuntu3~ubuntu18.04.1) ...\n",
 78 |             "Setting up google-drive-ocamlfuse (0.7.1-0ubuntu3~ubuntu18.04.1) ...\n",
 79 |             "Processing triggers for man-db (2.8.3-2ubuntu0.1) ...\n",
 80 |             "Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force\n",
 81 |             "··········\n",
 82 |             "Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force\n",
 83 |             "Please enter the verification code: Access token retrieved correctly.\n"
 84 |           ],
 85 |           "name": "stdout"
 86 |         }
 87 |       ]
 88 |     },
 89 |     {
 90 |       "metadata": {
 91 |         "id": "uZlHlsJAE75n",
 92 |         "colab_type": "text"
 93 |       },
 94 |       "cell_type": "markdown",
 95 |       "source": [
 96 |         "### Parser"
 97 |       ]
 98 |     },
 99 |     {
100 |       "metadata": {
101 |         "id": "GBUTmucLFHy4",
102 |         "colab_type": "code",
103 |         "colab": {}
104 |       },
105 |       "cell_type": "code",
106 |       "source": [
107 |         "article=[]\n",
108 |         "reference=[]\n",
109 |         "summary =[]\n",
110 |         "\n",
111 |         "import xml.etree.ElementTree\n",
112 |         "e = xml.etree.ElementTree.parse(\"drive/Colab Notebooks/Model 4 generator/result_pointer_4_1_2019_9_39pm.xml\").getroot()\n",
113 |         "\n",
114 |         "for atype in e.findall('example/article'):\n",
115 |         "    article.append(atype.text)\n",
116 |         "    \n",
117 |         "for atype in e.findall('example/reference'):\n",
118 |         "     reference.append(atype.text)\n",
119 |         "    \n",
120 |         "for atype in e.findall('example/summary'):\n",
121 |         "     summary.append(atype.text)"
122 |       ],
123 |       "execution_count": 0,
124 |       "outputs": []
125 |     },
126 |     {
127 |       "metadata": {
128 |         "id": "XJinxjb1Gwxg",
129 |         "colab_type": "text"
130 |       },
131 |       "cell_type": "markdown",
132 |       "source": [
133 |         "### zaksum"
134 |       ]
135 |     },
136 |     {
137 |       "metadata": {
138 |         "id": "a23wJfc6Gzg4",
139 |         "colab_type": "code",
140 |         "colab": {
141 |           "base_uri": "https://localhost:8080/",
142 |           "height": 440
143 |         },
144 |         "outputId": "8f1ea9b5-f8ac-40b7-d297-7ff0e3632dd3"
145 |       },
146 |       "cell_type": "code",
147 |       "source": [
148 |         "!pip install sumeval\n",
149 |         "!python -m spacy download en"
150 |       ],
151 |       "execution_count": 11,
152 |       "outputs": [
153 |         {
154 |           "output_type": "stream",
155 |           "text": [
156 |             "Collecting sumeval\n",
157 |             "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/10/a4/cd9da40dd7f32a2141808ba3af7101f0fcb0abfed918defd2df05b1410ac/sumeval-0.1.7.tar.gz (44kB)\n",
158 |             "\u001b[K    100% |████████████████████████████████| 51kB 2.0MB/s \n",
159 |             "\u001b[?25hRequirement already satisfied: plac>=0.9.6 in /usr/local/lib/python3.6/dist-packages (from sumeval) (0.9.6)\n",
160 |             "Collecting sacrebleu>=1.1.7 (from sumeval)\n",
161 |             "  Downloading https://files.pythonhosted.org/packages/37/51/bffea2b666d59d77be0413d35220022040a1f308c39009e5b023bc4eb8ab/sacrebleu-1.2.12.tar.gz\n",
162 |             "Collecting typing (from sacrebleu>=1.1.7->sumeval)\n",
163 |             "  Downloading https://files.pythonhosted.org/packages/4a/bd/eee1157fc2d8514970b345d69cb9975dcd1e42cd7e61146ed841f6e68309/typing-3.6.6-py3-none-any.whl\n",
164 |             "Building wheels for collected packages: sumeval, sacrebleu\n",
165 |             "  Running setup.py bdist_wheel for sumeval ... \u001b[?25l-\b \b\\\b \bdone\n",
166 |             "\u001b[?25h  Stored in directory: /root/.cache/pip/wheels/f1/7c/18/177331e7a5154401ab5fec8f3c6d2062508c38d832aaa3d5e2\n",
167 |             "  Running setup.py bdist_wheel for sacrebleu ... \u001b[?25l-\b \bdone\n",
168 |             "\u001b[?25h  Stored in directory: /root/.cache/pip/wheels/ea/0a/7d/ddcbdcd15a04b72de1b3f78e7e754aab415aff81c423376385\n",
169 |             "Successfully built sumeval sacrebleu\n",
170 |             "Installing collected packages: typing, sacrebleu, sumeval\n",
171 |             "Successfully installed sacrebleu-1.2.12 sumeval-0.1.7 typing-3.6.6\n",
172 |             "Requirement already satisfied: en_core_web_sm==2.0.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz#egg=en_core_web_sm==2.0.0 in /usr/local/lib/python3.6/dist-packages (2.0.0)\n",
173 |             "\n",
174 |             "\u001b[93m    Linking successful\u001b[0m\n",
175 |             "    /usr/local/lib/python3.6/dist-packages/en_core_web_sm -->\n",
176 |             "    /usr/local/lib/python3.6/dist-packages/spacy/data/en\n",
177 |             "\n",
178 |             "    You can now load the model via spacy.load('en')\n",
179 |             "\n"
180 |           ],
181 |           "name": "stdout"
182 |         }
183 |       ]
184 |     },
185 |     {
186 |       "metadata": {
187 |         "id": "7gN0BNbaG35g",
188 |         "colab_type": "code",
189 |         "colab": {}
190 |       },
191 |       "cell_type": "code",
192 |       "source": [
193 |         "#https://github.com/chakki-works/sumeval\n",
194 |         "#https://github.com/Tian312/awesome-text-summarization\n",
195 |         "\n",
196 |         "from sumeval.metrics.rouge import RougeCalculator\n",
197 |         "from sumeval.metrics.bleu import BLEUCalculator\n",
198 |         "\n",
199 |         "def eval_rouges(refrence_summary,model_summary):\n",
200 |         "    rouge = RougeCalculator(stopwords=True, lang=\"en\")\n",
201 |         "\n",
202 |         "    rouge_1 = rouge.rouge_n(\n",
203 |         "                summary=model_summary,\n",
204 |         "                references=refrence_summary,\n",
205 |         "                n=1)\n",
206 |         "\n",
207 |         "    rouge_2 = rouge.rouge_n(\n",
208 |         "                summary=model_summary,\n",
209 |         "                references=[refrence_summary],\n",
210 |         "                n=2)\n",
211 |         "    \n",
212 |         "    rouge_l = rouge.rouge_l(\n",
213 |         "                summary=model_summary,\n",
214 |         "                references=[refrence_summary])\n",
215 |         "    \n",
216 |         "    # You need spaCy to calculate ROUGE-BE\n",
217 |         "    \n",
218 |         "    rouge_be = rouge.rouge_be(\n",
219 |         "                summary=model_summary,\n",
220 |         "                references=[refrence_summary])\n",
221 |         "\n",
222 |         "    bleu = BLEUCalculator()\n",
223 |         "    bleu_score = bleu.bleu( summary=model_summary,\n",
224 |         "                        references=[refrence_summary])\n",
225 |         "    \n",
226 |         "    return rouge_1, rouge_2,rouge_l,rouge_be,bleu_score"
227 |       ],
228 |       "execution_count": 0,
229 |       "outputs": []
230 |     },
231 |     {
232 |       "metadata": {
233 |         "id": "i6atapqsG52A",
234 |         "colab_type": "code",
235 |         "colab": {}
236 |       },
237 |       "cell_type": "code",
238 |       "source": [
239 |         "#https://pymotw.com/2/xml/etree/ElementTree/create.html\n",
240 |         "\n",
241 |         "bleu_arr = []\n",
242 |         "rouge_1_arr  = []\n",
243 |         "rouge_2_arr  = []\n",
244 |         "rouge_L_arr  = []\n",
245 |         "rouge_be_arr = []\n",
246 |         "\n",
247 |         "from xml.etree import ElementTree\n",
248 |         "from xml.dom import minidom\n",
249 |         "from functools import reduce\n",
250 |         "\n",
251 |         "def prettify(elem):\n",
252 |         "    \"\"\"Return a pretty-printed XML string for the Element.\n",
253 |         "    \"\"\"\n",
254 |         "    rough_string = ElementTree.tostring(elem, 'utf-8')\n",
255 |         "    reparsed = minidom.parseString(rough_string)\n",
256 |         "    return reparsed.toprettyxml(indent=\"  \")\n",
257 |         "  \n",
258 |         "from xml.etree.ElementTree import Element, SubElement, Comment\n",
259 |         "\n",
260 |         "top = Element('ZakSum')\n",
261 |         "\n",
262 |         "def zaksum(article,reference,summary_array,default_path):\n",
263 |         "  comment = Comment('Generated by Amr Zaki')\n",
264 |         "  top.append(comment)\n",
265 |         "\n",
266 |         "  i=0\n",
267 |         "  for summ in summary_array:\n",
268 |         "    example = SubElement(top, 'example')\n",
269 |         "    article_element   = SubElement(example, 'article')\n",
270 |         "    article_element.text = article[i]\n",
271 |         "\n",
272 |         "    reference_element = SubElement(example, 'reference')\n",
273 |         "    reference_element.text = reference[i]\n",
274 |         "\n",
275 |         "    summary_element   = SubElement(example, 'summary')\n",
276 |         "    summary_element.text = summ\n",
277 |         "\n",
278 |         "    rouge_1, rouge_2,rouge_L,rouge_be,bleu_score = eval_rouges(reference[i],summ )\n",
279 |         "\n",
280 |         "    eval_element = SubElement(example, 'eval')\n",
281 |         "    bleu_score_element = SubElement(eval_element,'BLEU', {'score':str(bleu_score)})\n",
282 |         "    ROUGE_1_element  = SubElement(eval_element, 'ROUGE_1' , {'score':str(rouge_1)})\n",
283 |         "    ROUGE_2_element  = SubElement(eval_element, 'ROUGE_2' , {'score':str(rouge_2)})\n",
284 |         "    ROUGE_L_element  = SubElement(eval_element, 'ROUGE_l' , {'score':str(rouge_L)})\n",
285 |         "    ROUGE_be_element  = SubElement(eval_element,'ROUGE_be', {'score':str(rouge_be)})\n",
286 |         "\n",
287 |         "    bleu_arr.append(bleu_score) \n",
288 |         "    rouge_1_arr.append(rouge_1) \n",
289 |         "    rouge_2_arr.append(rouge_2) \n",
290 |         "    rouge_L_arr.append(rouge_L) \n",
291 |         "    rouge_be_arr.append(rouge_be) \n",
292 |         "\n",
293 |         "    i+=1\n",
294 |         "\n",
295 |         "  top.set('bleu', str(reduce(lambda x, y: x + y,  bleu_arr) / len(bleu_arr)))\n",
296 |         "  top.set('rouge_1', str(reduce(lambda x, y: x + y,  rouge_1_arr) / len(rouge_1_arr)))\n",
297 |         "  top.set('rouge_2', str(reduce(lambda x, y: x + y,  rouge_2_arr) / len(rouge_2_arr)))\n",
298 |         "  top.set('rouge_L', str(reduce(lambda x, y: x + y,  rouge_L_arr) / len(rouge_L_arr)))\n",
299 |         "  top.set('rouge_be', str(reduce(lambda x, y: x + y, rouge_be_arr) / len(rouge_be_arr)))\n",
300 |         "\n",
301 |         "  with open(default_path + \"result.xml\", \"w+\") as f:\n",
302 |         "    print(prettify(top), file=f)"
303 |       ],
304 |       "execution_count": 0,
305 |       "outputs": []
306 |     },
307 |     {
308 |       "metadata": {
309 |         "id": "O7cG7_wZHdOf",
310 |         "colab_type": "text"
311 |       },
312 |       "cell_type": "markdown",
313 |       "source": [
314 |         "#Run"
315 |       ]
316 |     },
317 |     {
318 |       "metadata": {
319 |         "id": "ZCMS9Bb_HQH4",
320 |         "colab_type": "code",
321 |         "colab": {}
322 |       },
323 |       "cell_type": "code",
324 |       "source": [
325 |         "zaksum(article,reference,summary,\"drive/Colab Notebooks/Model 4 generator/\")"
326 |       ],
327 |       "execution_count": 0,
328 |       "outputs": []
329 |     }
330 |   ]
331 | }


--------------------------------------------------------------------------------
/Implementation C (Reinforcement Learning with seq2seq)/Policy Gradient/README.md:
--------------------------------------------------------------------------------
 1 | # Policy-Gradient 
 2 | 
 3 | Here we implement Renforcement learning with deep learning to implement Policy-Gradient from [Paulus et al](https://arxiv.org/abs/1705.04304)
 4 | This is a library for implementing **Reinforcment learning with deep learning for text summarization** .
 5 | which has been converted to a jupyter notebook format to run seamlesly within google colab
 6 | 
 7 | ## Description
 8 | description from [yasterk](https://github.com/yaserkl/RLSeq2Seq#scheduled-sampling-soft-scheduled-sampling-and-end2endbackprop) 
 9 | 
10 | [Paulus et al](https://arxiv.org/abs/1705.04304). proposed a self-critic policy-gradient model for abstractive text summarization. The following figure represents how this method works and how we implemented this method:
11 | 
12 | image from [yasterk](https://github.com/yaserkl/RLSeq2Seq#scheduled-sampling-soft-scheduled-sampling-and-end2endbackprop) 
13 | ![Policy-Gradient](https://github.com/yaserkl/RLSeq2Seq/raw/master/docs/_img/selfcritic.png)
14 | 
15 | ```
16 | @ARTICLE{2017arXiv170504304P,
17 |        author = {{Paulus}, Romain and {Xiong}, Caiming and {Socher}, Richard},
18 |         title = "{A Deep Reinforced Model for Abstractive Summarization}",
19 |       journal = {arXiv e-prints},
20 |      keywords = {Computer Science - Computation and Language},
21 |          year = "2017",
22 |         month = "May",
23 |           eid = {arXiv:1705.04304},
24 |         pages = {arXiv:1705.04304},
25 | archivePrefix = {arXiv},
26 |        eprint = {1705.04304},
27 |  primaryClass = {cs.CL},
28 |        adsurl = {https://ui.adsabs.harvard.edu/abs/2017arXiv170504304P},
29 |       adsnote = {Provided by the SAO/NASA Astrophysics Data System}
30 | }
31 | ```
32 | 
33 | 
34 | 
35 | 
36 | 
37 | 


--------------------------------------------------------------------------------
/Implementation C (Reinforcement Learning with seq2seq)/Policy Gradient/zaksum_eval.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "nbformat": 4,
  3 |   "nbformat_minor": 0,
  4 |   "metadata": {
  5 |     "colab": {
  6 |       "name": "zaksum eval.ipynb",
  7 |       "version": "0.3.2",
  8 |       "provenance": [],
  9 |       "collapsed_sections": []
 10 |     },
 11 |     "kernelspec": {
 12 |       "name": "python3",
 13 |       "display_name": "Python 3"
 14 |     }
 15 |   },
 16 |   "cells": [
 17 |     {
 18 |       "cell_type": "markdown",
 19 |       "metadata": {
 20 |         "id": "HY3btgJWEqBP",
 21 |         "colab_type": "text"
 22 |       },
 23 |       "source": [
 24 |         "### Google Drive"
 25 |       ]
 26 |     },
 27 |     {
 28 |       "cell_type": "code",
 29 |       "metadata": {
 30 |         "id": "tdcKjPVpEjk8",
 31 |         "colab_type": "code",
 32 |         "outputId": "78a77680-e219-411e-82d2-c7d548a266ee",
 33 |         "colab": {
 34 |           "base_uri": "https://localhost:8080/",
 35 |           "height": 237
 36 |         }
 37 |       },
 38 |       "source": [
 39 |         "#https://stackoverflow.com/questions/47744131/colaboratory-can-i-access-to-my-google-drive-folder-and-file\n",
 40 |         "\n",
 41 |         "!apt-get install -y -qq software-properties-common python-software-properties module-init-tools\n",
 42 |         "!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null\n",
 43 |         "!apt-get update -qq 2>&1 > /dev/null\n",
 44 |         "!apt-get -y install -qq google-drive-ocamlfuse fuse\n",
 45 |         "from google.colab import auth\n",
 46 |         "auth.authenticate_user()\n",
 47 |         "from oauth2client.client import GoogleCredentials\n",
 48 |         "creds = GoogleCredentials.get_application_default()\n",
 49 |         "import getpass\n",
 50 |         "!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL\n",
 51 |         "vcode = getpass.getpass()\n",
 52 |         "!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}\n",
 53 |         "\n",
 54 |         "!mkdir -p drive\n",
 55 |         "!google-drive-ocamlfuse drive"
 56 |       ],
 57 |       "execution_count": 5,
 58 |       "outputs": [
 59 |         {
 60 |           "output_type": "stream",
 61 |           "text": [
 62 |             "E: Package 'python-software-properties' has no installation candidate\n",
 63 |             "Selecting previously unselected package google-drive-ocamlfuse.\n",
 64 |             "(Reading database ... 130911 files and directories currently installed.)\n",
 65 |             "Preparing to unpack .../google-drive-ocamlfuse_0.7.3-0ubuntu3~ubuntu18.04.1_amd64.deb ...\n",
 66 |             "Unpacking google-drive-ocamlfuse (0.7.3-0ubuntu3~ubuntu18.04.1) ...\n",
 67 |             "Setting up google-drive-ocamlfuse (0.7.3-0ubuntu3~ubuntu18.04.1) ...\n",
 68 |             "Processing triggers for man-db (2.8.3-2ubuntu0.1) ...\n",
 69 |             "Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force\n",
 70 |             "··········\n",
 71 |             "Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force\n",
 72 |             "Please enter the verification code: Access token retrieved correctly.\n"
 73 |           ],
 74 |           "name": "stdout"
 75 |         }
 76 |       ]
 77 |     },
 78 |     {
 79 |       "cell_type": "markdown",
 80 |       "metadata": {
 81 |         "id": "uZlHlsJAE75n",
 82 |         "colab_type": "text"
 83 |       },
 84 |       "source": [
 85 |         "### Parser"
 86 |       ]
 87 |     },
 88 |     {
 89 |       "cell_type": "code",
 90 |       "metadata": {
 91 |         "id": "GBUTmucLFHy4",
 92 |         "colab_type": "code",
 93 |         "colab": {}
 94 |       },
 95 |       "source": [
 96 |         "article=[]\n",
 97 |         "reference=[]\n",
 98 |         "summary =[]\n",
 99 |         "\n",
100 |         "import xml.etree.ElementTree\n",
101 |         "e = xml.etree.ElementTree.parse(\"drive/Colab Notebooks/Model 5 RL/Policy Gradient/result_policy_gradient_31_5_2019_9_37am.xml\").getroot()\n",
102 |         "\n",
103 |         "for atype in e.findall('example/article'):\n",
104 |         "    article.append(atype.text)\n",
105 |         "    \n",
106 |         "for atype in e.findall('example/reference'):\n",
107 |         "     reference.append(atype.text)\n",
108 |         "    \n",
109 |         "for atype in e.findall('example/summary'):\n",
110 |         "     summary.append(atype.text)"
111 |       ],
112 |       "execution_count": 0,
113 |       "outputs": []
114 |     },
115 |     {
116 |       "cell_type": "code",
117 |       "metadata": {
118 |         "id": "g6lMM3PCWZYh",
119 |         "colab_type": "code",
120 |         "outputId": "093dcb85-4426-4470-a01a-ee9490de25d4",
121 |         "colab": {
122 |           "base_uri": "https://localhost:8080/",
123 |           "height": 35
124 |         }
125 |       },
126 |       "source": [
127 |         "len(summary)"
128 |       ],
129 |       "execution_count": 0,
130 |       "outputs": [
131 |         {
132 |           "output_type": "execute_result",
133 |           "data": {
134 |             "text/plain": [
135 |               "10"
136 |             ]
137 |           },
138 |           "metadata": {
139 |             "tags": []
140 |           },
141 |           "execution_count": 4
142 |         }
143 |       ]
144 |     },
145 |     {
146 |       "cell_type": "markdown",
147 |       "metadata": {
148 |         "id": "XJinxjb1Gwxg",
149 |         "colab_type": "text"
150 |       },
151 |       "source": [
152 |         "### zaksum"
153 |       ]
154 |     },
155 |     {
156 |       "cell_type": "code",
157 |       "metadata": {
158 |         "id": "a23wJfc6Gzg4",
159 |         "colab_type": "code",
160 |         "outputId": "72949397-c6f7-42e0-a1f2-63fc6f7c8ecb",
161 |         "colab": {
162 |           "base_uri": "https://localhost:8080/",
163 |           "height": 455
164 |         }
165 |       },
166 |       "source": [
167 |         "!pip install sumeval\n",
168 |         "!python -m spacy download en"
169 |       ],
170 |       "execution_count": 1,
171 |       "outputs": [
172 |         {
173 |           "output_type": "stream",
174 |           "text": [
175 |             "Collecting sumeval\n",
176 |             "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/f1/cf/51bcf1d3b48003f7fd0cc6a7c89ef39a252c08acab842143709b5c679ea3/sumeval-0.1.8.tar.gz (44kB)\n",
177 |             "\u001b[K     |████████████████████████████████| 51kB 1.7MB/s \n",
178 |             "\u001b[?25hRequirement already satisfied: plac>=0.9.6 in /usr/local/lib/python3.6/dist-packages (from sumeval) (0.9.6)\n",
179 |             "Collecting sacrebleu>=1.3.2 (from sumeval)\n",
180 |             "  Downloading https://files.pythonhosted.org/packages/16/54/165b8d208788b99546032b0053b7c0fa61fa7bf2b55237d78a895168c24e/sacrebleu-1.3.4.tar.gz\n",
181 |             "Requirement already satisfied: typing in /usr/local/lib/python3.6/dist-packages (from sacrebleu>=1.3.2->sumeval) (3.6.6)\n",
182 |             "Building wheels for collected packages: sumeval, sacrebleu\n",
183 |             "  Building wheel for sumeval (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
184 |             "  Stored in directory: /root/.cache/pip/wheels/6a/f8/76/0b7fc4ff1ed9b973a3edb4ec38f6aa5309b7725d500d3a0d31\n",
185 |             "  Building wheel for sacrebleu (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
186 |             "  Stored in directory: /root/.cache/pip/wheels/5a/80/a8/e149303db81e58e7dbb29ee7943c43da9861872a5ea550256c\n",
187 |             "Successfully built sumeval sacrebleu\n",
188 |             "Installing collected packages: sacrebleu, sumeval\n",
189 |             "Successfully installed sacrebleu-1.3.4 sumeval-0.1.8\n",
190 |             "Requirement already satisfied: en_core_web_sm==2.0.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz#egg=en_core_web_sm==2.0.0 in /usr/local/lib/python3.6/dist-packages (2.0.0)\n",
191 |             "\n",
192 |             "\u001b[93m    Linking successful\u001b[0m\n",
193 |             "    /usr/local/lib/python3.6/dist-packages/en_core_web_sm -->\n",
194 |             "    /usr/local/lib/python3.6/dist-packages/spacy/data/en\n",
195 |             "\n",
196 |             "    You can now load the model via spacy.load('en')\n",
197 |             "\n"
198 |           ],
199 |           "name": "stdout"
200 |         }
201 |       ]
202 |     },
203 |     {
204 |       "cell_type": "code",
205 |       "metadata": {
206 |         "id": "7gN0BNbaG35g",
207 |         "colab_type": "code",
208 |         "colab": {}
209 |       },
210 |       "source": [
211 |         "#https://github.com/chakki-works/sumeval\n",
212 |         "#https://github.com/Tian312/awesome-text-summarization\n",
213 |         "\n",
214 |         "from sumeval.metrics.rouge import RougeCalculator\n",
215 |         "from sumeval.metrics.bleu import BLEUCalculator\n",
216 |         "\n",
217 |         "def eval_rouges(refrence_summary,model_summary):\n",
218 |         "    rouge = RougeCalculator(stopwords=True, lang=\"en\")\n",
219 |         "\n",
220 |         "    rouge_1 = rouge.rouge_n(\n",
221 |         "                summary=model_summary,\n",
222 |         "                references=refrence_summary,\n",
223 |         "                n=1)\n",
224 |         "\n",
225 |         "    rouge_2 = rouge.rouge_n(\n",
226 |         "                summary=model_summary,\n",
227 |         "                references=[refrence_summary],\n",
228 |         "                n=2)\n",
229 |         "    \n",
230 |         "    rouge_l = rouge.rouge_l(\n",
231 |         "                summary=model_summary,\n",
232 |         "                references=[refrence_summary])\n",
233 |         "    \n",
234 |         "    # You need spaCy to calculate ROUGE-BE\n",
235 |         "    \n",
236 |         "    rouge_be = rouge.rouge_be(\n",
237 |         "                summary=model_summary,\n",
238 |         "                references=[refrence_summary])\n",
239 |         "\n",
240 |         "    bleu = BLEUCalculator()\n",
241 |         "    bleu_score = bleu.bleu( summary=model_summary,\n",
242 |         "                        references=[refrence_summary])\n",
243 |         "    \n",
244 |         "    return rouge_1, rouge_2,rouge_l,rouge_be,bleu_score"
245 |       ],
246 |       "execution_count": 0,
247 |       "outputs": []
248 |     },
249 |     {
250 |       "cell_type": "code",
251 |       "metadata": {
252 |         "id": "i6atapqsG52A",
253 |         "colab_type": "code",
254 |         "colab": {}
255 |       },
256 |       "source": [
257 |         "#https://pymotw.com/2/xml/etree/ElementTree/create.html\n",
258 |         "\n",
259 |         "bleu_arr = []\n",
260 |         "rouge_1_arr  = []\n",
261 |         "rouge_2_arr  = []\n",
262 |         "rouge_L_arr  = []\n",
263 |         "rouge_be_arr = []\n",
264 |         "\n",
265 |         "from xml.etree import ElementTree\n",
266 |         "from xml.dom import minidom\n",
267 |         "from functools import reduce\n",
268 |         "\n",
269 |         "def prettify(elem):\n",
270 |         "    \"\"\"Return a pretty-printed XML string for the Element.\n",
271 |         "    \"\"\"\n",
272 |         "    rough_string = ElementTree.tostring(elem, 'utf-8')\n",
273 |         "    reparsed = minidom.parseString(rough_string)\n",
274 |         "    return reparsed.toprettyxml(indent=\"  \")\n",
275 |         "  \n",
276 |         "from xml.etree.ElementTree import Element, SubElement, Comment\n",
277 |         "\n",
278 |         "top = Element('ZakSum')\n",
279 |         "\n",
280 |         "def zaksum(article,reference,summary_array,default_path):\n",
281 |         "  comment = Comment('Generated by Amr Zaki')\n",
282 |         "  top.append(comment)\n",
283 |         "\n",
284 |         "  i=0\n",
285 |         "  for summ in summary_array:\n",
286 |         "    example = SubElement(top, 'example')\n",
287 |         "    article_element   = SubElement(example, 'article')\n",
288 |         "    article_element.text = article[i]\n",
289 |         "\n",
290 |         "    reference_element = SubElement(example, 'reference')\n",
291 |         "    reference_element.text = reference[i]\n",
292 |         "\n",
293 |         "    summary_element   = SubElement(example, 'summary')\n",
294 |         "    summary_element.text = summ\n",
295 |         "\n",
296 |         "    rouge_1, rouge_2,rouge_L,rouge_be,bleu_score = eval_rouges(reference[i],summ )\n",
297 |         "\n",
298 |         "    eval_element = SubElement(example, 'eval')\n",
299 |         "    bleu_score_element = SubElement(eval_element,'BLEU', {'score':str(bleu_score)})\n",
300 |         "    ROUGE_1_element  = SubElement(eval_element, 'ROUGE_1' , {'score':str(rouge_1)})\n",
301 |         "    ROUGE_2_element  = SubElement(eval_element, 'ROUGE_2' , {'score':str(rouge_2)})\n",
302 |         "    ROUGE_L_element  = SubElement(eval_element, 'ROUGE_l' , {'score':str(rouge_L)})\n",
303 |         "    ROUGE_be_element  = SubElement(eval_element,'ROUGE_be', {'score':str(rouge_be)})\n",
304 |         "\n",
305 |         "    bleu_arr.append(bleu_score) \n",
306 |         "    rouge_1_arr.append(rouge_1) \n",
307 |         "    rouge_2_arr.append(rouge_2) \n",
308 |         "    rouge_L_arr.append(rouge_L) \n",
309 |         "    rouge_be_arr.append(rouge_be) \n",
310 |         "\n",
311 |         "    i+=1\n",
312 |         "\n",
313 |         "  top.set('bleu', str(reduce(lambda x, y: x + y,  bleu_arr) / len(bleu_arr)))\n",
314 |         "  top.set('rouge_1', str(reduce(lambda x, y: x + y,  rouge_1_arr) / len(rouge_1_arr)))\n",
315 |         "  top.set('rouge_2', str(reduce(lambda x, y: x + y,  rouge_2_arr) / len(rouge_2_arr)))\n",
316 |         "  top.set('rouge_L', str(reduce(lambda x, y: x + y,  rouge_L_arr) / len(rouge_L_arr)))\n",
317 |         "  top.set('rouge_be', str(reduce(lambda x, y: x + y, rouge_be_arr) / len(rouge_be_arr)))\n",
318 |         "\n",
319 |         "  with open(default_path, \"w\") as f:\n",
320 |         "    print(prettify(top), file=f)"
321 |       ],
322 |       "execution_count": 0,
323 |       "outputs": []
324 |     },
325 |     {
326 |       "cell_type": "markdown",
327 |       "metadata": {
328 |         "id": "O7cG7_wZHdOf",
329 |         "colab_type": "text"
330 |       },
331 |       "source": [
332 |         "#Run"
333 |       ]
334 |     },
335 |     {
336 |       "cell_type": "code",
337 |       "metadata": {
338 |         "id": "ZCMS9Bb_HQH4",
339 |         "colab_type": "code",
340 |         "colab": {}
341 |       },
342 |       "source": [
343 |         "zaksum(article,reference,summary,\"drive/Colab Notebooks/Model 5 RL/Policy Gradient/result_policy.xml\")"
344 |       ],
345 |       "execution_count": 0,
346 |       "outputs": []
347 |     },
348 |     {
349 |       "cell_type": "code",
350 |       "metadata": {
351 |         "id": "mjExkxplhbaU",
352 |         "colab_type": "code",
353 |         "outputId": "3d8e89a6-5622-4f98-9145-bec4e9ea9c63",
354 |         "colab": {
355 |           "base_uri": "https://localhost:8080/",
356 |           "height": 207
357 |         }
358 |       },
359 |       "source": [
360 |         "reference"
361 |       ],
362 |       "execution_count": 0,
363 |       "outputs": [
364 |         {
365 |           "output_type": "execute_result",
366 |           "data": {
367 |             "text/plain": [
368 |               "[\"mentally ill inmates in miami are housed on the `` forgotten floor '' judge steven __leifman__ says most are there as a result of `` avoidable felonies '' while cnn tours facility , patient shouts : `` i am the son of the president '' __leifman__ says the system is unjust and he 's fighting for change .\",\n",
369 |               " \"harry potter star daniel radcliffe gets # 20m fortune as he turns 18 monday . young actor says he has no plans to __fritter__ his cash away . radcliffe 's earnings from first five potter films have been held in trust fund .\",\n",
370 |               " \"new : `` i thought i was going to die , '' driver says . man says pickup truck was folded in half ; he just has cut on face . driver : `` i probably had a 30 - , __35-foot__ free fall '' minnesota bridge collapsed during rush hour wednesday .\",\n",
371 |               " \"parents beam with pride , ca n't stop from smiling from outpouring of support . mom : `` i was so happy i did n't know what to do '' burn center in u.s. has offered to provide treatment for reconstructive surgeries . dad says , `` anything for youssif ''\",\n",
372 |               " \"five small __polyps__ found during procedure ; `` none worrisome , '' spokesman says . president !!__reclaims__!! powers transferred to vice president . bush undergoes routine colonoscopy at camp david .\",\n",
373 |               " \"new : nfl chief , atlanta falcons owner critical of michael vick 's conduct . nfl suspends falcons quarterback indefinitely without pay . vick admits funding dogfighting operation but says he did not gamble . vick due in federal court monday ; future in nfl remains uncertain .\",\n",
374 |               " \"aid workers : violence , increased cost of living drive women to prostitution . group is working to raise awareness of the problem with iraq 's political leaders . two iraqi mothers tell cnn they turned to prostitution to help feed their children . `` everything is for the children , '' one woman says .\",\n",
375 |               " \"president bush says tony snow `` will battle cancer and win '' job of press secretary `` has been a dream for me , '' snow says snow leaving on september 14 , will be succeeded by dana perino .\",\n",
376 |               " \"president bush to address the veterans of foreign wars on wednesday . bush to say that withdrawing from vietnam emboldened today 's terrorists . speech will be latest white house attempt to try to !!__reframe__!! the debate over iraq .\",\n",
377 |               " \"new : president bush says he and first lady are deeply saddened by the tragedy . mine safety and health administration chief : we 've run out of options . the six men have been trapped underground since august 6 . seven bore holes drilled into the mountain have found no signs of life .\"]"
378 |             ]
379 |           },
380 |           "metadata": {
381 |             "tags": []
382 |           },
383 |           "execution_count": 12
384 |         }
385 |       ]
386 |     },
387 |     {
388 |       "cell_type": "code",
389 |       "metadata": {
390 |         "id": "XUdXqYHKhNhm",
391 |         "colab_type": "code",
392 |         "outputId": "9ec95fc1-07b6-4bc6-ecfc-2408ecdbda80",
393 |         "colab": {
394 |           "base_uri": "https://localhost:8080/",
395 |           "height": 207
396 |         }
397 |       },
398 |       "source": [
399 |         "summary"
400 |       ],
401 |       "execution_count": 0,
402 |       "outputs": [
403 |         {
404 |           "output_type": "execute_result",
405 |           "data": {
406 |             "text/plain": [
407 |               "[\"soledad , soledad o'brien takes users inside a jail in covering before trial . he says face drug charges in miami are mentally ill . the ninth . he is well known in miami before trial ill . [unk]\",\n",
408 |               " 'daniel potter says he has no plans to fritter his cash # 20 million . he insists the money of the world five the world , the young actor he has not been able to be one to touch . [unk]',\n",
409 |               " '`` the whole bridge from one side of the mississippi to the other just completely gave . `` probably was churning and could move around to the scene . he he had could no way of getting to them . [unk]',\n",
410 |               " \"5-year-old youssif held his sister 's sister 's hand . he 's parents . he could n't stop in a request . he says he was so happy to held to help burn victims . [unk]\",\n",
411 |               " \"doctors removed five small polyps from president bush 's colon on saturday , and `` none appeared worrisome , and will resume his activities at camp david colon . the polyps were expected in two to three days . [unk]\",\n",
412 |               " 'nfl star michael vick is set to appear to appear in court monday . a judge will have the final say on a plea deal . the national football league vick in a plea . [unk]',\n",
413 |               " 'suha , 37 , is driven to sell to sell food . she says her husband she is cleaning to sell a day . she was cleaning houses , is cleaning houses when she says . [unk]',\n",
414 |               " \"white house press secretary tony snow will step down from his post on september 14 . `` will `` will sadly accept '' he will `` sadly accept for cancer 's resignation . `` he he 'll be a a solid contributor . [unk]\",\n",
415 |               " 'president bush will tell will try to put a twist on iraq of the war summit in canada city . bush bush will say will tell members of the veterans of foreign wars in canada . [unk]',\n",
416 |               " 'isaac is `` no remaining hope in a utah for miners . `` over the past 25 in price , the mine . `` the failed and prayers . `` are no sign of the six miners . [unk]']"
417 |             ]
418 |           },
419 |           "metadata": {
420 |             "tags": []
421 |           },
422 |           "execution_count": 11
423 |         }
424 |       ]
425 |     },
426 |     {
427 |       "cell_type": "code",
428 |       "metadata": {
429 |         "id": "8qRwSyz0hN0c",
430 |         "colab_type": "code",
431 |         "colab": {}
432 |       },
433 |       "source": [
434 |         ""
435 |       ],
436 |       "execution_count": 0,
437 |       "outputs": []
438 |     }
439 |   ]
440 | }


--------------------------------------------------------------------------------
/Implementation C (Reinforcement Learning with seq2seq)/README.md:
--------------------------------------------------------------------------------
 1 | # Implementation C (Reinforcement Learning For Sequence to Sequence )
 2 | 
 3 | this implementation is a continuation of the amazing work done by
 4 | https://github.com/yaserkl/RLSeq2Seq
 5 | https://arxiv.org/abs/1805.09461
 6 | 
 7 | 
 8 | ```
 9 | @article{keneshloo2018deep,
10 |  title={Deep Reinforcement Learning For Sequence to Sequence Models},
11 |  author={Keneshloo, Yaser and Shi, Tian and Ramakrishnan, Naren and Reddy, Chandan K.},
12 |  journal={arXiv preprint arXiv:1805.09461},
13 |  year={2018}
14 | }
15 | ```
16 | 
17 | This is a library for implementing **Reinforcment learning with deep learning for text summarization** , which has been converted to a jupyter notebook format to run seamlesly within google colab
18 | here apply some experiments from published papers using this library . 
19 | 
20 | ## Scheduled Sampling with intradecoder 
21 | description from [yasterk](https://github.com/yaserkl/RLSeq2Seq#scheduled-sampling-soft-scheduled-sampling-and-end2endbackprop) 
22 | 
23 | [Bengio et al](https://arxiv.org/abs/1506.03099). proposed the idea of scheduled sampling for avoiding exposure bias problem.
24 | ```
25 | @ARTICLE{2015arXiv150603099B,
26 |        author = {{Bengio}, Samy and {Vinyals}, Oriol and {Jaitly}, Navdeep and
27 |          {Shazeer}, Noam},
28 |         title = "{Scheduled Sampling for Sequence Prediction with Recurrent Neural Networks}",
29 |       journal = {arXiv e-prints},
30 |      keywords = {Computer Science - Machine Learning, Computer Science - Computation and Language, Computer Science - Computer Vision and Pattern Recognition},
31 |          year = "2015",
32 |         month = "Jun",
33 |           eid = {arXiv:1506.03099},
34 |         pages = {arXiv:1506.03099},
35 | archivePrefix = {arXiv},
36 |        eprint = {1506.03099},
37 |  primaryClass = {cs.LG},
38 |        adsurl = {https://ui.adsabs.harvard.edu/abs/2015arXiv150603099B},
39 |       adsnote = {Provided by the SAO/NASA Astrophysics Data System}
40 | }
41 | ```
42 | 
43 | 
44 | ## Policy-Gradient 
45 | description from [yasterk](https://github.com/yaserkl/RLSeq2Seq#scheduled-sampling-soft-scheduled-sampling-and-end2endbackprop) 
46 | 
47 | [Paulus et al](https://arxiv.org/abs/1705.04304). proposed a self-critic policy-gradient model for abstractive text summarization. The following figure represents how this method works and how we implemented this method:
48 | 
49 | image from [yasterk](https://github.com/yaserkl/RLSeq2Seq#scheduled-sampling-soft-scheduled-sampling-and-end2endbackprop) 
50 | ![Policy-Gradient](https://github.com/yaserkl/RLSeq2Seq/raw/master/docs/_img/selfcritic.png)
51 | 
52 | ```
53 | @ARTICLE{2017arXiv170504304P,
54 |        author = {{Paulus}, Romain and {Xiong}, Caiming and {Socher}, Richard},
55 |         title = "{A Deep Reinforced Model for Abstractive Summarization}",
56 |       journal = {arXiv e-prints},
57 |      keywords = {Computer Science - Computation and Language},
58 |          year = "2017",
59 |         month = "May",
60 |           eid = {arXiv:1705.04304},
61 |         pages = {arXiv:1705.04304},
62 | archivePrefix = {arXiv},
63 |        eprint = {1705.04304},
64 |  primaryClass = {cs.CL},
65 |        adsurl = {https://ui.adsabs.harvard.edu/abs/2017arXiv170504304P},
66 |       adsnote = {Provided by the SAO/NASA Astrophysics Data System}
67 | }
68 | ```
69 | 
70 | 
71 | 
72 | 
73 | 
74 | 


--------------------------------------------------------------------------------
/Implementation C (Reinforcement Learning with seq2seq)/Scheduled Sampling with intradecoder/README.md:
--------------------------------------------------------------------------------
 1 | # Scheduled Sampling with intradecoder
 2 | Here we implement Scheduled Sampling from [Bengio et al](https://arxiv.org/abs/1506.03099)
 3 | This is a library for implementing **Reinforcment learning with deep learning for text summarization** .
 4 | which has been converted to a jupyter notebook format to run seamlesly within google colab
 5 | 
 6 | ## Description
 7 | description from [yasterk](https://github.com/yaserkl/RLSeq2Seq#scheduled-sampling-soft-scheduled-sampling-and-end2endbackprop) 
 8 | 
 9 | [Bengio et al](https://arxiv.org/abs/1506.03099). proposed the idea of scheduled sampling for avoiding exposure bias problem.
10 | ```
11 | @ARTICLE{2015arXiv150603099B,
12 |        author = {{Bengio}, Samy and {Vinyals}, Oriol and {Jaitly}, Navdeep and
13 |          {Shazeer}, Noam},
14 |         title = "{Scheduled Sampling for Sequence Prediction with Recurrent Neural Networks}",
15 |       journal = {arXiv e-prints},
16 |      keywords = {Computer Science - Machine Learning, Computer Science - Computation and Language, Computer Science - Computer Vision and Pattern Recognition},
17 |          year = "2015",
18 |         month = "Jun",
19 |           eid = {arXiv:1506.03099},
20 |         pages = {arXiv:1506.03099},
21 | archivePrefix = {arXiv},
22 |        eprint = {1506.03099},
23 |  primaryClass = {cs.LG},
24 |        adsurl = {https://ui.adsabs.harvard.edu/abs/2015arXiv150603099B},
25 |       adsnote = {Provided by the SAO/NASA Astrophysics Data System}
26 | }
27 | ```
28 | 
29 | 
30 | 
31 | 


--------------------------------------------------------------------------------
/Implementation C (Reinforcement Learning with seq2seq)/Scheduled Sampling with intradecoder/zaksum eval.ipynb:
--------------------------------------------------------------------------------
1 | {"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"zaksum eval.ipynb","version":"0.3.2","provenance":[],"collapsed_sections":[]},"kernelspec":{"name":"python3","display_name":"Python 3"}},"cells":[{"metadata":{"id":"HY3btgJWEqBP","colab_type":"text"},"cell_type":"markdown","source":["### Google Drive"]},{"metadata":{"id":"tdcKjPVpEjk8","colab_type":"code","outputId":"8c4c5254-435b-477a-d7a3-02441870f058","executionInfo":{"status":"ok","timestamp":1547620108007,"user_tz":-120,"elapsed":79197,"user":{"displayName":"amr zaki","photoUrl":"","userId":"09456039094530776333"}},"colab":{"base_uri":"https://localhost:8080/","height":237}},"cell_type":"code","source":["#https://stackoverflow.com/questions/47744131/colaboratory-can-i-access-to-my-google-drive-folder-and-file\n","\n","!apt-get install -y -qq software-properties-common python-software-properties module-init-tools\n","!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null\n","!apt-get update -qq 2>&1 > /dev/null\n","!apt-get -y install -qq google-drive-ocamlfuse fuse\n","from google.colab import auth\n","auth.authenticate_user()\n","from oauth2client.client import GoogleCredentials\n","creds = GoogleCredentials.get_application_default()\n","import getpass\n","!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL\n","vcode = getpass.getpass()\n","!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}\n","\n","!mkdir -p drive\n","!google-drive-ocamlfuse drive"],"execution_count":1,"outputs":[{"output_type":"stream","text":["E: Package 'python-software-properties' has no installation candidate\n","Selecting previously unselected package google-drive-ocamlfuse.\n","(Reading database ... 110851 files and directories currently installed.)\n","Preparing to unpack .../google-drive-ocamlfuse_0.7.1-0ubuntu3~ubuntu18.04.1_amd64.deb ...\n","Unpacking google-drive-ocamlfuse (0.7.1-0ubuntu3~ubuntu18.04.1) ...\n","Setting up google-drive-ocamlfuse (0.7.1-0ubuntu3~ubuntu18.04.1) ...\n","Processing triggers for man-db (2.8.3-2ubuntu0.1) ...\n","Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force\n","··········\n","Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force\n","Please enter the verification code: Access token retrieved correctly.\n"],"name":"stdout"}]},{"metadata":{"id":"uZlHlsJAE75n","colab_type":"text"},"cell_type":"markdown","source":["### Parser"]},{"metadata":{"id":"GBUTmucLFHy4","colab_type":"code","colab":{}},"cell_type":"code","source":["article=[]\n","reference=[]\n","summary =[]\n","\n","import xml.etree.ElementTree\n","e = xml.etree.ElementTree.parse(\"drive/Colab Notebooks/Model 5 RL/result_rlfeaasf_9_1_2019_1_46am.xml\").getroot()\n","\n","for atype in e.findall('example/article'):\n","    article.append(atype.text)\n","    \n","for atype in e.findall('example/reference'):\n","     reference.append(atype.text)\n","    \n","for atype in e.findall('example/summary'):\n","     summary.append(atype.text)"],"execution_count":0,"outputs":[]},{"metadata":{"id":"g6lMM3PCWZYh","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":35},"outputId":"093dcb85-4426-4470-a01a-ee9490de25d4","executionInfo":{"status":"ok","timestamp":1547620168488,"user_tz":-120,"elapsed":2497,"user":{"displayName":"amr zaki","photoUrl":"","userId":"09456039094530776333"}}},"cell_type":"code","source":["len(summary)"],"execution_count":4,"outputs":[{"output_type":"execute_result","data":{"text/plain":["10"]},"metadata":{"tags":[]},"execution_count":4}]},{"metadata":{"id":"XJinxjb1Gwxg","colab_type":"text"},"cell_type":"markdown","source":["### zaksum"]},{"metadata":{"id":"a23wJfc6Gzg4","colab_type":"code","outputId":"cb950758-2f4f-462c-8c8e-9947a5f8a963","executionInfo":{"status":"ok","timestamp":1547620168486,"user_tz":-120,"elapsed":10486,"user":{"displayName":"amr zaki","photoUrl":"","userId":"09456039094530776333"}},"colab":{"base_uri":"https://localhost:8080/","height":528}},"cell_type":"code","source":["!pip install sumeval\n","!python -m spacy download en"],"execution_count":3,"outputs":[{"output_type":"stream","text":["Collecting sumeval\n","\u001b[?25l  Downloading https://files.pythonhosted.org/packages/10/a4/cd9da40dd7f32a2141808ba3af7101f0fcb0abfed918defd2df05b1410ac/sumeval-0.1.7.tar.gz (44kB)\n","\u001b[K    100% |████████████████████████████████| 51kB 1.7MB/s \n","\u001b[?25hRequirement already satisfied: plac>=0.9.6 in /usr/local/lib/python3.6/dist-packages (from sumeval) (0.9.6)\n","Collecting sacrebleu>=1.1.7 (from sumeval)\n","  Downloading https://files.pythonhosted.org/packages/37/51/bffea2b666d59d77be0413d35220022040a1f308c39009e5b023bc4eb8ab/sacrebleu-1.2.12.tar.gz\n","Collecting typing (from sacrebleu>=1.1.7->sumeval)\n","  Downloading https://files.pythonhosted.org/packages/4a/bd/eee1157fc2d8514970b345d69cb9975dcd1e42cd7e61146ed841f6e68309/typing-3.6.6-py3-none-any.whl\n","Building wheels for collected packages: sumeval, sacrebleu\n","  Running setup.py bdist_wheel for sumeval ... \u001b[?25l-\b \b\\\b \bdone\n","\u001b[?25h  Stored in directory: /root/.cache/pip/wheels/f1/7c/18/177331e7a5154401ab5fec8f3c6d2062508c38d832aaa3d5e2\n","  Running setup.py bdist_wheel for sacrebleu ... \u001b[?25l-\b \bdone\n","\u001b[?25h  Stored in directory: /root/.cache/pip/wheels/ea/0a/7d/ddcbdcd15a04b72de1b3f78e7e754aab415aff81c423376385\n","Successfully built sumeval sacrebleu\n","Installing collected packages: typing, sacrebleu, sumeval\n","Successfully installed sacrebleu-1.2.12 sumeval-0.1.7 typing-3.6.6\n","\u001b[0;31;1mWARNING: The following packages were previously imported in this runtime:\n","  [typing]\n","You must restart the runtime in order to use newly installed versions.\u001b[0m\n","Requirement already satisfied: en_core_web_sm==2.0.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz#egg=en_core_web_sm==2.0.0 in /usr/local/lib/python3.6/dist-packages (2.0.0)\n","\n","\u001b[93m    Linking successful\u001b[0m\n","    /usr/local/lib/python3.6/dist-packages/en_core_web_sm -->\n","    /usr/local/lib/python3.6/dist-packages/spacy/data/en\n","\n","    You can now load the model via spacy.load('en')\n","\n"],"name":"stdout"}]},{"metadata":{"id":"7gN0BNbaG35g","colab_type":"code","colab":{}},"cell_type":"code","source":["#https://github.com/chakki-works/sumeval\n","#https://github.com/Tian312/awesome-text-summarization\n","\n","from sumeval.metrics.rouge import RougeCalculator\n","from sumeval.metrics.bleu import BLEUCalculator\n","\n","def eval_rouges(refrence_summary,model_summary):\n","    rouge = RougeCalculator(stopwords=True, lang=\"en\")\n","\n","    rouge_1 = rouge.rouge_n(\n","                summary=model_summary,\n","                references=refrence_summary,\n","                n=1)\n","\n","    rouge_2 = rouge.rouge_n(\n","                summary=model_summary,\n","                references=[refrence_summary],\n","                n=2)\n","    \n","    rouge_l = rouge.rouge_l(\n","                summary=model_summary,\n","                references=[refrence_summary])\n","    \n","    # You need spaCy to calculate ROUGE-BE\n","    \n","    rouge_be = rouge.rouge_be(\n","                summary=model_summary,\n","                references=[refrence_summary])\n","\n","    bleu = BLEUCalculator()\n","    bleu_score = bleu.bleu( summary=model_summary,\n","                        references=[refrence_summary])\n","    \n","    return rouge_1, rouge_2,rouge_l,rouge_be,bleu_score"],"execution_count":0,"outputs":[]},{"metadata":{"id":"i6atapqsG52A","colab_type":"code","colab":{}},"cell_type":"code","source":["#https://pymotw.com/2/xml/etree/ElementTree/create.html\n","\n","bleu_arr = []\n","rouge_1_arr  = []\n","rouge_2_arr  = []\n","rouge_L_arr  = []\n","rouge_be_arr = []\n","\n","from xml.etree import ElementTree\n","from xml.dom import minidom\n","from functools import reduce\n","\n","def prettify(elem):\n","    \"\"\"Return a pretty-printed XML string for the Element.\n","    \"\"\"\n","    rough_string = ElementTree.tostring(elem, 'utf-8')\n","    reparsed = minidom.parseString(rough_string)\n","    return reparsed.toprettyxml(indent=\"  \")\n","  \n","from xml.etree.ElementTree import Element, SubElement, Comment\n","\n","top = Element('ZakSum')\n","\n","def zaksum(article,reference,summary_array,default_path):\n","  comment = Comment('Generated by Amr Zaki')\n","  top.append(comment)\n","\n","  i=0\n","  for summ in summary_array:\n","    example = SubElement(top, 'example')\n","    article_element   = SubElement(example, 'article')\n","    article_element.text = article[i]\n","\n","    reference_element = SubElement(example, 'reference')\n","    reference_element.text = reference[i]\n","\n","    summary_element   = SubElement(example, 'summary')\n","    summary_element.text = summ\n","\n","    rouge_1, rouge_2,rouge_L,rouge_be,bleu_score = eval_rouges(reference[i],summ )\n","\n","    eval_element = SubElement(example, 'eval')\n","    bleu_score_element = SubElement(eval_element,'BLEU', {'score':str(bleu_score)})\n","    ROUGE_1_element  = SubElement(eval_element, 'ROUGE_1' , {'score':str(rouge_1)})\n","    ROUGE_2_element  = SubElement(eval_element, 'ROUGE_2' , {'score':str(rouge_2)})\n","    ROUGE_L_element  = SubElement(eval_element, 'ROUGE_l' , {'score':str(rouge_L)})\n","    ROUGE_be_element  = SubElement(eval_element,'ROUGE_be', {'score':str(rouge_be)})\n","\n","    bleu_arr.append(bleu_score) \n","    rouge_1_arr.append(rouge_1) \n","    rouge_2_arr.append(rouge_2) \n","    rouge_L_arr.append(rouge_L) \n","    rouge_be_arr.append(rouge_be) \n","\n","    i+=1\n","\n","  top.set('bleu', str(reduce(lambda x, y: x + y,  bleu_arr) / len(bleu_arr)))\n","  top.set('rouge_1', str(reduce(lambda x, y: x + y,  rouge_1_arr) / len(rouge_1_arr)))\n","  top.set('rouge_2', str(reduce(lambda x, y: x + y,  rouge_2_arr) / len(rouge_2_arr)))\n","  top.set('rouge_L', str(reduce(lambda x, y: x + y,  rouge_L_arr) / len(rouge_L_arr)))\n","  top.set('rouge_be', str(reduce(lambda x, y: x + y, rouge_be_arr) / len(rouge_be_arr)))\n","\n","  with open(default_path + \"result_rlafddfaa2.xml\", \"w\") as f:\n","    print(prettify(top), file=f)"],"execution_count":0,"outputs":[]},{"metadata":{"id":"O7cG7_wZHdOf","colab_type":"text"},"cell_type":"markdown","source":["#Run"]},{"metadata":{"id":"ZCMS9Bb_HQH4","colab_type":"code","colab":{}},"cell_type":"code","source":["zaksum(article,reference,summary,\"drive/Colab Notebooks/Model 5 RL/\")"],"execution_count":0,"outputs":[]},{"metadata":{"id":"mjExkxplhbaU","colab_type":"code","outputId":"3d8e89a6-5622-4f98-9145-bec4e9ea9c63","executionInfo":{"status":"ok","timestamp":1547220405451,"user_tz":-120,"elapsed":1336,"user":{"displayName":"amr zaki","photoUrl":"","userId":"09456039094530776333"}},"colab":{"base_uri":"https://localhost:8080/","height":207}},"cell_type":"code","source":["reference"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["[\"mentally ill inmates in miami are housed on the `` forgotten floor '' judge steven __leifman__ says most are there as a result of `` avoidable felonies '' while cnn tours facility , patient shouts : `` i am the son of the president '' __leifman__ says the system is unjust and he 's fighting for change .\",\n"," \"harry potter star daniel radcliffe gets # 20m fortune as he turns 18 monday . young actor says he has no plans to __fritter__ his cash away . radcliffe 's earnings from first five potter films have been held in trust fund .\",\n"," \"new : `` i thought i was going to die , '' driver says . man says pickup truck was folded in half ; he just has cut on face . driver : `` i probably had a 30 - , __35-foot__ free fall '' minnesota bridge collapsed during rush hour wednesday .\",\n"," \"parents beam with pride , ca n't stop from smiling from outpouring of support . mom : `` i was so happy i did n't know what to do '' burn center in u.s. has offered to provide treatment for reconstructive surgeries . dad says , `` anything for youssif ''\",\n"," \"five small __polyps__ found during procedure ; `` none worrisome , '' spokesman says . president !!__reclaims__!! powers transferred to vice president . bush undergoes routine colonoscopy at camp david .\",\n"," \"new : nfl chief , atlanta falcons owner critical of michael vick 's conduct . nfl suspends falcons quarterback indefinitely without pay . vick admits funding dogfighting operation but says he did not gamble . vick due in federal court monday ; future in nfl remains uncertain .\",\n"," \"aid workers : violence , increased cost of living drive women to prostitution . group is working to raise awareness of the problem with iraq 's political leaders . two iraqi mothers tell cnn they turned to prostitution to help feed their children . `` everything is for the children , '' one woman says .\",\n"," \"president bush says tony snow `` will battle cancer and win '' job of press secretary `` has been a dream for me , '' snow says snow leaving on september 14 , will be succeeded by dana perino .\",\n"," \"president bush to address the veterans of foreign wars on wednesday . bush to say that withdrawing from vietnam emboldened today 's terrorists . speech will be latest white house attempt to try to !!__reframe__!! the debate over iraq .\",\n"," \"new : president bush says he and first lady are deeply saddened by the tragedy . mine safety and health administration chief : we 've run out of options . the six men have been trapped underground since august 6 . seven bore holes drilled into the mountain have found no signs of life .\"]"]},"metadata":{"tags":[]},"execution_count":12}]},{"metadata":{"id":"XUdXqYHKhNhm","colab_type":"code","outputId":"9ec95fc1-07b6-4bc6-ecfc-2408ecdbda80","executionInfo":{"status":"ok","timestamp":1547220373060,"user_tz":-120,"elapsed":1373,"user":{"displayName":"amr zaki","photoUrl":"","userId":"09456039094530776333"}},"colab":{"base_uri":"https://localhost:8080/","height":207}},"cell_type":"code","source":["summary"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["[\"soledad , soledad o'brien takes users inside a jail in covering before trial . he says face drug charges in miami are mentally ill . the ninth . he is well known in miami before trial ill . [unk]\",\n"," 'daniel potter says he has no plans to fritter his cash # 20 million . he insists the money of the world five the world , the young actor he has not been able to be one to touch . [unk]',\n"," '`` the whole bridge from one side of the mississippi to the other just completely gave . `` probably was churning and could move around to the scene . he he had could no way of getting to them . [unk]',\n"," \"5-year-old youssif held his sister 's sister 's hand . he 's parents . he could n't stop in a request . he says he was so happy to held to help burn victims . [unk]\",\n"," \"doctors removed five small polyps from president bush 's colon on saturday , and `` none appeared worrisome , and will resume his activities at camp david colon . the polyps were expected in two to three days . [unk]\",\n"," 'nfl star michael vick is set to appear to appear in court monday . a judge will have the final say on a plea deal . the national football league vick in a plea . [unk]',\n"," 'suha , 37 , is driven to sell to sell food . she says her husband she is cleaning to sell a day . she was cleaning houses , is cleaning houses when she says . [unk]',\n"," \"white house press secretary tony snow will step down from his post on september 14 . `` will `` will sadly accept '' he will `` sadly accept for cancer 's resignation . `` he he 'll be a a solid contributor . [unk]\",\n"," 'president bush will tell will try to put a twist on iraq of the war summit in canada city . bush bush will say will tell members of the veterans of foreign wars in canada . [unk]',\n"," 'isaac is `` no remaining hope in a utah for miners . `` over the past 25 in price , the mine . `` the failed and prayers . `` are no sign of the six miners . [unk]']"]},"metadata":{"tags":[]},"execution_count":11}]},{"metadata":{"id":"8qRwSyz0hN0c","colab_type":"code","colab":{}},"cell_type":"code","source":[""],"execution_count":0,"outputs":[]}]}


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Text Summarization models
  2 | 
  3 | if you are able to endorse me on Arxiv, i would be more than glad https://arxiv.org/auth/endorse?x=FRBB89 thanks 
  4 | This repo is built to collect multiple implementations for abstractive approaches to address text summarization , for different languages (Hindi, Amharic, English, and soon isA Arabic)
  5 | 
  6 | If you found this project helpful please consider citing our work, it would truly mean so much for me
  7 | 
  8 | ```
  9 | @INPROCEEDINGS{9068171,
 10 |   author={A. M. {Zaki} and M. I. {Khalil} and H. M. {Abbas}},
 11 |   booktitle={2019 14th International Conference on Computer Engineering and Systems (ICCES)}, 
 12 |   title={Deep Architectures for Abstractive Text Summarization in Multiple Languages}, 
 13 |   year={2019},
 14 |   volume={},
 15 |   number={},
 16 |   pages={22-27},}
 17 | ```
 18 | 
 19 | ```
 20 | @misc{zaki2020amharic,
 21 |     title={Amharic Abstractive Text Summarization},
 22 |     author={Amr M. Zaki and Mahmoud I. Khalil and Hazem M. Abbas},
 23 |     year={2020},
 24 |     eprint={2003.13721},
 25 |     archivePrefix={arXiv},
 26 |     primaryClass={cs.CL}
 27 | }
 28 | ```
 29 | 
 30 | **it is built to simply run on google colab , in one notebook  so you would only need an internet connection to run these examples without the need to have a powerful machine , so all the code examples would be in a jupiter format , and you don't have to download data to your device as we connect these jupiter notebooks to google drive**
 31 | 
 32 | - **Arabic Summarization** Model using the corner stone implemtnation (seq2seq using Bidirecional LSTM Encoder and attention in the decoder) for summarizing Arabic news
 33 | -  **implementation A** Corner stone seq2seq with attention (using bidirectional ltsm ) , three different models for this implemntation 
 34 | -  **implementation B** seq2seq with pointer genrator model
 35 | -  **implementation C** seq2seq with reinforcement learning
 36 | 
 37 | # Blogs
 38 | This repo has been explained in a series of Blogs
 39 | - to understand how to work with google colab eco system , and how to integrate it with your google drive , this blog can prove useful [DeepLearning Free Ecosystem](https://hackernoon.com/begin-your-deep-learning-project-for-free-free-gpu-processing-free-storage-free-easy-upload-b4dba18abebc "DeepLearning Free Ecosystem")
 40 | - **Tutorial 1** [ Overview on the different appraches used for abstractive text summarization](https://hackernoon.com/text-summarizer-using-deep-learning-made-easy-490880df6cd?source=post_stats_page--------------------------- "Overview on  abstractive text summarization")
 41 | - **Tutorial 2**  [ How to represent text for our text summarization task ](https://hackernoon.com/abstractive-text-summarization-tutorial-2-text-representation-made-very-easy-ef4511a1a46?source=post_stats_page--------------------------- "text represneataion for text summarization")
 42 | - **Tutorial 3**  [ What seq2seq and why do we use it in text summarization ](https://hackernoon.com/tutorial-3-what-is-seq2seq-for-text-summarization-and-why-68ebaa644db0?source=post_stats_page--------------------------- "What and why seq2seq")
 43 | - **Tutorial 4** [Multilayer Bidirectional Lstm/Gru for text summarization](https://medium.com/@theamrzaki/multilayer-bidirectional-lstm-gru-for-text-summarization-made-easy-tutorial-4-a63db108b44f)
 44 | - **Tutorial 5** [Beam Search & Attention for text summarization](https://medium.com/@theamrzaki/beam-search-attention-for-text-summarization-made-easy-tutorial-5-3b7186df7086)
 45 | - **Tutorial 6** [Build an Abstractive Text Summarizer in 94 Lines of Tensorflow](http://bit.ly/2ZeEmvO)
 46 | - **Tutorial 7** [Pointer generator for combination of Abstractive & Extractive methods for Text Summarization](http://bit.ly/2EhcRIZ)
 47 | - **Tutorial 8** [Teach seq2seq models to learn from their mistakes using deep curriculum learning](http://bit.ly/2My51kX)
 48 | - **Tutorial 9** [Deep Reinforcement Learning (DeepRL) for Abstractive Text Summarization made easy](http://bit.ly/2MDlUHC)
 49 | - **Tutorial 10** [Hindi Text Summarization](https://medium.com/analytics-vidhya/hindi-abstractive-text-summarization-tutorial-10-eac471bdafad)
 50 | ---------------------------------------------------------------------------------
 51 | 
 52 | Try out this text summarization through [this website (eazymind)](http://bit.ly/2VxhPqU) ,
 53 | ![eazymind](https://scontent.fcai3-1.fna.fbcdn.net/v/t1.0-9/60785552_445522029607880_7282873905209933824_o.jpg?_nc_cat=101&_nc_ht=scontent.fcai3-1.fna&oh=927d1fae6521813b3d6e7a7d7a5b01aa&oe=5D5C3AD5) which enables you to summarize your text through
 54 | - curl call
 55 | ```
 56 | curl -X POST 
 57 | http://eazymind.herokuapp.com/arabic_sum/eazysum
 58 | -H 'cache-control: no-cache' 
 59 | -H 'content-type: application/x-www-form-urlencoded' 
 60 | -d "eazykey={eazymind api key}&sentence={your sentence to be summarized}"
 61 | ```
 62 | - python package ([pip install eazymind](http://bit.ly/2Ef5XnS))
 63 | 	```pip install eazymind```
 64 | 	
 65 | ```
 66 | from eazymind.nlp.eazysum import Summarizer
 67 | 
 68 | #---key from eazymind website---
 69 | key = "xxxxxxxxxxxxxxxxxxxxx"
 70 | 
 71 | #---sentence to be summarized---
 72 | sentence = """(CNN)The White House has instructed former
 73 |     White House Counsel Don McGahn not to comply with a subpoena
 74 |     for documents from House Judiciary Chairman Jerry Nadler, 
 75 |     teeing up the latest in a series of escalating oversight 
 76 |     showdowns between the Trump administration and congressional Democrats."""
 77 |     
 78 | summarizer = Summarizer(key)
 79 | print(summarizer.run(sentence))
 80 | ```
 81 | 
 82 | ---------------------------------------------------------------------------------
 83 | 
 84 | ## Implementation A (seq2seq with attention and feature rich representation)
 85 | contains 3 different models that implements the concept of hving a seq2seq network with attention 
 86 | also adding concepts like having a feature rich word representation 
 87 | This work is a continuation of these amazing repos
 88 | 
 89 | ### Model 1 
 90 | is a modification on of David Currie's https://github.com/Currie32/Text-Summarization-with-Amazon-Reviews seq2seq 
 91 | 
 92 | ### Model 2 
 93 | #### 1- Model_2/Model_2.ipynb
 94 | a modification to https://github.com/dongjun-Lee/text-summarization-tensorflow 
 95 | #### 2- Model_2/Model 2 features(tf-idf , pos tags).ipynb
 96 | a modification to Model 2.ipynb by using concepts from http://www.aclweb.org/anthology/K16-1028
 97 | #### Results
 98 | A folder contains the results of both the 2 models , from validation text samples 
 99 | in a zaksum format , which is combining all of 
100 | - bleu
101 | - rouge_1
102 | - rouge_2
103 | - rouge_L
104 | - rouge_be
105 | for each sentence , and average of all of them
106 | 
107 | ### Model 3
108 | a modification to https://github.com/thomasschmied/Text_Summarization_with_Tensorflow/blob/master/summarizer_amazon_reviews.ipynb
109 | 
110 | 		
111 | ---------------------------------------------------------------------------------
112 | 
113 | ## Implementation B (Pointer Generator seq2seq network)
114 | it is a continuation of the amazing work of
115 | 	https://github.com/abisee/pointer-generator
116 | 	https://arxiv.org/abs/1704.04368
117 | this implementation uses the concept of having a pointer generator network to diminish some problems that appears with the normal 
118 | seq2seq network
119 | 	
120 | ### Model_4_generator_.ipynb
121 | uses a pointer generator with seq2seq with attention 
122 | it is built using python2.7
123 | ### zaksum_eval.ipynb
124 | built by python3 for evaluation
125 | ### Results/Pointer Generator
126 | - output from generator (article / reference / summary) used as input to the zaksum_eval.ipynb
127 | - result from zaksum_eval
128 | 	
129 | 	
130 | i will still work on their implementation of coverage mechanism , so much work is yet to come if God wills it isA
131 | 
132 | ---------------------------------------------------------------------------------
133 | ## Implementation C (Reinforcement Learning For Sequence to Sequence )
134 | 
135 | this implementation is a continuation of the amazing work done by
136 | https://github.com/yaserkl/RLSeq2Seq
137 | https://arxiv.org/abs/1805.09461
138 | 
139 | ```
140 | @article{keneshloo2018deep,
141 |  title={Deep Reinforcement Learning For Sequence to Sequence Models},
142 |  author={Keneshloo, Yaser and Shi, Tian and Ramakrishnan, Naren and Reddy, Chandan K.},
143 |  journal={arXiv preprint arXiv:1805.09461},
144 |  year={2018}
145 | }
146 | ```
147 | 
148 | 
149 | ### Model 5 RL
150 | this is a library for building multiple approaches using Reinforcement Learning with seq2seq , i have gathered their code to run in a jupiter notebook , and to access google drive 
151 | built for python 2.7
152 | 
153 | ### zaksum_eval.ipynb
154 | built by python3 for evaluation
155 | 
156 | ### Results/Reinforcement Learning
157 | - output from Model 5 RL used as input to the zaksum_eval.ipynb
158 | 
159 | 


--------------------------------------------------------------------------------