├── LabelMyTextWidget
    ├── __init__.py
    └── LabelMyTextWidget.py
├── LabelMyTextWidget.gif
├── README.md
└── Example using the LabelMyText widget.ipynb


/LabelMyTextWidget/__init__.py:
--------------------------------------------------------------------------------
1 | from .LabelMyTextWidget import *


--------------------------------------------------------------------------------
/LabelMyTextWidget.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tchambon/LabelMyTextWidget/HEAD/LabelMyTextWidget.gif


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # LabelMyTextWidget
 2 | 
 3 | LabelMyTextWidget is an IPython widget (built on ipywidgets) for quickly labeling text data.
 4 | 
 5 | The goal of this project is to speed up and simplify the process of manual labeling.
 6 | 
 7 | The widget provides a minimalist interface to review each row of a pandas dataframe and apply a label to it.
 8 | 
 9 | ![GIF showing the widget](https://github.com/tchambon/LabelMyTextWidget/blob/master/LabelMyTextWidget.gif "GIF Widget")
10 | 
11 | 
12 | ## How to use the widget
13 | 
14 | A Jupyter notebook is at the root of the repository, and it contains an example.
15 | 
16 | ## How to install it
17 | 
18 | For the time being, you have to download the folder LabelMyTextWidget and put it in your project folder.
19 | Better installation options and packaging will come soon.
20 | 
21 | 
22 | ## Dependencies
23 | 
24 | A Python environment with Jupyter Notebook, pandas and numpy is required.
25 | The easiest option is to use the Anaconda distribution with Python 3.
26 | 
27 | Besides that, the following should be installed:
28 | 
29 | ```shell
30 | pip install ipywidgets
31 | 
32 | jupyter nbextension enable --py widgetsnbextension
33 | ```


--------------------------------------------------------------------------------
/LabelMyTextWidget/LabelMyTextWidget.py:
--------------------------------------------------------------------------------
  1 | import ipywidgets as widgets
  2 | from IPython.display import display
  3 | from ipywidgets import Button, HBox, VBox
  4 | import pandas as pd
  5 | import numpy as np
  6 | 
  7 | 
  8 | __all__ = ['LabelMyTextWidget']
  9 | 
 10 | class LabelMyTextWidget:
 11 |     """ 
 12 |     Widget to quickly label a dataframe containing a column with text data and a column with target labels.
 13 |     
 14 |     """
 15 |     def __init__(self, df_source, content_column, class_list, class_id, output_column, unclassified_value=-1, randomize=False):
 16 |         """
 17 |         Create a LabelMyTextWidget object.
 18 |  
 19 |         :param df_source: The pandas dataframe containing the data column and the label column (to fill by the widget)
 20 |         :param content_column: The name of the column containing the text data to label
 21 |         :param class_list: List of the label type names (ex: Positive, Negative, Neutral)
 22 |         :param class_id: The id of each label type
 23 |         :param output_column: Name of the column to complete with labels
 24 |         :param unclassified_value: Value of the unclassified rows for the output_column
 25 |         :param randomize: If true, the labeling order will be random. If False, it will follow the index numbers
 26 | 
 27 |  
 28 |         :Example:
 29 |         >>> df['text'] = 'example of text content to label'
 30 |         >>> df['label'] = -1
 31 |         >>> LabelMyText(df, 'text', ['positive', 'negative'], [1, 0], 'label', unclassified_value=-1)
 32 |         """
 33 |         
 34 |         self.df_source = df_source
 35 |         self.content_column = content_column
 36 |         self.output_column = output_column
 37 |         self.unclassified_value = unclassified_value
 38 |         self.randomize = randomize
 39 |         
 40 |         
 41 |         self.items = [ButtonLabeling(class_id = class_id[i],description=l) for i, l in enumerate(class_list)]
 42 |         self.items.append(ButtonLabeling(class_id = unclassified_value, description='Skip', button_style='warning'))
 43 | 
 44 |         for button in self.items:
 45 |             button.on_click(self.on_button_clicked_t)
 46 | 
 47 | 
 48 | 
 49 |         self.out = widgets.Output(layout={'border': '1px solid black'})
 50 |         self.out.append_stderr('Text is coming here')
 51 | 
 52 |         button_box = HBox([widgets.Label(value="Label"), *self.items])
 53 | 
 54 |         self.box = VBox([button_box, self.out])  
 55 |         
 56 |         self.df_explore = self.df_source[self.df_source[output_column] == unclassified_value].index
 57 |         self.cursor = 0
 58 |         
 59 |         if randomize:
 60 |             self.df_explore = np.random.permutation(self.df_explore)
 61 |             
 62 |         self.out.clear_output(wait=True)
 63 |         
 64 |         
 65 |     def display(self):
 66 |         """
 67 |         Display the widget
 68 |         """
 69 |         display(self.box)
 70 |         self.display_next_row()
 71 |         
 72 |     def on_button_clicked_t(self, b):
 73 |         #print(f"TEST Button clicked: {b.description}, {b.class_id}")
 74 |         if (self.cursor) <= len(self.df_explore) and len(self.df_explore) > 0: 
 75 |             self.df_source.loc[self.df_explore[self.cursor - 1], self.output_column] = b.class_id
 76 |         self.display_next_row()
 77 |         
 78 |     def display_next_row(self):
 79 |         #pdb.set_trace()
 80 |         if (self.cursor) >= len(self.df_explore): 
 81 |             with self.out:
 82 |                 self.out.clear_output()
 83 |                 print('Finished: All rows have been processed')
 84 |             return
 85 |         
 86 |         next_text = str(self.df_source[self.content_column].loc[self.df_explore[self.cursor]])
 87 |         with self.out:
 88 |             print(f'Row index: {self.df_explore[self.cursor]} | Number of rows processed : {self.cursor} \n')
 89 |             print(f'{next_text}')
 90 |         
 91 |         
 92 |         self.cursor += 1
 93 |         self.out.clear_output(wait=True)
 94 |       
 95 |             
 96 |         
 97 |  
 98 | 
 99 | class ButtonLabeling(Button):
100 |     def __init__(self, class_id, *args, **kwargs):
101 |         self.class_id = class_id
102 |         super().__init__(*args, **kwargs)
103 |     


--------------------------------------------------------------------------------
/Example using the LabelMyText widget.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "ExecuteTime": {
  8 |      "end_time": "2019-01-30T14:37:01.337452Z",
  9 |      "start_time": "2019-01-30T14:37:01.159269Z"
 10 |     }
 11 |    },
 12 |    "outputs": [],
 13 |    "source": [
 14 |     "import pandas as pd\n",
 15 |     "import numpy as np\n",
 16 |     "from LabelMyTextWidget import *"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "code",
 21 |    "execution_count": 2,
 22 |    "metadata": {
 23 |     "ExecuteTime": {
 24 |      "end_time": "2019-01-30T14:37:01.341053Z",
 25 |      "start_time": "2019-01-30T14:37:01.338560Z"
 26 |     }
 27 |    },
 28 |    "outputs": [],
 29 |    "source": [
 30 |     "df_test = pd.DataFrame(data = ['Sample text 1', 'sample text 2', 'sample text 3'], columns=['text'])"
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "code",
 35 |    "execution_count": 3,
 36 |    "metadata": {
 37 |     "ExecuteTime": {
 38 |      "end_time": "2019-01-30T14:37:01.357705Z",
 39 |      "start_time": "2019-01-30T14:37:01.342155Z"
 40 |     }
 41 |    },
 42 |    "outputs": [
 43 |     {
 44 |      "data": {
 45 |       "text/html": [
 46 |        "<div>\n",
 47 |        "<style scoped>\n",
 48 |        "    .dataframe tbody tr th:only-of-type {\n",
 49 |        "        vertical-align: middle;\n",
 50 |        "    }\n",
 51 |        "\n",
 52 |        "    .dataframe tbody tr th {\n",
 53 |        "        vertical-align: top;\n",
 54 |        "    }\n",
 55 |        "\n",
 56 |        "    .dataframe thead th {\n",
 57 |        "        text-align: right;\n",
 58 |        "    }\n",
 59 |        "</style>\n",
 60 |        "<table border=\"1\" class=\"dataframe\">\n",
 61 |        "  <thead>\n",
 62 |        "    <tr style=\"text-align: right;\">\n",
 63 |        "      <th></th>\n",
 64 |        "      <th>text</th>\n",
 65 |        "    </tr>\n",
 66 |        "  </thead>\n",
 67 |        "  <tbody>\n",
 68 |        "    <tr>\n",
 69 |        "      <th>0</th>\n",
 70 |        "      <td>Sample text 1</td>\n",
 71 |        "    </tr>\n",
 72 |        "    <tr>\n",
 73 |        "      <th>1</th>\n",
 74 |        "      <td>sample text 2</td>\n",
 75 |        "    </tr>\n",
 76 |        "    <tr>\n",
 77 |        "      <th>2</th>\n",
 78 |        "      <td>sample text 3</td>\n",
 79 |        "    </tr>\n",
 80 |        "  </tbody>\n",
 81 |        "</table>\n",
 82 |        "</div>"
 83 |       ],
 84 |       "text/plain": [
 85 |        "            text\n",
 86 |        "0  Sample text 1\n",
 87 |        "1  sample text 2\n",
 88 |        "2  sample text 3"
 89 |       ]
 90 |      },
 91 |      "execution_count": 3,
 92 |      "metadata": {},
 93 |      "output_type": "execute_result"
 94 |     }
 95 |    ],
 96 |    "source": [
 97 |     "df_test"
 98 |    ]
 99 |   },
100 |   {
101 |    "cell_type": "markdown",
102 |    "metadata": {},
103 |    "source": [
104 |     "The goal of the widget is to help with labeling a dataframe containing text data.\n",
105 |     "The widget will fill the labeling column with the selected class (ex: Positive, Negative for sentiment analysis).\n",
106 |     "\n",
107 |     "The label column should be created with a default unclassified value (to be able to keep track of unlabeled rows). \n",
108 |     "\n",
109 |     "In this example, we will have the following class values :\n",
110 |     "\n",
111 |     "- Unclassified value is -1\n",
112 |     "- Negative class value is 0\n",
113 |     "- Positive class value is 1"
114 |    ]
115 |   },
116 |   {
117 |    "cell_type": "code",
118 |    "execution_count": 4,
119 |    "metadata": {
120 |     "ExecuteTime": {
121 |      "end_time": "2019-01-30T14:37:01.361026Z",
122 |      "start_time": "2019-01-30T14:37:01.358757Z"
123 |     }
124 |    },
125 |    "outputs": [],
126 |    "source": [
127 |     "df_test.loc[:, 'label'] = -1"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": 5,
133 |    "metadata": {
134 |     "ExecuteTime": {
135 |      "end_time": "2019-01-30T14:37:01.371027Z",
136 |      "start_time": "2019-01-30T14:37:01.362607Z"
137 |     }
138 |    },
139 |    "outputs": [
140 |     {
141 |      "data": {
142 |       "text/html": [
143 |        "<div>\n",
144 |        "<style scoped>\n",
145 |        "    .dataframe tbody tr th:only-of-type {\n",
146 |        "        vertical-align: middle;\n",
147 |        "    }\n",
148 |        "\n",
149 |        "    .dataframe tbody tr th {\n",
150 |        "        vertical-align: top;\n",
151 |        "    }\n",
152 |        "\n",
153 |        "    .dataframe thead th {\n",
154 |        "        text-align: right;\n",
155 |        "    }\n",
156 |        "</style>\n",
157 |        "<table border=\"1\" class=\"dataframe\">\n",
158 |        "  <thead>\n",
159 |        "    <tr style=\"text-align: right;\">\n",
160 |        "      <th></th>\n",
161 |        "      <th>text</th>\n",
162 |        "      <th>label</th>\n",
163 |        "    </tr>\n",
164 |        "  </thead>\n",
165 |        "  <tbody>\n",
166 |        "    <tr>\n",
167 |        "      <th>0</th>\n",
168 |        "      <td>Sample text 1</td>\n",
169 |        "      <td>-1</td>\n",
170 |        "    </tr>\n",
171 |        "    <tr>\n",
172 |        "      <th>1</th>\n",
173 |        "      <td>sample text 2</td>\n",
174 |        "      <td>-1</td>\n",
175 |        "    </tr>\n",
176 |        "    <tr>\n",
177 |        "      <th>2</th>\n",
178 |        "      <td>sample text 3</td>\n",
179 |        "      <td>-1</td>\n",
180 |        "    </tr>\n",
181 |        "  </tbody>\n",
182 |        "</table>\n",
183 |        "</div>"
184 |       ],
185 |       "text/plain": [
186 |        "            text  label\n",
187 |        "0  Sample text 1     -1\n",
188 |        "1  sample text 2     -1\n",
189 |        "2  sample text 3     -1"
190 |       ]
191 |      },
192 |      "execution_count": 5,
193 |      "metadata": {},
194 |      "output_type": "execute_result"
195 |     }
196 |    ],
197 |    "source": [
198 |     "df_test"
199 |    ]
200 |   },
201 |   {
202 |    "cell_type": "code",
203 |    "execution_count": 6,
204 |    "metadata": {
205 |     "ExecuteTime": {
206 |      "end_time": "2019-01-30T14:37:01.406751Z",
207 |      "start_time": "2019-01-30T14:37:01.372094Z"
208 |     }
209 |    },
210 |    "outputs": [
211 |     {
212 |      "data": {
213 |       "application/vnd.jupyter.widget-view+json": {
214 |        "model_id": "bcec871543cc42e1bac6cc4873e37ec9",
215 |        "version_major": 2,
216 |        "version_minor": 0
217 |       },
218 |       "text/plain": [
219 |        "VBox(children=(HBox(children=(Label(value='Label'), ButtonLabeling(description='Negative', style=ButtonStyle()…"
220 |       ]
221 |      },
222 |      "metadata": {},
223 |      "output_type": "display_data"
224 |     }
225 |    ],
226 |    "source": [
227 |     "w = LabelMyTextWidget(df_test, 'text', ['Negative', 'Positive'], [0, 1], 'label', unclassified_value=-1)\n",
228 |     "w.display()"
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "markdown",
233 |    "metadata": {},
234 |    "source": [
235 |     "After having labeled the first two rows with positive class, we get the following data:"
236 |    ]
237 |   },
238 |   {
239 |    "cell_type": "code",
240 |    "execution_count": 8,
241 |    "metadata": {
242 |     "ExecuteTime": {
243 |      "end_time": "2019-01-30T14:37:05.452663Z",
244 |      "start_time": "2019-01-30T14:37:05.448082Z"
245 |     }
246 |    },
247 |    "outputs": [
248 |     {
249 |      "data": {
250 |       "text/html": [
251 |        "<div>\n",
252 |        "<style scoped>\n",
253 |        "    .dataframe tbody tr th:only-of-type {\n",
254 |        "        vertical-align: middle;\n",
255 |        "    }\n",
256 |        "\n",
257 |        "    .dataframe tbody tr th {\n",
258 |        "        vertical-align: top;\n",
259 |        "    }\n",
260 |        "\n",
261 |        "    .dataframe thead th {\n",
262 |        "        text-align: right;\n",
263 |        "    }\n",
264 |        "</style>\n",
265 |        "<table border=\"1\" class=\"dataframe\">\n",
266 |        "  <thead>\n",
267 |        "    <tr style=\"text-align: right;\">\n",
268 |        "      <th></th>\n",
269 |        "      <th>text</th>\n",
270 |        "      <th>label</th>\n",
271 |        "    </tr>\n",
272 |        "  </thead>\n",
273 |        "  <tbody>\n",
274 |        "    <tr>\n",
275 |        "      <th>0</th>\n",
276 |        "      <td>Sample text 1</td>\n",
277 |        "      <td>1</td>\n",
278 |        "    </tr>\n",
279 |        "    <tr>\n",
280 |        "      <th>1</th>\n",
281 |        "      <td>sample text 2</td>\n",
282 |        "      <td>1</td>\n",
283 |        "    </tr>\n",
284 |        "    <tr>\n",
285 |        "      <th>2</th>\n",
286 |        "      <td>sample text 3</td>\n",
287 |        "      <td>-1</td>\n",
288 |        "    </tr>\n",
289 |        "  </tbody>\n",
290 |        "</table>\n",
291 |        "</div>"
292 |       ],
293 |       "text/plain": [
294 |        "            text  label\n",
295 |        "0  Sample text 1      1\n",
296 |        "1  sample text 2      1\n",
297 |        "2  sample text 3     -1"
298 |       ]
299 |      },
300 |      "execution_count": 8,
301 |      "metadata": {},
302 |      "output_type": "execute_result"
303 |     }
304 |    ],
305 |    "source": [
306 |     "df_test"
307 |    ]
308 |   },
309 |   {
310 |    "cell_type": "markdown",
311 |    "metadata": {
312 |     "ExecuteTime": {
313 |      "end_time": "2019-01-25T15:28:36.913683Z",
314 |      "start_time": "2019-01-25T15:28:36.908965Z"
315 |     }
316 |    },
317 |    "source": [
318 |     "We can then save the dataset which is now partially labeled. If we run the widget again, we will be able to finish labeling on the rows with label value -1."
319 |    ]
320 |   }
321 |  ],
322 |  "metadata": {
323 |   "kernelspec": {
324 |    "display_name": "Python 3",
325 |    "language": "python",
326 |    "name": "python3"
327 |   },
328 |   "language_info": {
329 |    "codemirror_mode": {
330 |     "name": "ipython",
331 |     "version": 3
332 |    },
333 |    "file_extension": ".py",
334 |    "mimetype": "text/x-python",
335 |    "name": "python",
336 |    "nbconvert_exporter": "python",
337 |    "pygments_lexer": "ipython3",
338 |    "version": "3.6.5"
339 |   }
340 |  },
341 |  "nbformat": 4,
342 |  "nbformat_minor": 2
343 | }
344 | 


--------------------------------------------------------------------------------