├── LabelMyTextWidget ├── __init__.py └── LabelMyTextWidget.py ├── LabelMyTextWidget.gif ├── README.md └── Example using the LabelMyText widget.ipynb /LabelMyTextWidget/__init__.py: -------------------------------------------------------------------------------- 1 | from .LabelMyTextWidget import * -------------------------------------------------------------------------------- /LabelMyTextWidget.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tchambon/LabelMyTextWidget/HEAD/LabelMyTextWidget.gif -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LabelMyTextWidget 2 | 3 | LabelMyTextWidget is an IPython widget to quickly label text data. 4 | 5 | The goal of this project is to speed up and simplify the process of manual labeling. 6 | 7 | The widget provides a minimalist interface to review each row of a pandas dataframe and apply a label to it. 8 | 9 | ![GIF showing the widget](https://github.com/tchambon/LabelMyTextWidget/blob/master/LabelMyTextWidget.gif "GIF Widget") 10 | 11 | 12 | ## How to use the widget 13 | 14 | A Jupyter notebook is at the root of the repository, and it contains an example. 15 | 16 | ## How to install it 17 | 18 | For the time being, you have to download the folder LabelMyTextWidget and put it in your project folder. 19 | Better installation options and packaging will come soon. 20 | 21 | 22 | ## Dependencies 23 | 24 | A Python environment with Jupyter notebook, pandas and numpy is required. 25 | The best option is to use the Anaconda distribution with Python 3. 
import ipywidgets as widgets
from IPython.display import display
from ipywidgets import Button, HBox, VBox
import pandas as pd
import numpy as np


__all__ = ['LabelMyTextWidget']


class LabelMyTextWidget:
    """
    Widget to quickly label a dataframe containing a column with text data and a column with target labels.

    One button is created per entry of ``class_list`` plus a 'Skip' button.
    Each click writes the clicked button's ``class_id`` into ``output_column``
    for the row currently shown, then shows the next row to label. The 'Skip'
    button carries ``unclassified_value``, so a skipped row keeps that value.
    """

    def __init__(self, df_source, content_column, class_list, class_id, output_column, unclassified_value=-1, randomize=False):
        """
        Create a LabelMyTextWidget object.

        :param df_source: The pandas dataframe containing the data column and the label column (to fill by the widget)
        :param content_column: The name of the column containing the text data to label
        :param class_list: List of the label type names (ex: Positive, Negative, Neutral)
        :param class_id: The id of each label type, in the same order as class_list
        :param output_column: Name of the column to complete with labels
        :param unclassified_value: Value of the unclassified rows for the output_column
        :param randomize: If true, the labeling order will be random. If False, it will follow the index numbers


        :Example:
        >>> df['text'] = 'example of text content to label'
        >>> df['label'] = -1
        >>> w = LabelMyTextWidget(df, 'text', ['positive', 'negative'], [1, 0], 'label', unclassified_value=-1)
        >>> w.display()
        """

        self.df_source = df_source
        self.content_column = content_column
        self.output_column = output_column
        self.unclassified_value = unclassified_value
        self.randomize = randomize

        # One button per label, followed by a 'Skip' button whose id is the
        # unclassified value (so skipping leaves the row unlabeled).
        self.items = [ButtonLabeling(class_id=class_id[i], description=l) for i, l in enumerate(class_list)]
        self.items.append(ButtonLabeling(class_id=unclassified_value, description='Skip', button_style='warning'))

        for button in self.items:
            button.on_click(self.on_button_clicked_t)

        self.out = widgets.Output(layout={'border': '1px solid black'})
        self.out.append_stderr('Text is coming here')

        button_box = HBox([widgets.Label(value="Label"), *self.items])

        self.box = VBox([button_box, self.out])

        # Index labels of the rows that still hold the unclassified value;
        # these are the rows the widget walks through.
        self.df_explore = self.df_source[self.df_source[output_column] == unclassified_value].index
        # Position in df_explore of the NEXT row to show.
        self.cursor = 0
        # True only while a row is on screen waiting for a label. Guards
        # against clicks that arrive before the first row is shown or after
        # the last row has been processed.
        self._row_pending = False

        if randomize:
            self.df_explore = np.random.permutation(self.df_explore)

        # wait=True defers the clear until the next output is produced.
        self.out.clear_output(wait=True)

    def display(self):
        """
        Display the widget
        """
        display(self.box)
        self.display_next_row()

    def on_button_clicked_t(self, b):
        """
        Button callback: store the clicked label on the row on screen, then advance.

        The label is written only while a row is actually displayed.
        Previously the guard ``cursor <= len(df_explore)`` stayed true once
        the 'Finished' message was shown, so every further click overwrote
        the last row's label; and a click before any row was shown wrote to
        ``df_explore[-1]``.

        :param b: The clicked ButtonLabeling; its class_id is the value written to output_column
        """
        shown = self.cursor - 1  # display_next_row() already advanced the cursor
        if self._row_pending and 0 <= shown < len(self.df_explore):
            self.df_source.loc[self.df_explore[shown], self.output_column] = b.class_id
        self.display_next_row()

    def display_next_row(self):
        """
        Show the row at the cursor in the output area and advance the cursor.

        When every row of df_explore has been shown, prints the 'Finished'
        message instead and marks that no row is waiting for a label.
        """
        if self.cursor >= len(self.df_explore):
            self._row_pending = False
            with self.out:
                self.out.clear_output()
                print('Finished: All rows have been processed')
            return

        row_index = self.df_explore[self.cursor]
        next_text = str(self.df_source[self.content_column].loc[row_index])
        with self.out:
            print(f'Row index: {row_index} | Number of rows processed : {self.cursor} \n')
            print(f'{next_text}')

        self._row_pending = True
        self.cursor += 1
        # Deferred clear: the text just printed stays visible until the next
        # output replaces it.
        self.out.clear_output(wait=True)


class ButtonLabeling(Button):
    """Button that carries the label id (``class_id``) it stands for."""

    def __init__(self, class_id, *args, **kwargs):
        """
        :param class_id: Value written to the output column when this button is clicked
        :param args: Positional arguments forwarded to ipywidgets.Button
        :param kwargs: Keyword arguments forwarded to ipywidgets.Button (ex: description, button_style)
        """
        self.class_id = class_id
        super().__init__(*args, **kwargs)
\n", 47 | "\n", 60 | "\n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | "
text
0Sample text 1
1sample text 2
2sample text 3
\n", 82 | "
" 83 | ], 84 | "text/plain": [ 85 | " text\n", 86 | "0 Sample text 1\n", 87 | "1 sample text 2\n", 88 | "2 sample text 3" 89 | ] 90 | }, 91 | "execution_count": 3, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | "source": [ 97 | "df_test" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "The widget goal is to help the labeling process of a dataframe containing text data.\n", 105 | "The widget will fill the labeling column with the selected class (ex: Positive, Negative for sentiment analysis).\n", 106 | "\n", 107 | "The label column should be created with a default unclassified value (to be able to keep track of unlabeled rows). \n", 108 | "\n", 109 | "In this example, we will have the following class values :\n", 110 | "\n", 111 | "- Unclassified value is -1\n", 112 | "- Negative class value is 0\n", 113 | "- Positive class value is 1" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 4, 119 | "metadata": { 120 | "ExecuteTime": { 121 | "end_time": "2019-01-30T14:37:01.361026Z", 122 | "start_time": "2019-01-30T14:37:01.358757Z" 123 | } 124 | }, 125 | "outputs": [], 126 | "source": [ 127 | "df_test.loc[:, 'label'] = -1" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 5, 133 | "metadata": { 134 | "ExecuteTime": { 135 | "end_time": "2019-01-30T14:37:01.371027Z", 136 | "start_time": "2019-01-30T14:37:01.362607Z" 137 | } 138 | }, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/html": [ 143 | "
\n", 144 | "\n", 157 | "\n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | "
textlabel
0Sample text 1-1
1sample text 2-1
2sample text 3-1
\n", 183 | "
" 184 | ], 185 | "text/plain": [ 186 | " text label\n", 187 | "0 Sample text 1 -1\n", 188 | "1 sample text 2 -1\n", 189 | "2 sample text 3 -1" 190 | ] 191 | }, 192 | "execution_count": 5, 193 | "metadata": {}, 194 | "output_type": "execute_result" 195 | } 196 | ], 197 | "source": [ 198 | "df_test" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 6, 204 | "metadata": { 205 | "ExecuteTime": { 206 | "end_time": "2019-01-30T14:37:01.406751Z", 207 | "start_time": "2019-01-30T14:37:01.372094Z" 208 | } 209 | }, 210 | "outputs": [ 211 | { 212 | "data": { 213 | "application/vnd.jupyter.widget-view+json": { 214 | "model_id": "bcec871543cc42e1bac6cc4873e37ec9", 215 | "version_major": 2, 216 | "version_minor": 0 217 | }, 218 | "text/plain": [ 219 | "VBox(children=(HBox(children=(Label(value='Label'), ButtonLabeling(description='Negative', style=ButtonStyle()…" 220 | ] 221 | }, 222 | "metadata": {}, 223 | "output_type": "display_data" 224 | } 225 | ], 226 | "source": [ 227 | "w = LabelMyTextWidget(df_test, 'text', ['Negative', 'Positive'], [0, 1], 'label', unclassified_value=-1)\n", 228 | "w.display()" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "After having labeled the first two rows with positive class, we get the following data:" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 8, 241 | "metadata": { 242 | "ExecuteTime": { 243 | "end_time": "2019-01-30T14:37:05.452663Z", 244 | "start_time": "2019-01-30T14:37:05.448082Z" 245 | } 246 | }, 247 | "outputs": [ 248 | { 249 | "data": { 250 | "text/html": [ 251 | "
\n", 252 | "\n", 265 | "\n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | "
textlabel
0Sample text 11
1sample text 21
2sample text 3-1
\n", 291 | "
" 292 | ], 293 | "text/plain": [ 294 | " text label\n", 295 | "0 Sample text 1 1\n", 296 | "1 sample text 2 1\n", 297 | "2 sample text 3 -1" 298 | ] 299 | }, 300 | "execution_count": 8, 301 | "metadata": {}, 302 | "output_type": "execute_result" 303 | } 304 | ], 305 | "source": [ 306 | "df_test" 307 | ] 308 | }, 309 | { 310 | "cell_type": "markdown", 311 | "metadata": { 312 | "ExecuteTime": { 313 | "end_time": "2019-01-25T15:28:36.913683Z", 314 | "start_time": "2019-01-25T15:28:36.908965Z" 315 | } 316 | }, 317 | "source": [ 318 | "We can then save the dataset which is now partialy labeled. If we run the widget again, we will be able to finish labeling on the rows with label value -1." 319 | ] 320 | } 321 | ], 322 | "metadata": { 323 | "kernelspec": { 324 | "display_name": "Python 3", 325 | "language": "python", 326 | "name": "python3" 327 | }, 328 | "language_info": { 329 | "codemirror_mode": { 330 | "name": "ipython", 331 | "version": 3 332 | }, 333 | "file_extension": ".py", 334 | "mimetype": "text/x-python", 335 | "name": "python", 336 | "nbconvert_exporter": "python", 337 | "pygments_lexer": "ipython3", 338 | "version": "3.6.5" 339 | } 340 | }, 341 | "nbformat": 4, 342 | "nbformat_minor": 2 343 | } 344 | --------------------------------------------------------------------------------