├── LabelMyTextWidget
├── __init__.py
└── LabelMyTextWidget.py
├── LabelMyTextWidget.gif
├── README.md
└── Example using the LabelMyText widget.ipynb
/LabelMyTextWidget/__init__.py:
--------------------------------------------------------------------------------
1 | from .LabelMyTextWidget import *
--------------------------------------------------------------------------------
/LabelMyTextWidget.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tchambon/LabelMyTextWidget/HEAD/LabelMyTextWidget.gif
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # LabelMyTextWidget
2 |
3 | LabelMyTextWidget is an IPython widget (built on ipywidgets) for quickly labeling text data.
4 |
5 | The goal of this project is to speed up and simplify the process of manual labeling.
6 |
7 | The widget provides a minimalist interface to review each row of a pandas dataframe and apply a label to it.
8 |
9 | 
10 |
11 |
12 | ## How to use the widget
13 |
14 | A Jupyter notebook is at the root of the repository, and it contains an example.
15 |
16 | ## How to install it
17 |
18 | For the time being, you have to download the folder LabelMyTextWidget and put it in your project folder.
19 | Better installation options and packaging will come soon.
20 |
21 |
22 | ## Dependencies
23 |
24 | A Python environment with Jupyter notebook, pandas and numpy is required.
25 | The easiest option is to use the Anaconda distribution with Python 3.
26 |
27 | Besides that, the following should be installed:
28 |
29 | ```shell
30 | pip install ipywidgets
31 |
32 | jupyter nbextension enable --py widgetsnbextension
33 | ```
--------------------------------------------------------------------------------
/LabelMyTextWidget/LabelMyTextWidget.py:
--------------------------------------------------------------------------------
1 | import ipywidgets as widgets
2 | from IPython.display import display
3 | from ipywidgets import Button, HBox, VBox
4 | import pandas as pd
5 | import numpy as np
6 |
7 |
8 | __all__ = ['LabelMyTextWidget']
9 |
class LabelMyTextWidget:
    """
    Widget to quickly label a dataframe containing a column with text data and a column with target labels.

    The widget shows one unlabeled row at a time, together with one button per
    label and a 'Skip' button. Clicking a button writes the button's class id
    into the output column of the source dataframe (in place) and moves on to
    the next row.
    """

    # True only while a row is on screen and waiting for a label. Defined at
    # class level so that it has a safe default before any row has been shown.
    _awaiting_label = False

    def __init__(self, df_source, content_column, class_list, class_id, output_column, unclassified_value=-1, randomize=False):
        """
        Create a LabelMyTextWidget object.

        :param df_source: The pandas dataframe containing the data column and the label column (to fill by the widget)
        :param content_column: The name of the column containing the text data to label
        :param class_list: List of the label type names (ex: Positive, Negative, Neutral)
        :param class_id: The id of each label type, in the same order as class_list
        :param output_column: Name of the column to complete with labels
        :param unclassified_value: Value of the unclassified rows for the output_column
        :param randomize: If true, the labeling order will be random. If False, it will follow the index numbers


        :Example:
        >>> df['text'] = 'example of text content to label'
        >>> df['label'] = -1
        >>> w = LabelMyTextWidget(df, 'text', ['positive', 'negative'], [1, 0], 'label', unclassified_value=-1)
        >>> w.display()
        """

        self.df_source = df_source
        self.content_column = content_column
        self.output_column = output_column
        self.unclassified_value = unclassified_value
        self.randomize = randomize

        # One button per label; each button carries the value it writes.
        self.items = [ButtonLabeling(class_id=class_id[i], description=name) for i, name in enumerate(class_list)]
        # 'Skip' writes the unclassified value, so the row stays unlabeled.
        self.items.append(ButtonLabeling(class_id=unclassified_value, description='Skip', button_style='warning'))

        for button in self.items:
            button.on_click(self.on_button_clicked_t)

        self.out = widgets.Output(layout={'border': '1px solid black'})
        self.out.append_stderr('Text is coming here')

        button_box = HBox([widgets.Label(value="Label"), *self.items])

        self.box = VBox([button_box, self.out])

        # Index labels of the rows that still hold the unclassified value.
        self.df_explore = self.df_source[self.df_source[output_column] == unclassified_value].index
        # Position in df_explore of the next row to show.
        self.cursor = 0
        self._awaiting_label = False

        if randomize:
            self.df_explore = np.random.permutation(self.df_explore)

        # wait=True: the placeholder is replaced by the first real output.
        self.out.clear_output(wait=True)

    def display(self):
        """
        Display the widget and show the first row to label.
        """
        display(self.box)
        self.display_next_row()

    def on_button_clicked_t(self, b):
        """
        Click handler shared by all buttons.

        Writes ``b.class_id`` into the output column of the row currently on
        screen, then shows the next row. Nothing is written when no row is
        waiting for a label (before the first row is shown, or once every row
        has been processed), so extra clicks can no longer overwrite the last
        row.

        :param b: The clicked button; its ``class_id`` attribute is the value to write
        """
        if self._awaiting_label:
            # display_next_row already advanced the cursor past the shown row.
            shown_row = self.df_explore[self.cursor - 1]
            self.df_source.loc[shown_row, self.output_column] = b.class_id
        self.display_next_row()

    def display_next_row(self):
        """
        Show the next row to label, or a completion message when none is left.
        """
        if self.cursor >= len(self.df_explore):
            self._awaiting_label = False
            with self.out:
                self.out.clear_output()
                print('Finished: All rows have been processed')
            return

        row_index = self.df_explore[self.cursor]
        next_text = str(self.df_source[self.content_column].loc[row_index])
        with self.out:
            print(f'Row index: {row_index} | Number of rows processed : {self.cursor} \n')
            print(next_text)

        self.cursor += 1
        self._awaiting_label = True
        # wait=True defers the clearing until the next output arrives, so the
        # current text stays visible until it is replaced.
        self.out.clear_output(wait=True)
94 |
95 |
96 |
97 |
98 |
class ButtonLabeling(Button):
    """
    Button that remembers the label value it stands for.

    :param class_id: Value stored on the button as ``class_id``
    :param args: Positional arguments forwarded unchanged to ``Button``
    :param kwargs: Keyword arguments forwarded unchanged to ``Button``
    """
    def __init__(self, class_id, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_id = class_id
103 |
--------------------------------------------------------------------------------
/Example using the LabelMyText widget.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "ExecuteTime": {
8 | "end_time": "2019-01-30T14:37:01.337452Z",
9 | "start_time": "2019-01-30T14:37:01.159269Z"
10 | }
11 | },
12 | "outputs": [],
13 | "source": [
14 | "import pandas as pd\n",
15 | "import numpy as np\n",
16 | "from LabelMyTextWidget import *"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 2,
22 | "metadata": {
23 | "ExecuteTime": {
24 | "end_time": "2019-01-30T14:37:01.341053Z",
25 | "start_time": "2019-01-30T14:37:01.338560Z"
26 | }
27 | },
28 | "outputs": [],
29 | "source": [
30 | "df_test = pd.DataFrame(data = ['Sample text 1', 'sample text 2', 'sample text 3'], columns=['text'])"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 3,
36 | "metadata": {
37 | "ExecuteTime": {
38 | "end_time": "2019-01-30T14:37:01.357705Z",
39 | "start_time": "2019-01-30T14:37:01.342155Z"
40 | }
41 | },
42 | "outputs": [
43 | {
44 | "data": {
45 | "text/html": [
46 | "
\n",
47 | "\n",
60 | "
\n",
61 | " \n",
62 | " \n",
63 | " | \n",
64 | " text | \n",
65 | "
\n",
66 | " \n",
67 | " \n",
68 | " \n",
69 | " | 0 | \n",
70 | " Sample text 1 | \n",
71 | "
\n",
72 | " \n",
73 | " | 1 | \n",
74 | " sample text 2 | \n",
75 | "
\n",
76 | " \n",
77 | " | 2 | \n",
78 | " sample text 3 | \n",
79 | "
\n",
80 | " \n",
81 | "
\n",
82 | "
"
83 | ],
84 | "text/plain": [
85 | " text\n",
86 | "0 Sample text 1\n",
87 | "1 sample text 2\n",
88 | "2 sample text 3"
89 | ]
90 | },
91 | "execution_count": 3,
92 | "metadata": {},
93 | "output_type": "execute_result"
94 | }
95 | ],
96 | "source": [
97 | "df_test"
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "metadata": {},
103 | "source": [
104 | "The goal of the widget is to help label a dataframe containing text data.\n",
105 | "The widget will fill the labeling column with the selected class (ex: Positive, Negative for sentiment analysis).\n",
106 | "\n",
107 | "The label column should be created with a default unclassified value (to be able to keep track of unlabeled rows). \n",
108 | "\n",
109 | "In this example, we will have the following class values :\n",
110 | "\n",
111 | "- Unclassified value is -1\n",
112 | "- Negative class value is 0\n",
113 | "- Positive class value is 1"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": 4,
119 | "metadata": {
120 | "ExecuteTime": {
121 | "end_time": "2019-01-30T14:37:01.361026Z",
122 | "start_time": "2019-01-30T14:37:01.358757Z"
123 | }
124 | },
125 | "outputs": [],
126 | "source": [
127 | "df_test.loc[:, 'label'] = -1"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": 5,
133 | "metadata": {
134 | "ExecuteTime": {
135 | "end_time": "2019-01-30T14:37:01.371027Z",
136 | "start_time": "2019-01-30T14:37:01.362607Z"
137 | }
138 | },
139 | "outputs": [
140 | {
141 | "data": {
142 | "text/html": [
143 | "\n",
144 | "\n",
157 | "
\n",
158 | " \n",
159 | " \n",
160 | " | \n",
161 | " text | \n",
162 | " label | \n",
163 | "
\n",
164 | " \n",
165 | " \n",
166 | " \n",
167 | " | 0 | \n",
168 | " Sample text 1 | \n",
169 | " -1 | \n",
170 | "
\n",
171 | " \n",
172 | " | 1 | \n",
173 | " sample text 2 | \n",
174 | " -1 | \n",
175 | "
\n",
176 | " \n",
177 | " | 2 | \n",
178 | " sample text 3 | \n",
179 | " -1 | \n",
180 | "
\n",
181 | " \n",
182 | "
\n",
183 | "
"
184 | ],
185 | "text/plain": [
186 | " text label\n",
187 | "0 Sample text 1 -1\n",
188 | "1 sample text 2 -1\n",
189 | "2 sample text 3 -1"
190 | ]
191 | },
192 | "execution_count": 5,
193 | "metadata": {},
194 | "output_type": "execute_result"
195 | }
196 | ],
197 | "source": [
198 | "df_test"
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": 6,
204 | "metadata": {
205 | "ExecuteTime": {
206 | "end_time": "2019-01-30T14:37:01.406751Z",
207 | "start_time": "2019-01-30T14:37:01.372094Z"
208 | }
209 | },
210 | "outputs": [
211 | {
212 | "data": {
213 | "application/vnd.jupyter.widget-view+json": {
214 | "model_id": "bcec871543cc42e1bac6cc4873e37ec9",
215 | "version_major": 2,
216 | "version_minor": 0
217 | },
218 | "text/plain": [
219 | "VBox(children=(HBox(children=(Label(value='Label'), ButtonLabeling(description='Negative', style=ButtonStyle()…"
220 | ]
221 | },
222 | "metadata": {},
223 | "output_type": "display_data"
224 | }
225 | ],
226 | "source": [
227 | "w = LabelMyTextWidget(df_test, 'text', ['Negative', 'Positive'], [0, 1], 'label', unclassified_value=-1)\n",
228 | "w.display()"
229 | ]
230 | },
231 | {
232 | "cell_type": "markdown",
233 | "metadata": {},
234 | "source": [
235 | "After having labeled the first two rows with positive class, we get the following data:"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": 8,
241 | "metadata": {
242 | "ExecuteTime": {
243 | "end_time": "2019-01-30T14:37:05.452663Z",
244 | "start_time": "2019-01-30T14:37:05.448082Z"
245 | }
246 | },
247 | "outputs": [
248 | {
249 | "data": {
250 | "text/html": [
251 | "\n",
252 | "\n",
265 | "
\n",
266 | " \n",
267 | " \n",
268 | " | \n",
269 | " text | \n",
270 | " label | \n",
271 | "
\n",
272 | " \n",
273 | " \n",
274 | " \n",
275 | " | 0 | \n",
276 | " Sample text 1 | \n",
277 | " 1 | \n",
278 | "
\n",
279 | " \n",
280 | " | 1 | \n",
281 | " sample text 2 | \n",
282 | " 1 | \n",
283 | "
\n",
284 | " \n",
285 | " | 2 | \n",
286 | " sample text 3 | \n",
287 | " -1 | \n",
288 | "
\n",
289 | " \n",
290 | "
\n",
291 | "
"
292 | ],
293 | "text/plain": [
294 | " text label\n",
295 | "0 Sample text 1 1\n",
296 | "1 sample text 2 1\n",
297 | "2 sample text 3 -1"
298 | ]
299 | },
300 | "execution_count": 8,
301 | "metadata": {},
302 | "output_type": "execute_result"
303 | }
304 | ],
305 | "source": [
306 | "df_test"
307 | ]
308 | },
309 | {
310 | "cell_type": "markdown",
311 | "metadata": {
312 | "ExecuteTime": {
313 | "end_time": "2019-01-25T15:28:36.913683Z",
314 | "start_time": "2019-01-25T15:28:36.908965Z"
315 | }
316 | },
317 | "source": [
318 | "We can then save the dataset which is now partially labeled. If we run the widget again, we will be able to finish labeling on the rows with label value -1."
319 | ]
320 | }
321 | ],
322 | "metadata": {
323 | "kernelspec": {
324 | "display_name": "Python 3",
325 | "language": "python",
326 | "name": "python3"
327 | },
328 | "language_info": {
329 | "codemirror_mode": {
330 | "name": "ipython",
331 | "version": 3
332 | },
333 | "file_extension": ".py",
334 | "mimetype": "text/x-python",
335 | "name": "python",
336 | "nbconvert_exporter": "python",
337 | "pygments_lexer": "ipython3",
338 | "version": "3.6.5"
339 | }
340 | },
341 | "nbformat": 4,
342 | "nbformat_minor": 2
343 | }
344 |
--------------------------------------------------------------------------------