"""Summarise and visualise an exported WhatsApp chat between two people.

Usage:
    python chat_analytics.py -f <path-to-exported-chat.txt>
"""
import pandas as pd
import matplotlib.pyplot as plt
import click
from text_mining import TextCleaner


class ChatAnalytics:
    """Parse a WhatsApp ``.txt`` export and plot per-sender statistics.

    The dashboard shows, for the first two senders found in the chat:
    words per hour of day ("Prime Time"), words per day, messages containing
    a question mark per day (with their Spearman correlation) and each
    sender's most frequent non-stop-words.
    """

    # True when the export uses the bracketed layout "[date time] sender: text";
    # otherwise the "date, time - sender: text" layout is assumed.
    __is_ios = False

    def __init__(self, chat_path):
        """Read the whole exported chat at *chat_path* (UTF-8) into memory."""
        with open(chat_path, 'r', encoding='utf-8') as h:
            self.__history = h.read()

    def __to_pandas(self):
        """Split the raw history into messages.

        Returns:
            DataFrame with string columns ``date``, ``time``, ``sender`` and
            ``text`` (one row per message; multi-line messages are kept
            together). The frame is empty, but still has these columns, when
            nothing could be parsed.
        """
        marker = "~StartsWithDate~"
        # A UTF-8 BOM (open() with 'utf-8' does not strip it) or a
        # left-to-right mark in front of a line would otherwise hide the
        # timestamp. NOTE(review): the LRM case is assumed from iOS exports.
        invisible = "\ufeff\u200e"

        self.__is_ios = self.__history.lstrip(invisible).startswith('[')

        def starts_with_date(line):
            line = line.lstrip(invisible)
            if not line:
                return False
            if self.__is_ios:
                return line[0] == '['
            return line[0].isdigit() and '/' in line.split(',')[0]

        # Tag every line that opens a new message, then split on the tag so
        # continuation lines stay attached to the message they belong to.
        tagged = []
        for line in self.__history.split('\n'):
            if line == '':
                tagged.append('\n')
            elif starts_with_date(line):
                tagged.append(f"{marker} {line.lstrip(invisible)}".strip())
            else:
                tagged.append(line.strip())
        chunks = [c.strip('\n') for c in '\n'.join(tagged).split(f"{marker} ")]

        records = []
        for chunk in chunks:
            if self.__is_ios:
                open_bracket = chunk.find('[')
                close_bracket = chunk.find(']')
                if open_bracket < 0 or close_bracket < open_bracket:
                    continue
                stamp_tokens = chunk[open_bracket + 1:close_bracket].split()
                body = chunk[close_bracket + 1:]
            else:
                stamp, _, body = chunk.partition('-')
                stamp_tokens = stamp.replace(',', '').split()

            if 'image omitted' in body:
                continue

            # Split on the FIRST colon only, so colons inside the message
            # (URLs, times, ...) survive. Bodies without "sender:" are system
            # notices and are skipped instead of becoming bogus senders.
            sender, colon, text = body.partition(':')
            if not colon:
                continue

            # Same plausibility filter as before (2-4 tokens, date token of
            # at least 8 characters), plus a lower bound so a one-token
            # stamp cannot raise IndexError.
            if not 2 <= len(stamp_tokens) < 5 or len(stamp_tokens[0]) < 8:
                continue

            if len(stamp_tokens) < 3:
                time = stamp_tokens[1]
            else:
                time = f"{stamp_tokens[1]} {stamp_tokens[2]}"

            records.append(dict(
                date=stamp_tokens[0].rstrip(','),
                time=time,
                sender=sender.strip(),
                text=text.strip(),
            ))

        return pd.DataFrame(records, columns=['date', 'time', 'sender', 'text'])

    @staticmethod
    def __top_words(cleaner, text, min_count=3, limit=15):
        """Return up to *limit* words that occur at least *min_count* times."""
        counts = cleaner.get_clean_text(text=text)
        return counts[counts['count'] >= min_count].head(limit)

    @staticmethod
    def __plot_lines(frame, senders, colors, ax):
        """Draw one line per sender on *ax*; skip silently when there is no data."""
        if frame.empty:
            return
        for sender, color in zip(senders, colors):
            frame[sender].plot(grid=True, label=sender, legend=True,
                               color=color, ax=ax)

    @staticmethod
    def __plot_word_counts(counts, sender, color, ax):
        """Draw a horizontal bar chart of *counts* (columns ``word``/``count``)."""
        if counts.empty:
            # DataFrame.plot raises on an empty frame; keep the titled axes.
            ax.set_title(sender)
            return
        counts.sort_values(by='count').plot.barh(x='word',
                                                 y='count',
                                                 ax=ax,
                                                 legend=False,
                                                 color=color,
                                                 xlabel='',
                                                 title=sender)

    def run(self):
        """Parse the chat, compute the statistics and show the dashboard.

        Raises:
            ValueError: if fewer than two senders remain after system
                messages are filtered out, or if a message time does not
                match the expected time format.
        """
        chat_df = self.__to_pandas()

        # Drop encryption notices and media / voice-call placeholders.
        for noise in ('end-to-end', 'omitted', 'voice'):
            chat_df = chat_df[~chat_df.text.str.contains(noise, regex=False)]

        senders = list(chat_df.sender.unique())
        if len(senders) < 2:
            raise ValueError(
                f"expected a chat between two senders, found {len(senders)}")
        sender_1, sender_2 = senders[:2]
        pair = [sender_1, sender_2]
        colors = ['#ff7f0e', '#1f77b4']  # sender_1, sender_2 in every chart

        # Restrict every statistic to the same two senders; copy so the
        # column assignments below do not write into a slice.
        chat_df = chat_df[chat_df.sender.isin(pair)].copy()

        sender_1_text = " ".join(chat_df.loc[chat_df.sender == sender_1, 'text'])
        sender_2_text = " ".join(chat_df.loc[chat_df.sender == sender_2, 'text'])

        is_12_format = chat_df.time.str.contains('AM|PM').any()
        if self.__is_ios:
            format_hour = "%I.%M.%S %p" if is_12_format else "%H.%M.%S"
        else:
            format_hour = "%I:%M %p" if is_12_format else "%H:%M"
        chat_df['CHAT_HOUR'] = pd.to_datetime(
            chat_df['time'], format=format_hour).dt.strftime('%H')

        # wordcloud cleaner
        cleaner = TextCleaner()
        sender_1_wordcount_df = self.__top_words(cleaner, sender_1_text)
        sender_2_wordcount_df = self.__top_words(cleaner, sender_2_text)

        if self.__is_ios:
            short_fmt, long_fmt = "%d/%m/%y", "%d/%m/%Y"
        else:
            short_fmt, long_fmt = "%m/%d/%y", "%m/%d/%Y"
        try:
            dates = pd.to_datetime(chat_df.date, format=short_fmt, errors='coerce')
            # Exports with four-digit years fail the two-digit format above.
            dates = dates.fillna(
                pd.to_datetime(chat_df.date, format=long_fmt, errors='coerce'))
            chat_df['date'] = dates.dt.tz_localize('UTC').dt.tz_convert('Asia/Jakarta')
        except Exception as e:
            # Best effort: keep the raw date strings and carry on.
            print(f"failed to convert to datetime format: {e}")

        chat_df['total_words'] = chat_df.text.str.split().str.len()
        chat_df['?'] = chat_df['text'].str.contains('?', regex=False).astype(int)

        def per_sender(index_col, value_col):
            # One row per index value, one column per sender, 0 where a
            # sender was silent, so both senders always share the same axis.
            return (chat_df.groupby([index_col, 'sender'])[value_col].sum()
                    .unstack(fill_value=0)
                    .reindex(columns=pair, fill_value=0)
                    .rename_axis(columns=None)
                    .sort_index())

        words_by_hour = per_sender('CHAT_HOUR', 'total_words')
        words_by_day = per_sender('date', 'total_words')
        qmarks_by_day = per_sender('date', '?')

        # DataFrame.corr needs no SciPy; .iloc avoids the deprecated
        # positional lookup on a label index.
        spearman_corr_qmarks = round(
            qmarks_by_day.corr('spearman').iloc[0, 1], 2)

        # visualize
        fig = plt.figure("Chat Analytics")
        fig.set_size_inches(15, 7)

        # ax word count
        ax_word_count_sender_1 = fig.add_axes((0.3, 0.08, 0.15, 0.4))
        ax_word_count_sender_2 = fig.add_axes((0.08, 0.08, 0.15, 0.4))

        # ax qmarks
        ax_qmarks = fig.add_axes((0.52, 0.15, 0.45, 0.3))
        ax_qmarks.set_title('? Counter')
        ax_qmarks.set_ylabel('total ?')

        # ax prime time
        ax_prime_time = fig.add_axes((0.07, 0.63, 0.38, 0.25))
        ax_prime_time.set_title('Prime Time')
        ax_prime_time.set_ylabel('total words')

        # ax total words
        ax_total_words = fig.add_axes((0.52, 0.58, 0.45, 0.3))
        ax_total_words.tick_params(
            axis='x',           # changes apply to the x-axis
            which='both',       # both major and minor ticks are affected
            bottom=False,       # ticks along the bottom edge are off
            top=False,          # ticks along the top edge are off
            labelbottom=False)
        ax_total_words.set_title('Words')
        ax_total_words.set_ylabel('total words')
        ax_total_words.set_xlabel('date')

        self.__plot_lines(qmarks_by_day, pair, colors, ax_qmarks)
        self.__plot_lines(words_by_day, pair, colors, ax_total_words)

        # prime time plot: grouped bars from ONE frame, so each hour's bars
        # line up even when the senders were active in different hours.
        if not words_by_hour.empty:
            words_by_hour.plot(kind='bar',
                               color=colors,
                               legend=True,
                               ax=ax_prime_time,
                               width=.7,
                               grid=True)

        # word count
        self.__plot_word_counts(sender_1_wordcount_df, sender_1,
                                colors[0], ax_word_count_sender_1)
        self.__plot_word_counts(sender_2_wordcount_df, sender_2,
                                colors[1], ax_word_count_sender_2)

        fig.text(.85, .46, f"spearman corr: {spearman_corr_qmarks}", ha='left')
        plt.show()


@click.command()
@click.option('--filepath', '-f', required=True,
              type=click.Path(exists=True, dir_okay=False),
              help='Path to the exported WhatsApp chat (.txt).')
def main(filepath):
    """Show chat analytics for the WhatsApp export at FILEPATH."""
    chat = ChatAnalytics(chat_path=filepath)
    chat.run()


if __name__ == '__main__':
    main()
between 55 | big 56 | both 57 | but 58 | by 59 | c 60 | came 61 | can 62 | cannot 63 | case 64 | cases 65 | certain 66 | certainly 67 | clear 68 | clearly 69 | come 70 | could 71 | d 72 | did 73 | differ 74 | different 75 | differently 76 | do 77 | does 78 | done 79 | down 80 | down 81 | downed 82 | downing 83 | downs 84 | during 85 | e 86 | each 87 | early 88 | either 89 | end 90 | ended 91 | ending 92 | ends 93 | enough 94 | even 95 | evenly 96 | ever 97 | every 98 | everybody 99 | everyone 100 | everything 101 | everywhere 102 | f 103 | face 104 | faces 105 | fact 106 | facts 107 | far 108 | felt 109 | few 110 | find 111 | finds 112 | first 113 | for 114 | four 115 | from 116 | full 117 | fully 118 | further 119 | furthered 120 | furthering 121 | furthers 122 | g 123 | gave 124 | general 125 | generally 126 | get 127 | gets 128 | give 129 | given 130 | gives 131 | go 132 | going 133 | good 134 | goods 135 | got 136 | great 137 | greater 138 | greatest 139 | group 140 | grouped 141 | grouping 142 | groups 143 | h 144 | had 145 | has 146 | have 147 | having 148 | he 149 | her 150 | here 151 | herself 152 | high 153 | high 154 | high 155 | higher 156 | highest 157 | him 158 | himself 159 | his 160 | how 161 | however 162 | i 163 | if 164 | important 165 | in 166 | interest 167 | interested 168 | interesting 169 | interests 170 | into 171 | is 172 | it 173 | its 174 | itself 175 | j 176 | just 177 | k 178 | keep 179 | keeps 180 | kind 181 | knew 182 | know 183 | known 184 | knows 185 | l 186 | large 187 | largely 188 | last 189 | later 190 | latest 191 | least 192 | less 193 | let 194 | lets 195 | like 196 | likely 197 | long 198 | longer 199 | longest 200 | m 201 | made 202 | make 203 | making 204 | man 205 | many 206 | may 207 | me 208 | member 209 | members 210 | men 211 | might 212 | more 213 | most 214 | mostly 215 | mr 216 | mrs 217 | much 218 | must 219 | my 220 | myself 221 | n 222 | necessary 223 | need 224 | needed 225 | needing 226 | needs 227 | never 228 
| new 229 | new 230 | newer 231 | newest 232 | next 233 | no 234 | nobody 235 | non 236 | noone 237 | not 238 | nothing 239 | now 240 | nowhere 241 | number 242 | numbers 243 | o 244 | of 245 | off 246 | often 247 | old 248 | older 249 | oldest 250 | on 251 | once 252 | one 253 | only 254 | open 255 | opened 256 | opening 257 | opens 258 | or 259 | order 260 | ordered 261 | ordering 262 | orders 263 | other 264 | others 265 | our 266 | out 267 | over 268 | p 269 | part 270 | parted 271 | parting 272 | parts 273 | per 274 | perhaps 275 | place 276 | places 277 | point 278 | pointed 279 | pointing 280 | points 281 | possible 282 | present 283 | presented 284 | presenting 285 | presents 286 | problem 287 | problems 288 | put 289 | puts 290 | q 291 | quite 292 | r 293 | rather 294 | really 295 | right 296 | right 297 | room 298 | rooms 299 | s 300 | said 301 | same 302 | saw 303 | say 304 | says 305 | second 306 | seconds 307 | see 308 | seem 309 | seemed 310 | seeming 311 | seems 312 | sees 313 | several 314 | shall 315 | she 316 | should 317 | show 318 | showed 319 | showing 320 | shows 321 | side 322 | sides 323 | since 324 | small 325 | smaller 326 | smallest 327 | so 328 | some 329 | somebody 330 | someone 331 | something 332 | somewhere 333 | state 334 | states 335 | still 336 | still 337 | such 338 | sure 339 | t 340 | take 341 | taken 342 | than 343 | that 344 | the 345 | their 346 | them 347 | then 348 | there 349 | therefore 350 | these 351 | they 352 | thing 353 | things 354 | think 355 | thinks 356 | this 357 | those 358 | though 359 | thought 360 | thoughts 361 | three 362 | through 363 | thus 364 | to 365 | today 366 | together 367 | too 368 | took 369 | toward 370 | turn 371 | turned 372 | turning 373 | turns 374 | two 375 | u 376 | under 377 | until 378 | up 379 | upon 380 | us 381 | use 382 | used 383 | uses 384 | v 385 | very 386 | w 387 | want 388 | wanted 389 | wanting 390 | wants 391 | was 392 | way 393 | ways 394 | we 395 | well 396 | wells 397 | 
went 398 | were 399 | what 400 | when 401 | where 402 | whether 403 | which 404 | while 405 | who 406 | whole 407 | whose 408 | why 409 | will 410 | with 411 | within 412 | without 413 | work 414 | worked 415 | working 416 | works 417 | would 418 | x 419 | y 420 | year 421 | years 422 | yet 423 | you 424 | young 425 | younger 426 | youngest 427 | your 428 | yours 429 | z 430 | ada 431 | adalah 432 | adanya 433 | adapun 434 | agak 435 | agaknya 436 | agar 437 | akan 438 | akankah 439 | akhir 440 | akhiri 441 | akhirnya 442 | aku 443 | akulah 444 | amat 445 | amatlah 446 | anda 447 | andalah 448 | antar 449 | antara 450 | antaranya 451 | apa 452 | apaan 453 | apabila 454 | apakah 455 | apalagi 456 | apatah 457 | artinya 458 | asal 459 | asalkan 460 | atas 461 | atau 462 | ataukah 463 | ataupun 464 | awal 465 | awalnya 466 | bagai 467 | bagaikan 468 | bagaimana 469 | bagaimanakah 470 | bagaimanapun 471 | bagi 472 | bagian 473 | bahkan 474 | bahwa 475 | bahwasanya 476 | baik 477 | bakal 478 | bakalan 479 | balik 480 | banyak 481 | bapak 482 | baru 483 | bawah 484 | beberapa 485 | begini 486 | beginian 487 | beginikah 488 | beginilah 489 | begitu 490 | begitukah 491 | begitulah 492 | begitupun 493 | bekerja 494 | belakang 495 | belakangan 496 | belum 497 | belumlah 498 | benar 499 | benarkah 500 | benarlah 501 | berada 502 | berakhir 503 | berakhirlah 504 | berakhirnya 505 | berapa 506 | berapakah 507 | berapalah 508 | berapapun 509 | berarti 510 | berawal 511 | berbagai 512 | berdatangan 513 | beri 514 | berikan 515 | berikut 516 | berikutnya 517 | berjumlah 518 | berkali-kali 519 | berkata 520 | berkehendak 521 | berkeinginan 522 | berkenaan 523 | berlainan 524 | berlalu 525 | berlangsung 526 | berlebihan 527 | bermacam 528 | bermacam-macam 529 | bermaksud 530 | bermula 531 | bersama 532 | bersama-sama 533 | bersiap 534 | bersiap-siap 535 | bertanya 536 | bertanya-tanya 537 | berturut 538 | berturut-turut 539 | bertutur 540 | berujar 541 | berupa 542 | besar 543 | 
betul 544 | betulkah 545 | biasa 546 | biasanya 547 | bila 548 | bilakah 549 | bisa 550 | bisakah 551 | boleh 552 | bolehkah 553 | bolehlah 554 | buat 555 | bukan 556 | bukankah 557 | bukanlah 558 | bukannya 559 | bulan 560 | bung 561 | cara 562 | caranya 563 | cukup 564 | cukupkah 565 | cukuplah 566 | cuma 567 | dahulu 568 | dalam 569 | dan 570 | dapat 571 | dari 572 | daripada 573 | datang 574 | dekat 575 | demi 576 | demikian 577 | demikianlah 578 | dengan 579 | depan 580 | di 581 | dia 582 | diakhiri 583 | diakhirinya 584 | dialah 585 | diantara 586 | diantaranya 587 | diberi 588 | diberikan 589 | diberikannya 590 | dibuat 591 | dibuatnya 592 | didapat 593 | didatangkan 594 | digunakan 595 | diibaratkan 596 | diibaratkannya 597 | diingat 598 | diingatkan 599 | diinginkan 600 | dijawab 601 | dijelaskan 602 | dijelaskannya 603 | dikarenakan 604 | dikatakan 605 | dikatakannya 606 | dikerjakan 607 | diketahui 608 | diketahuinya 609 | dikira 610 | dilakukan 611 | dilalui 612 | dilihat 613 | dimaksud 614 | dimaksudkan 615 | dimaksudkannya 616 | dimaksudnya 617 | diminta 618 | dimintai 619 | dimisalkan 620 | dimulai 621 | dimulailah 622 | dimulainya 623 | dimungkinkan 624 | dini 625 | dipastikan 626 | diperbuat 627 | diperbuatnya 628 | dipergunakan 629 | diperkirakan 630 | diperlihatkan 631 | diperlukan 632 | diperlukannya 633 | dipersoalkan 634 | dipertanyakan 635 | dipunyai 636 | diri 637 | dirinya 638 | disampaikan 639 | disebut 640 | disebutkan 641 | disebutkannya 642 | disini 643 | disinilah 644 | ditambahkan 645 | ditandaskan 646 | ditanya 647 | ditanyai 648 | ditanyakan 649 | ditegaskan 650 | ditujukan 651 | ditunjuk 652 | ditunjuki 653 | ditunjukkan 654 | ditunjukkannya 655 | ditunjuknya 656 | dituturkan 657 | dituturkannya 658 | diucapkan 659 | diucapkannya 660 | diungkapkan 661 | dong 662 | dua 663 | dulu 664 | empat 665 | enggak 666 | enggaknya 667 | entah 668 | entahlah 669 | guna 670 | gunakan 671 | hal 672 | hampir 673 | hanya 674 | hanyalah 675 | hari 
676 | harus 677 | haruslah 678 | harusnya 679 | hendak 680 | hendaklah 681 | hendaknya 682 | hingga 683 | ia 684 | ialah 685 | ibarat 686 | ibaratkan 687 | ibaratnya 688 | ibu 689 | ikut 690 | ingat 691 | ingat-ingat 692 | ingin 693 | inginkah 694 | inginkan 695 | ini 696 | inikah 697 | inilah 698 | itu 699 | itukah 700 | itulah 701 | jadi 702 | jadilah 703 | jadinya 704 | jangan 705 | jangankan 706 | janganlah 707 | jauh 708 | jawab 709 | jawaban 710 | jawabnya 711 | jelas 712 | jelaskan 713 | jelaslah 714 | jelasnya 715 | jika 716 | jikalau 717 | juga 718 | jumlah 719 | jumlahnya 720 | justru 721 | kala 722 | kalau 723 | kalaulah 724 | kalaupun 725 | kalian 726 | kami 727 | kamilah 728 | kamu 729 | kamulah 730 | kan 731 | kapan 732 | kapankah 733 | kapanpun 734 | karena 735 | karenanya 736 | kasus 737 | kata 738 | katakan 739 | katakanlah 740 | katanya 741 | ke 742 | keadaan 743 | kebetulan 744 | kecil 745 | kedua 746 | keduanya 747 | keinginan 748 | kelamaan 749 | kelihatan 750 | kelihatannya 751 | kelima 752 | keluar 753 | kembali 754 | kemudian 755 | kemungkinan 756 | kemungkinannya 757 | kenapa 758 | kepada 759 | kepadanya 760 | kesampaian 761 | keseluruhan 762 | keseluruhannya 763 | keterlaluan 764 | ketika 765 | khususnya 766 | kini 767 | kinilah 768 | kira 769 | kira-kira 770 | kiranya 771 | kita 772 | kitalah 773 | kok 774 | kurang 775 | lagi 776 | lagian 777 | lah 778 | lain 779 | lainnya 780 | lalu 781 | lama 782 | lamanya 783 | lanjut 784 | lanjutnya 785 | lebih 786 | lewat 787 | lima 788 | luar 789 | macam 790 | maka 791 | makanya 792 | makin 793 | malah 794 | malahan 795 | mampu 796 | mampukah 797 | mana 798 | manakala 799 | manalagi 800 | masa 801 | masalah 802 | masalahnya 803 | masih 804 | masihkah 805 | masing 806 | masing-masing 807 | mau 808 | maupun 809 | melainkan 810 | melakukan 811 | melalui 812 | melihat 813 | melihatnya 814 | memang 815 | memastikan 816 | memberi 817 | memberikan 818 | membuat 819 | memerlukan 820 | memihak 821 | meminta 
822 | memintakan 823 | memisalkan 824 | memperbuat 825 | mempergunakan 826 | memperkirakan 827 | memperlihatkan 828 | mempersiapkan 829 | mempersoalkan 830 | mempertanyakan 831 | mempunyai 832 | memulai 833 | memungkinkan 834 | menaiki 835 | menambahkan 836 | menandaskan 837 | menanti 838 | menanti-nanti 839 | menantikan 840 | menanya 841 | menanyai 842 | menanyakan 843 | mendapat 844 | mendapatkan 845 | mendatang 846 | mendatangi 847 | mendatangkan 848 | menegaskan 849 | mengakhiri 850 | mengapa 851 | mengatakan 852 | mengatakannya 853 | mengenai 854 | mengerjakan 855 | mengetahui 856 | menggunakan 857 | menghendaki 858 | mengibaratkan 859 | mengibaratkannya 860 | mengingat 861 | mengingatkan 862 | menginginkan 863 | mengira 864 | mengucapkan 865 | mengucapkannya 866 | mengungkapkan 867 | menjadi 868 | menjawab 869 | menjelaskan 870 | menuju 871 | menunjuk 872 | menunjuki 873 | menunjukkan 874 | menunjuknya 875 | menurut 876 | menuturkan 877 | menyampaikan 878 | menyangkut 879 | menyatakan 880 | menyebutkan 881 | menyeluruh 882 | menyiapkan 883 | merasa 884 | mereka 885 | merekalah 886 | merupakan 887 | meski 888 | meskipun 889 | meyakini 890 | meyakinkan 891 | minta 892 | mirip 893 | misal 894 | misalkan 895 | misalnya 896 | mula 897 | mulai 898 | mulailah 899 | mulanya 900 | mungkin 901 | mungkinkah 902 | nah 903 | naik 904 | namun 905 | nanti 906 | nantinya 907 | nyaris 908 | nyatanya 909 | oleh 910 | olehnya 911 | pada 912 | padahal 913 | padanya 914 | pak 915 | paling 916 | panjang 917 | pantas 918 | para 919 | pasti 920 | pastilah 921 | penting 922 | pentingnya 923 | per 924 | percuma 925 | perlu 926 | perlukah 927 | perlunya 928 | pernah 929 | persoalan 930 | pertama 931 | pertama-tama 932 | pertanyaan 933 | pertanyakan 934 | pihak 935 | pihaknya 936 | pukul 937 | pula 938 | pun 939 | punya 940 | rasa 941 | rasanya 942 | rata 943 | rupanya 944 | saat 945 | saatnya 946 | saja 947 | sajalah 948 | saling 949 | sama 950 | sama-sama 951 | sambil 952 | sampai 953 
| sampai-sampai 954 | sampaikan 955 | sana 956 | sangat 957 | sangatlah 958 | satu 959 | saya 960 | sayalah 961 | se 962 | sebab 963 | sebabnya 964 | sebagai 965 | sebagaimana 966 | sebagainya 967 | sebagian 968 | sebaik 969 | sebaik-baiknya 970 | sebaiknya 971 | sebaliknya 972 | sebanyak 973 | sebegini 974 | sebegitu 975 | sebelum 976 | sebelumnya 977 | sebenarnya 978 | seberapa 979 | sebesar 980 | sebetulnya 981 | sebisanya 982 | sebuah 983 | sebut 984 | sebutlah 985 | sebutnya 986 | secara 987 | secukupnya 988 | sedang 989 | sedangkan 990 | sedemikian 991 | sedikit 992 | sedikitnya 993 | seenaknya 994 | segala 995 | segalanya 996 | segera 997 | seharusnya 998 | sehingga 999 | seingat 1000 | sejak 1001 | sejauh 1002 | sejenak 1003 | sejumlah 1004 | sekadar 1005 | sekadarnya 1006 | sekali 1007 | sekali-kali 1008 | sekalian 1009 | sekaligus 1010 | sekalipun 1011 | sekarang 1012 | sekarang 1013 | sekecil 1014 | seketika 1015 | sekiranya 1016 | sekitar 1017 | sekitarnya 1018 | sekurang-kurangnya 1019 | sekurangnya 1020 | sela 1021 | selain 1022 | selaku 1023 | selalu 1024 | selama 1025 | selama-lamanya 1026 | selamanya 1027 | selanjutnya 1028 | seluruh 1029 | seluruhnya 1030 | semacam 1031 | semakin 1032 | semampu 1033 | semampunya 1034 | semasa 1035 | semasih 1036 | semata 1037 | semata-mata 1038 | semaunya 1039 | sementara 1040 | semisal 1041 | semisalnya 1042 | sempat 1043 | semua 1044 | semuanya 1045 | semula 1046 | sendiri 1047 | sendirian 1048 | sendirinya 1049 | seolah 1050 | seolah-olah 1051 | seorang 1052 | sepanjang 1053 | sepantasnya 1054 | sepantasnyalah 1055 | seperlunya 1056 | seperti 1057 | sepertinya 1058 | sepihak 1059 | sering 1060 | seringnya 1061 | serta 1062 | serupa 1063 | sesaat 1064 | sesama 1065 | sesampai 1066 | sesegera 1067 | sesekali 1068 | seseorang 1069 | sesuatu 1070 | sesuatunya 1071 | sesudah 1072 | sesudahnya 1073 | setelah 1074 | setempat 1075 | setengah 1076 | seterusnya 1077 | setiap 1078 | setiba 1079 | setibanya 1080 | 
setidak-tidaknya 1081 | setidaknya 1082 | setinggi 1083 | seusai 1084 | sewaktu 1085 | siap 1086 | siapa 1087 | siapakah 1088 | siapapun 1089 | sini 1090 | sinilah 1091 | soal 1092 | soalnya 1093 | suatu 1094 | sudah 1095 | sudahkah 1096 | sudahlah 1097 | supaya 1098 | tadi 1099 | tadinya 1100 | tahu 1101 | tahun 1102 | tak 1103 | tambah 1104 | tambahnya 1105 | tampak 1106 | tampaknya 1107 | tandas 1108 | tandasnya 1109 | tanpa 1110 | tanya 1111 | tanyakan 1112 | tanyanya 1113 | tapi 1114 | tegas 1115 | tegasnya 1116 | telah 1117 | tempat 1118 | tengah 1119 | tentang 1120 | tentu 1121 | tentulah 1122 | tentunya 1123 | tepat 1124 | terakhir 1125 | terasa 1126 | terbanyak 1127 | terdahulu 1128 | terdapat 1129 | terdiri 1130 | terhadap 1131 | terhadapnya 1132 | teringat 1133 | teringat-ingat 1134 | terjadi 1135 | terjadilah 1136 | terjadinya 1137 | terkira 1138 | terlalu 1139 | terlebih 1140 | terlihat 1141 | termasuk 1142 | ternyata 1143 | tersampaikan 1144 | tersebut 1145 | tersebutlah 1146 | tertentu 1147 | tertuju 1148 | terus 1149 | terutama 1150 | tetap 1151 | tetapi 1152 | tiap 1153 | tiba 1154 | tiba-tiba 1155 | tidak 1156 | tidakkah 1157 | tidaklah 1158 | tiga 1159 | tinggi 1160 | toh 1161 | tunjuk 1162 | turut 1163 | tutur 1164 | tuturnya 1165 | ucap 1166 | ucapnya 1167 | ujar 1168 | ujarnya 1169 | umum 1170 | umumnya 1171 | ungkap 1172 | ungkapnya 1173 | untuk 1174 | usah 1175 | usai 1176 | waduh 1177 | wah 1178 | wahai 1179 | waktu 1180 | waktunya 1181 | walau 1182 | walaupun 1183 | wong 1184 | yaitu 1185 | yakin 1186 | yakni 1187 | yang 1188 | hadeh 1189 | gak 1190 | rt 1191 | the 1192 | gue 1193 | gw 1194 | ya 1195 | duh 1196 | ga 1197 | yg 1198 | kalo 1199 | aja 1200 | iya 1201 | si 1202 | gpp 1203 | nih 1204 | gt 1205 | lo 1206 | sih 1207 | deh 1208 | ywd 1209 | gk 1210 | km 1211 | qmu 1212 | qm 1213 | bgt 1214 | tuh 1215 | im 1216 | gua 1217 | gini 1218 | udah 1219 | orang 1220 | org 1221 | amp 1222 | banget 1223 | tau 1224 | oh 1225 | tp 1226 | bg 
import pandas as pd
from string import digits
import re
from pathlib import Path


class TextCleaner:
    """Normalise free text and count the words that are not stop words.

    Args:
        stopwords_path: optional path to a one-word-per-line stop-word file.
            Defaults to ``stopwords.txt`` in the directory of this module.
    """

    # Compiled once instead of on every call.
    # NOTE: the last range is very wide (U+24C2..U+1F251), so it removes far
    # more than emoji; it is kept unchanged to preserve the cleaning results.
    _EMOJI_PATTERN = re.compile("["
                                u"\U0001F600-\U0001F64F"  # emoticons
                                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                u"\U00002702-\U000027B0"
                                u"\U000024C2-\U0001F251"
                                "]+", flags=re.UNICODE)
    _URL_PATTERN = re.compile(r"http\S+")
    _PUNCTUATION_PATTERN = re.compile(r'[^\w\s]')
    _DIGIT_TABLE = str.maketrans('', '', digits)

    def __init__(self, stopwords_path=None):
        if stopwords_path is None:
            stopwords_path = Path(__file__).resolve().parent / "stopwords.txt"
        # Read AND written through this one path, so insert_new_word updates
        # the same file that get_clean_text consults.
        self._stopwords_path = Path(stopwords_path)

    def __get_stop_words(self):
        """Return the stop words as a DataFrame with a single ``words`` column."""
        try:
            # dtype/keep_default_na keep words such as "null" or "na" as text
            # instead of turning them into NaN.
            return pd.read_csv(self._stopwords_path, names=['words'],
                               dtype=str, keep_default_na=False)
        except pd.errors.EmptyDataError:
            return pd.DataFrame(columns=['words'])

    def __get_text_df(self, text):
        """Return one row per whitespace-separated token of *text*."""
        return pd.DataFrame(data=text.split(), columns=['words'])

    def __cleaning_text(self, text):
        """Lower-case *text* and strip handles, URLs, emoji, punctuation and digits."""
        text = text.lower().strip()

        # Drop whole tokens that contain "_" or "@".
        text = " ".join(token for token in text.split()
                        if '_' not in token and '@' not in token)

        text = self._URL_PATTERN.sub("", text)
        text = self._EMOJI_PATTERN.sub("", text)
        text = self._PUNCTUATION_PATTERN.sub("", text)
        return text.translate(self._DIGIT_TABLE)

    def get_clean_text(self, text):
        """Count the non-stop-words of *text*.

        Returns:
            DataFrame with columns ``word`` and ``count``, most frequent
            first, with a fresh 0..n-1 index.
        """
        text = self.__cleaning_text(text)
        text_df = self.__get_text_df(text)
        stopwords = self.__get_stop_words()

        kept = text_df.loc[~text_df.words.isin(stopwords.words), 'words']
        # rename_axis + reset_index(name=...) yields the same two columns on
        # every pandas version; renaming the default "index"/"words" columns
        # produces a duplicate "count" column on pandas >= 2.0.
        return (kept.value_counts()
                .rename_axis('word')
                .reset_index(name='count')
                .sort_values(by='count', ascending=False, kind='stable')
                .reset_index(drop=True))

    # insert new stopword(s) into stopwords.txt
    def insert_new_word(self, new_word):
        """Append *new_word* (a string or a collection of strings) to the stop-word file.

        Words are stripped and lower-cased; words already present, or
        repeated within the same call, are reported and not added again.
        """
        stop_words = self.__get_stop_words()

        if isinstance(new_word, (list, tuple, set)):
            candidates = new_word
        else:
            candidates = [new_word]

        known = set(stop_words.words)  # built once: O(1) membership tests
        words = []
        for candidate in candidates:
            word = candidate.strip().lower()
            if not word:
                continue  # never write a blank line into the file
            if word in known:
                print(f"insert_new_word: {word} already exists")
            else:
                known.add(word)
                words.append(word)
                print(f"insert_new_word: {word} has been inserted")

        if words:
            new_stopwords_df = pd.concat(
                [stop_words, pd.DataFrame(data=words, columns=['words'])],
                ignore_index=True)
            new_stopwords_df.to_csv(self._stopwords_path, index=False, header=False)