├── .gitignore
├── README.md
├── corpus
│   ├── DaleChallEasyWordList.txt
│   ├── common.txt
│   ├── common_phrases.txt
│   ├── contractions.txt
│   ├── hyperbolic.txt
│   └── terrier-stopword.txt
├── datasets
│   ├── clickbait_data
│   ├── non_clickbait_data
│   ├── test.csv
│   ├── train.csv
│   └── unlabelled.csv
├── notebooks
│   ├── .ipynb_checkpoints
│   │   ├── Embeddings-checkpoint.ipynb
│   │   ├── Feature_engineering-checkpoint.ipynb
│   │   ├── Splitting_data_EDA-checkpoint.ipynb
│   │   ├── feature_selection_decomposition-checkpoint.ipynb
│   │   └── models_ensembles_tuning-checkpoint.ipynb
│   ├── Embeddings.ipynb
│   ├── Feature_engineering.ipynb
│   ├── Splitting_data_EDA.ipynb
│   ├── __pycache__
│   │   ├── featurization.cpython-37.pyc
│   │   └── utility.cpython-37.pyc
│   ├── decision_boundary.ipynb
│   ├── feature_selection_decomposition.ipynb
│   ├── featurization.py
│   ├── models_ensembles_tuning.ipynb
│   ├── saved_models
│   │   ├── saved_model.pb
│   │   └── variables
│   │       ├── variables.data-00000-of-00002
│   │       ├── variables.data-00001-of-00002
│   │       └── variables.index
│   └── utility.py
└── web_crawled
    └── breitbart.csv
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | vectors/*
2 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Text Classification With Extremely Small Datasets
2 | 
3 | Accompanying blog: https://towardsdatascience.com/text-classification-with-extremely-small-datasets-333d322caee2
4 | 
5 | ## Credits:
6 | 
7 | 1. Abhijnan Chakraborty, Bhargavi Paranjape, Sourya Kakarla, and Niloy Ganguly. "Stop Clickbait: Detecting and Preventing Clickbaits in Online News Media". In Proceedings of the 2016 IEEE/ACM International Conference on Advances in Social Networks Analysis and Mining (ASONAM), San Francisco, US, August 2016.
8 | 2. Potthast et al. (2016): https://webis.de/downloads/publications/papers/stein_2016b.pdf
9 | 3. Terrier Stop Word list: https://github.com/terrier-org/terrier-desktop/blob/master/share/stopword-list.txt
10 | 4. Downworthy: https://github.com/snipe/downworthy
11 | 5. 
Dale Chall Easy word list: http://www.readabilityformulas.com/articles/dale-chall-readability-word-list.php 12 | 13 | -------------------------------------------------------------------------------- /corpus/DaleChallEasyWordList.txt: -------------------------------------------------------------------------------- 1 | a 2 | able 3 | aboard 4 | about 5 | above 6 | absent 7 | accept 8 | accident 9 | account 10 | ache 11 | aching 12 | acorn 13 | acre 14 | across 15 | act 16 | acts 17 | add 18 | address 19 | admire 20 | adventure 21 | afar 22 | afraid 23 | after 24 | afternoon 25 | afterward 26 | afterwards 27 | again 28 | against 29 | age 30 | aged 31 | ago 32 | agree 33 | ah 34 | ahead 35 | aid 36 | aim 37 | air 38 | airfield 39 | airplane 40 | airport 41 | airship 42 | airy 43 | alarm 44 | alike 45 | alive 46 | all 47 | alley 48 | alligator 49 | allow 50 | almost 51 | alone 52 | along 53 | aloud 54 | already 55 | also 56 | always 57 | am 58 | America 59 | American 60 | among 61 | amount 62 | an 63 | and 64 | angel 65 | anger 66 | angry 67 | animal 68 | another 69 | answer 70 | ant 71 | any 72 | anybody 73 | anyhow 74 | anyone 75 | anything 76 | anyway 77 | anywhere 78 | apart 79 | apartment 80 | ape 81 | apiece 82 | appear 83 | apple 84 | April 85 | apron 86 | are 87 | aren't 88 | arise 89 | arithmetic 90 | arm 91 | armful 92 | army 93 | arose 94 | around 95 | arrange 96 | arrive 97 | arrived 98 | arrow 99 | art 100 | artist 101 | as 102 | ash 103 | ashes 104 | aside 105 | ask 106 | asleep 107 | at 108 | ate 109 | attack 110 | attend 111 | attention 112 | August 113 | aunt 114 | author 115 | auto 116 | automobile 117 | autumn 118 | avenue 119 | awake 120 | awaken 121 | away 122 | awful 123 | awfully 124 | awhile 125 | ax 126 | axe 127 | baa 128 | babe 129 | babies 130 | back 131 | background 132 | backward 133 | backwards 134 | bacon 135 | bad 136 | badge 137 | badly 138 | bag 139 | bake 140 | baker 141 | bakery 142 | baking 143 | ball 144 | balloon 145 | banana 146 | band 147 | bandage 148 | bang 149 | banjo 150 | bank 151 | banker 152 | bar 153 | barber 154 | bare 155 | barefoot 156 | barely 157 | bark 158 | barn 159 | barrel 160 | base 161 | baseball 162 | basement 163 | basket 164 | bat 165 | batch 166 | bath 167 | bathe 168 | bathing 169 | bathroom 170 | bathtub 171 | battle 172 | battleship 173 | bay 174 | be 175 | beach 176 | bead 177 | beam 178 | bean 179 | bear 180 | beard 181 | beast 182 | beat 183 | beating 184 | beautiful 185 | beautify 186 | beauty 187 | became 188 | because 189 | become 190 | becoming 191 | bed 192 | bedbug 193 | bedroom 194 | bedspread 195 | bedtime 196 | bee 197 | beech 198 | beef 199 | beefsteak 200 | beehive 201 | been 202 | beer 203 | beet 204 | before 205 | beg 206 | began 207 | beggar 208 | begged 209 | begin 210 | beginning 211 | begun 212 | behave 213 | behind 214 | being 215 | believe 216 | bell 217 | belong 218 | below 219 | belt 220 | bench 221 | bend 222 | beneath 223 | bent 224 | berries 225 | berry 226 | beside 227 | besides 228 | best 229 | bet 230 | better 231 | between 232 | bib 233 | bible 234 | bicycle 235 | bid 236 | big 237 | bigger 238 | bill 239 | billboard 240 | bin 241 | bind 242 | bird 243 | birth 244 | birthday 245 | biscuit 246 | bit 247 | bite 248 | biting 249 | bitter 250 | black 251 | blackberry 252 | blackbird 253 | blackboard 254 | blackness 255 | blacksmith 256 | blame 257 | blank 258 | blanket 259 | blast 260 | blaze 261 | bleed 262 | bless 263 | blessing 264 | blew 265 | blind 266 | blindfold 267 | blinds 268 | block 269 | 
blood 270 | bloom 271 | blossom 272 | blot 273 | blow 274 | blue 275 | blueberry 276 | bluebird 277 | blush 278 | board 279 | boast 280 | boat 281 | bob 282 | bobwhite 283 | bodies 284 | body 285 | boil 286 | boiler 287 | bold 288 | bone 289 | bonnet 290 | boo 291 | book 292 | bookcase 293 | bookkeeper 294 | boom 295 | boot 296 | born 297 | borrow 298 | boss 299 | both 300 | bother 301 | bottle 302 | bottom 303 | bought 304 | bounce 305 | bow 306 | bowl 307 | bow-wow 308 | box 309 | boxcar 310 | boxer 311 | boxes 312 | boy 313 | boyhood 314 | bracelet 315 | brain 316 | brake 317 | bran 318 | branch 319 | brass 320 | brave 321 | bread 322 | break 323 | breakfast 324 | breast 325 | breath 326 | breathe 327 | breeze 328 | brick 329 | bride 330 | bridge 331 | bright 332 | brightness 333 | bring 334 | broad 335 | broadcast 336 | broke 337 | broken 338 | brook 339 | broom 340 | brother 341 | brought 342 | brown 343 | brush 344 | bubble 345 | bucket 346 | buckle 347 | bud 348 | buffalo 349 | bug 350 | buggy 351 | build 352 | building 353 | built 354 | bulb 355 | bull 356 | bullet 357 | bum 358 | bumblebee 359 | bump 360 | bun 361 | bunch 362 | bundle 363 | bunny 364 | burn 365 | burst 366 | bury 367 | bus 368 | bush 369 | bushel 370 | business 371 | busy 372 | but 373 | butcher 374 | butt 375 | butter 376 | buttercup 377 | butterfly 378 | buttermilk 379 | butterscotch 380 | button 381 | buttonhole 382 | buy 383 | buzz 384 | by 385 | bye 386 | cab 387 | cabbage 388 | cabin 389 | cabinet 390 | cackle 391 | cage 392 | cake 393 | calendar 394 | calf 395 | call 396 | caller 397 | calling 398 | came 399 | camel 400 | camp 401 | campfire 402 | can 403 | canal 404 | canary 405 | candle 406 | candlestick 407 | candy 408 | cane 409 | cannon 410 | cannot 411 | canoe 412 | can't 413 | canyon 414 | cap 415 | cape 416 | capital 417 | captain 418 | car 419 | card 420 | cardboard 421 | care 422 | careful 423 | careless 424 | carelessness 425 | carload 426 | carpenter 427 | carpet 428 | carriage 429 | carrot 430 | carry 431 | cart 432 | carve 433 | case 434 | cash 435 | cashier 436 | castle 437 | cat 438 | catbird 439 | catch 440 | catcher 441 | caterpillar 442 | catfish 443 | catsup 444 | cattle 445 | caught 446 | cause 447 | cave 448 | ceiling 449 | cell 450 | cellar 451 | cent 452 | center 453 | cereal 454 | certain 455 | certainly 456 | chain 457 | chair 458 | chalk 459 | champion 460 | chance 461 | change 462 | chap 463 | charge 464 | charm 465 | chart 466 | chase 467 | chatter 468 | cheap 469 | cheat 470 | check 471 | checkers 472 | cheek 473 | cheer 474 | cheese 475 | cherry 476 | chest 477 | chew 478 | chick 479 | chicken 480 | chief 481 | child 482 | childhood 483 | children 484 | chill 485 | chilly 486 | chimney 487 | chin 488 | china 489 | chip 490 | chipmunk 491 | chocolate 492 | choice 493 | choose 494 | chop 495 | chorus 496 | chose 497 | chosen 498 | christen 499 | Christmas 500 | church 501 | churn 502 | cigarette 503 | circle 504 | circus 505 | citizen 506 | city 507 | clang 508 | clap 509 | class 510 | classmate 511 | classroom 512 | claw 513 | clay 514 | clean 515 | cleaner 516 | clear 517 | clerk 518 | clever 519 | click 520 | cliff 521 | climb 522 | clip 523 | cloak 524 | clock 525 | close 526 | closet 527 | cloth 528 | clothes 529 | clothing 530 | cloud 531 | cloudy 532 | clover 533 | clown 534 | club 535 | cluck 536 | clump 537 | coach 538 | coal 539 | coast 540 | coat 541 | cob 542 | cobbler 543 | cocoa 544 | coconut 545 | cocoon 546 | cod 547 | codfish 548 | coffee 549 | coffeepot 550 | 
coin 551 | cold 552 | collar 553 | college 554 | color 555 | colored 556 | colt 557 | column 558 | comb 559 | come 560 | comfort 561 | comic 562 | coming 563 | company 564 | compare 565 | conductor 566 | cone 567 | connect 568 | coo 569 | cook 570 | cooked 571 | cooking 572 | cookie 573 | cookies 574 | cool 575 | cooler 576 | coop 577 | copper 578 | copy 579 | cord 580 | cork 581 | corn 582 | corner 583 | correct 584 | cost 585 | cot 586 | cottage 587 | cotton 588 | couch 589 | cough 590 | could 591 | couldn't 592 | count 593 | counter 594 | country 595 | county 596 | course 597 | court 598 | cousin 599 | cover 600 | cow 601 | coward 602 | cowardly 603 | cowboy 604 | cozy 605 | crab 606 | crack 607 | cracker 608 | cradle 609 | cramps 610 | cranberry 611 | crank 612 | cranky 613 | crash 614 | crawl 615 | crazy 616 | cream 617 | creamy 618 | creek 619 | creep 620 | crept 621 | cried 622 | croak 623 | crook 624 | crooked 625 | crop 626 | cross 627 | crossing 628 | cross-eyed 629 | crow 630 | crowd 631 | crowded 632 | crown 633 | cruel 634 | crumb 635 | crumble 636 | crush 637 | crust 638 | cry 639 | cries 640 | cub 641 | cuff 642 | cup 643 | cuff 644 | cup 645 | cupboard 646 | cupful 647 | cure 648 | curl 649 | curly 650 | curtain 651 | curve 652 | cushion 653 | custard 654 | customer 655 | cut 656 | cute 657 | cutting 658 | dab 659 | dad 660 | daddy 661 | daily 662 | dairy 663 | daisy 664 | dam 665 | damage 666 | dame 667 | damp 668 | dance 669 | dancer 670 | dancing 671 | dandy 672 | danger 673 | dangerous 674 | dare 675 | dark 676 | darkness 677 | darling 678 | darn 679 | dart 680 | dash 681 | date 682 | daughter 683 | dawn 684 | day 685 | daybreak 686 | daytime 687 | dead 688 | deaf 689 | deal 690 | dear 691 | death 692 | December 693 | decide 694 | deck 695 | deed 696 | deep 697 | deer 698 | defeat 699 | defend 700 | defense 701 | delight 702 | den 703 | dentist 704 | depend 705 | deposit 706 | describe 707 | desert 708 | deserve 709 | desire 710 | desk 711 | destroy 712 | devil 713 | dew 714 | diamond 715 | did 716 | didn't 717 | die 718 | died 719 | dies 720 | difference 721 | different 722 | dig 723 | dim 724 | dime 725 | dine 726 | ding-dong 727 | dinner 728 | dip 729 | direct 730 | direction 731 | dirt 732 | dirty 733 | discover 734 | dish 735 | dislike 736 | dismiss 737 | ditch 738 | dive 739 | diver 740 | divide 741 | do 742 | dock 743 | doctor 744 | does 745 | doesn't 746 | dog 747 | doll 748 | dollar 749 | dolly 750 | done 751 | donkey 752 | don't 753 | door 754 | doorbell 755 | doorknob 756 | doorstep 757 | dope 758 | dot 759 | double 760 | dough 761 | dove 762 | down 763 | downstairs 764 | downtown 765 | dozen 766 | drag 767 | drain 768 | drank 769 | draw 770 | drawer 771 | draw 772 | drawing 773 | dream 774 | dress 775 | dresser 776 | dressmaker 777 | drew 778 | dried 779 | drift 780 | drill 781 | drink 782 | drip 783 | drive 784 | driven 785 | driver 786 | drop 787 | drove 788 | drown 789 | drowsy 790 | drub 791 | drum 792 | drunk 793 | dry 794 | duck 795 | due 796 | dug 797 | dull 798 | dumb 799 | dump 800 | during 801 | dust 802 | dusty 803 | duty 804 | dwarf 805 | dwell 806 | dwelt 807 | dying 808 | each 809 | eager 810 | eagle 811 | ear 812 | early 813 | earn 814 | earth 815 | east 816 | eastern 817 | easy 818 | eat 819 | eaten 820 | edge 821 | egg 822 | eh 823 | eight 824 | eighteen 825 | eighth 826 | eighty 827 | either 828 | elbow 829 | elder 830 | eldest 831 | electric 832 | electricity 833 | elephant 834 | eleven 835 | elf 836 | elm 837 | else 838 | elsewhere 839 | 
empty 840 | end 841 | ending 842 | enemy 843 | engine 844 | engineer 845 | English 846 | enjoy 847 | enough 848 | enter 849 | envelope 850 | equal 851 | erase 852 | eraser 853 | errand 854 | escape 855 | eve 856 | even 857 | evening 858 | ever 859 | every 860 | everybody 861 | everyday 862 | everyone 863 | everything 864 | everywhere 865 | evil 866 | exact 867 | except 868 | exchange 869 | excited 870 | exciting 871 | excuse 872 | exit 873 | expect 874 | explain 875 | extra 876 | eye 877 | eyebrow 878 | fable 879 | face 880 | facing 881 | fact 882 | factory 883 | fail 884 | faint 885 | fair 886 | fairy 887 | faith 888 | fake 889 | fall 890 | false 891 | family 892 | fan 893 | fancy 894 | far 895 | faraway 896 | fare 897 | farmer 898 | farm 899 | farming 900 | far-off 901 | farther 902 | fashion 903 | fast 904 | fasten 905 | fat 906 | father 907 | fault 908 | favor 909 | favorite 910 | fear 911 | feast 912 | feather 913 | February 914 | fed 915 | feed 916 | feel 917 | feet 918 | fell 919 | fellow 920 | felt 921 | fence 922 | fever 923 | few 924 | fib 925 | fiddle 926 | field 927 | fife 928 | fifteen 929 | fifth 930 | fifty 931 | fig 932 | fight 933 | figure 934 | file 935 | fill 936 | film 937 | finally 938 | find 939 | fine 940 | finger 941 | finish 942 | fire 943 | firearm 944 | firecracker 945 | fireplace 946 | fireworks 947 | firing 948 | first 949 | fish 950 | fisherman 951 | fist 952 | fit 953 | fits 954 | five 955 | fix 956 | flag 957 | flake 958 | flame 959 | flap 960 | flash 961 | flashlight 962 | flat 963 | flea 964 | flesh 965 | flew 966 | flies 967 | flight 968 | flip 969 | flip-flop 970 | float 971 | flock 972 | flood 973 | floor 974 | flop 975 | flour 976 | flow 977 | flower 978 | flowery 979 | flutter 980 | fly 981 | foam 982 | fog 983 | foggy 984 | fold 985 | folks 986 | follow 987 | following 988 | fond 989 | food 990 | fool 991 | foolish 992 | foot 993 | football 994 | footprint 995 | for 996 | forehead 997 | forest 998 | forget 999 | forgive 1000 | forgot 1001 | forgotten 1002 | fork 1003 | form 1004 | fort 1005 | forth 1006 | fortune 1007 | forty 1008 | forward 1009 | fought 1010 | found 1011 | fountain 1012 | four 1013 | fourteen 1014 | fourth 1015 | fox 1016 | frame 1017 | free 1018 | freedom 1019 | freeze 1020 | freight 1021 | French 1022 | fresh 1023 | fret 1024 | Friday 1025 | fried 1026 | friend 1027 | friendly 1028 | friendship 1029 | frighten 1030 | frog 1031 | from 1032 | front 1033 | frost 1034 | frown 1035 | froze 1036 | fruit 1037 | fry 1038 | fudge 1039 | fuel 1040 | full 1041 | fully 1042 | fun 1043 | funny 1044 | fur 1045 | furniture 1046 | further 1047 | fuzzy 1048 | gain 1049 | gallon 1050 | gallop 1051 | game 1052 | gang 1053 | garage 1054 | garbage 1055 | garden 1056 | gas 1057 | gasoline 1058 | gate 1059 | gather 1060 | gave 1061 | gay 1062 | gear 1063 | geese 1064 | general 1065 | gentle 1066 | gentleman 1067 | gentlemen 1068 | geography 1069 | get 1070 | getting 1071 | giant 1072 | gift 1073 | gingerbread 1074 | girl 1075 | give 1076 | given 1077 | giving 1078 | glad 1079 | gladly 1080 | glance 1081 | glass 1082 | glasses 1083 | gleam 1084 | glide 1085 | glory 1086 | glove 1087 | glow 1088 | glue 1089 | go 1090 | going 1091 | goes 1092 | goal 1093 | goat 1094 | gobble 1095 | God 1096 | god 1097 | godmother 1098 | gold 1099 | golden 1100 | goldfish 1101 | golf 1102 | gone 1103 | good 1104 | goods 1105 | goodbye 1106 | good-by 1107 | goodbye 1108 | good-bye 1109 | good-looking 1110 | goodness 1111 | goody 1112 | goose 1113 | gooseberry 1114 | got 1115 
| govern 1116 | government 1117 | gown 1118 | grab 1119 | gracious 1120 | grade 1121 | grain 1122 | grand 1123 | grandchild 1124 | grandchildren 1125 | granddaughter 1126 | grandfather 1127 | grandma 1128 | grandmother 1129 | grandpa 1130 | grandson 1131 | grandstand 1132 | grape 1133 | grapes 1134 | grapefruit 1135 | grass 1136 | grasshopper 1137 | grateful 1138 | grave 1139 | gravel 1140 | graveyard 1141 | gravy 1142 | gray 1143 | graze 1144 | grease 1145 | great 1146 | green 1147 | greet 1148 | grew 1149 | grind 1150 | groan 1151 | grocery 1152 | ground 1153 | group 1154 | grove 1155 | grow 1156 | guard 1157 | guess 1158 | guest 1159 | guide 1160 | gulf 1161 | gum 1162 | gun 1163 | gunpowder 1164 | guy 1165 | ha 1166 | habit 1167 | had 1168 | hadn't 1169 | hail 1170 | hair 1171 | haircut 1172 | hairpin 1173 | half 1174 | hall 1175 | halt 1176 | ham 1177 | hammer 1178 | hand 1179 | handful 1180 | handkerchief 1181 | handle 1182 | handwriting 1183 | hang 1184 | happen 1185 | happily 1186 | happiness 1187 | happy 1188 | harbor 1189 | hard 1190 | hardly 1191 | hardship 1192 | hardware 1193 | hare 1194 | hark 1195 | harm 1196 | harness 1197 | harp 1198 | harvest 1199 | has 1200 | hasn't 1201 | haste 1202 | hasten 1203 | hasty 1204 | hat 1205 | hatch 1206 | hatchet 1207 | hate 1208 | haul 1209 | have 1210 | haven't 1211 | having 1212 | hawk 1213 | hay 1214 | hayfield 1215 | haystack 1216 | he 1217 | head 1218 | headache 1219 | heal 1220 | health 1221 | healthy 1222 | heap 1223 | hear 1224 | hearing 1225 | heard 1226 | heart 1227 | heat 1228 | heater 1229 | heaven 1230 | heavy 1231 | he'd 1232 | heel 1233 | height 1234 | held 1235 | hell 1236 | he'll 1237 | hello 1238 | helmet 1239 | help 1240 | helper 1241 | helpful 1242 | hem 1243 | hen 1244 | henhouse 1245 | her 1246 | hers 1247 | herd 1248 | here 1249 | here's 1250 | hero 1251 | herself 1252 | he's 1253 | hey 1254 | hickory 1255 | hid 1256 | hidden 1257 | hide 1258 | high 1259 | highway 1260 | hill 1261 | hillside 1262 | hilltop 1263 | hilly 1264 | him 1265 | himself 1266 | hind 1267 | hint 1268 | hip 1269 | hire 1270 | his 1271 | hiss 1272 | history 1273 | hit 1274 | hitch 1275 | hive 1276 | ho 1277 | hoe 1278 | hog 1279 | hold 1280 | holder 1281 | hole 1282 | holiday 1283 | hollow 1284 | holy 1285 | home 1286 | homely 1287 | homesick 1288 | honest 1289 | honey 1290 | honeybee 1291 | honeymoon 1292 | honk 1293 | honor 1294 | hood 1295 | hoof 1296 | hook 1297 | hoop 1298 | hop 1299 | hope 1300 | hopeful 1301 | hopeless 1302 | horn 1303 | horse 1304 | horseback 1305 | horseshoe 1306 | hose 1307 | hospital 1308 | host 1309 | hot 1310 | hotel 1311 | hound 1312 | hour 1313 | house 1314 | housetop 1315 | housewife 1316 | housework 1317 | how 1318 | however 1319 | howl 1320 | hug 1321 | huge 1322 | hum 1323 | humble 1324 | hump 1325 | hundred 1326 | hung 1327 | hunger 1328 | hungry 1329 | hunk 1330 | hunt 1331 | hunter 1332 | hurrah 1333 | hurried 1334 | hurry 1335 | hurt 1336 | husband 1337 | hush 1338 | hut 1339 | hymn 1340 | I 1341 | ice 1342 | icy 1343 | I'd 1344 | idea 1345 | ideal 1346 | if 1347 | ill 1348 | I'll 1349 | I'm 1350 | important 1351 | impossible 1352 | improve 1353 | in 1354 | inch 1355 | inches 1356 | income 1357 | indeed 1358 | Indian 1359 | indoors 1360 | ink 1361 | inn 1362 | insect 1363 | inside 1364 | instant 1365 | instead 1366 | insult 1367 | intend 1368 | interested 1369 | interesting 1370 | into 1371 | invite 1372 | iron 1373 | is 1374 | island 1375 | isn't 1376 | it 1377 | its 1378 | it's 1379 | itself 1380 | I've 
1381 | ivory 1382 | ivy 1383 | jacket 1384 | jacks 1385 | jail 1386 | jam 1387 | January 1388 | jar 1389 | jaw 1390 | jay 1391 | jelly 1392 | jellyfish 1393 | jerk 1394 | jig 1395 | job 1396 | jockey 1397 | join 1398 | joke 1399 | joking 1400 | jolly 1401 | journey 1402 | joy 1403 | joyful 1404 | joyous 1405 | judge 1406 | jug 1407 | juice 1408 | juicy 1409 | July 1410 | jump 1411 | June 1412 | junior 1413 | junk 1414 | just 1415 | keen 1416 | keep 1417 | kept 1418 | kettle 1419 | key 1420 | kick 1421 | kid 1422 | kill 1423 | killed 1424 | kind 1425 | kindly 1426 | kindness 1427 | king 1428 | kingdom 1429 | kiss 1430 | kitchen 1431 | kite 1432 | kitten 1433 | kitty 1434 | knee 1435 | kneel 1436 | knew 1437 | knife 1438 | knit 1439 | knives 1440 | knob 1441 | knock 1442 | knot 1443 | know 1444 | known 1445 | lace 1446 | lad 1447 | ladder 1448 | ladies 1449 | lady 1450 | laid 1451 | lake 1452 | lamb 1453 | lame 1454 | lamp 1455 | land 1456 | lane 1457 | language 1458 | lantern 1459 | lap 1460 | lard 1461 | large 1462 | lash 1463 | lass 1464 | last 1465 | late 1466 | laugh 1467 | laundry 1468 | law 1469 | lawn 1470 | lawyer 1471 | lay 1472 | lazy 1473 | lead 1474 | leader 1475 | leaf 1476 | leak 1477 | lean 1478 | leap 1479 | learn 1480 | learned 1481 | least 1482 | leather 1483 | leave 1484 | leaving 1485 | led 1486 | left 1487 | leg 1488 | lemon 1489 | lemonade 1490 | lend 1491 | length 1492 | less 1493 | lesson 1494 | let 1495 | let's 1496 | letter 1497 | letting 1498 | lettuce 1499 | level 1500 | liberty 1501 | library 1502 | lice 1503 | lick 1504 | lid 1505 | lie 1506 | life 1507 | lift 1508 | light 1509 | lightness 1510 | lightning 1511 | like 1512 | likely 1513 | liking 1514 | lily 1515 | limb 1516 | lime 1517 | limp 1518 | line 1519 | linen 1520 | lion 1521 | lip 1522 | list 1523 | listen 1524 | lit 1525 | little 1526 | live 1527 | lives 1528 | lively 1529 | liver 1530 | living 1531 | lizard 1532 | load 1533 | loaf 1534 | loan 1535 | loaves 1536 | lock 1537 | locomotive 1538 | log 1539 | lone 1540 | lonely 1541 | lonesome 1542 | long 1543 | look 1544 | lookout 1545 | loop 1546 | loose 1547 | lord 1548 | lose 1549 | loser 1550 | loss 1551 | lost 1552 | lot 1553 | loud 1554 | love 1555 | lovely 1556 | lover 1557 | low 1558 | luck 1559 | lucky 1560 | lumber 1561 | lump 1562 | lunch 1563 | lying 1564 | ma 1565 | machine 1566 | machinery 1567 | mad 1568 | made 1569 | magazine 1570 | magic 1571 | maid 1572 | mail 1573 | mailbox 1574 | mailman 1575 | major 1576 | make 1577 | making 1578 | male 1579 | mama 1580 | mamma 1581 | man 1582 | manager 1583 | mane 1584 | manger 1585 | many 1586 | map 1587 | maple 1588 | marble 1589 | march 1590 | March 1591 | mare 1592 | mark 1593 | market 1594 | marriage 1595 | married 1596 | marry 1597 | mask 1598 | mast 1599 | master 1600 | mat 1601 | match 1602 | matter 1603 | mattress 1604 | may 1605 | May 1606 | maybe 1607 | mayor 1608 | maypole 1609 | me 1610 | meadow 1611 | meal 1612 | mean 1613 | means 1614 | meant 1615 | measure 1616 | meat 1617 | medicine 1618 | meet 1619 | meeting 1620 | melt 1621 | member 1622 | men 1623 | mend 1624 | meow 1625 | merry 1626 | mess 1627 | message 1628 | met 1629 | metal 1630 | mew 1631 | mice 1632 | middle 1633 | midnight 1634 | might 1635 | mighty 1636 | mile 1637 | milk 1638 | milkman 1639 | mill 1640 | miler 1641 | million 1642 | mind 1643 | mine 1644 | miner 1645 | mint 1646 | minute 1647 | mirror 1648 | mischief 1649 | miss 1650 | Miss 1651 | misspell 1652 | mistake 1653 | misty 1654 | mitt 1655 | mitten 1656 | mix 
1657 | moment 1658 | Monday 1659 | money 1660 | monkey 1661 | month 1662 | moo 1663 | moon 1664 | moonlight 1665 | moose 1666 | mop 1667 | more 1668 | morning 1669 | morrow 1670 | moss 1671 | most 1672 | mostly 1673 | mother 1674 | motor 1675 | mount 1676 | mountain 1677 | mouse 1678 | mouth 1679 | move 1680 | movie 1681 | movies 1682 | moving 1683 | mow 1684 | Mr. 1685 | Mrs. 1686 | much 1687 | mud 1688 | muddy 1689 | mug 1690 | mule 1691 | multiply 1692 | murder 1693 | music 1694 | must 1695 | my 1696 | myself 1697 | nail 1698 | name 1699 | nap 1700 | napkin 1701 | narrow 1702 | nasty 1703 | naughty 1704 | navy 1705 | near 1706 | nearby 1707 | nearly 1708 | neat 1709 | neck 1710 | necktie 1711 | need 1712 | needle 1713 | needn't 1714 | Negro 1715 | neighbor 1716 | neighborhood 1717 | neither 1718 | nerve 1719 | nest 1720 | net 1721 | never 1722 | nevermore 1723 | new 1724 | news 1725 | newspaper 1726 | next 1727 | nibble 1728 | nice 1729 | nickel 1730 | night 1731 | nightgown 1732 | nine 1733 | nineteen 1734 | ninety 1735 | no 1736 | nobody 1737 | nod 1738 | noise 1739 | noisy 1740 | none 1741 | noon 1742 | nor 1743 | north 1744 | northern 1745 | nose 1746 | not 1747 | note 1748 | nothing 1749 | notice 1750 | November 1751 | now 1752 | nowhere 1753 | number 1754 | nurse 1755 | nut 1756 | oak 1757 | oar 1758 | oatmeal 1759 | oats 1760 | obey 1761 | ocean 1762 | o'clock 1763 | October 1764 | odd 1765 | of 1766 | off 1767 | offer 1768 | office 1769 | officer 1770 | often 1771 | oh 1772 | oil 1773 | old 1774 | old-fashioned 1775 | on 1776 | once 1777 | one 1778 | onion 1779 | only 1780 | onward 1781 | open 1782 | or 1783 | orange 1784 | orchard 1785 | order 1786 | ore 1787 | organ 1788 | other 1789 | otherwise 1790 | ouch 1791 | ought 1792 | our 1793 | ours 1794 | ourselves 1795 | out 1796 | outdoors 1797 | outfit 1798 | outlaw 1799 | outline 1800 | outside 1801 | outward 1802 | oven 1803 | over 1804 | overalls 1805 | overcoat 1806 | overeat 1807 | overhead 1808 | overhear 1809 | overnight 1810 | overturn 1811 | owe 1812 | owing 1813 | owl 1814 | own 1815 | owner 1816 | ox 1817 | pa 1818 | pace 1819 | pack 1820 | package 1821 | pad 1822 | page 1823 | paid 1824 | pail 1825 | pain 1826 | painful 1827 | paint 1828 | painter 1829 | painting 1830 | pair 1831 | pal 1832 | palace 1833 | pale 1834 | pan 1835 | pancake 1836 | pane 1837 | pansy 1838 | pants 1839 | papa 1840 | paper 1841 | parade 1842 | pardon 1843 | parent 1844 | park 1845 | part 1846 | partly 1847 | partner 1848 | party 1849 | pass 1850 | passenger 1851 | past 1852 | paste 1853 | pasture 1854 | pat 1855 | patch 1856 | path 1857 | patter 1858 | pave 1859 | pavement 1860 | paw 1861 | pay 1862 | payment 1863 | pea 1864 | peas 1865 | peace 1866 | peaceful 1867 | peach 1868 | peaches 1869 | peak 1870 | peanut 1871 | pear 1872 | pearl 1873 | peck 1874 | peek 1875 | peel 1876 | peep 1877 | peg 1878 | pen 1879 | pencil 1880 | penny 1881 | people 1882 | pepper 1883 | peppermint 1884 | perfume 1885 | perhaps 1886 | person 1887 | pet 1888 | phone 1889 | piano 1890 | pick 1891 | pickle 1892 | picnic 1893 | picture 1894 | pie 1895 | piece 1896 | pig 1897 | pigeon 1898 | piggy 1899 | pile 1900 | pill 1901 | pillow 1902 | pin 1903 | pine 1904 | pineapple 1905 | pink 1906 | pint 1907 | pipe 1908 | pistol 1909 | pit 1910 | pitch 1911 | pitcher 1912 | pity 1913 | place 1914 | plain 1915 | plan 1916 | plane 1917 | plant 1918 | plate 1919 | platform 1920 | platter 1921 | play 1922 | player 1923 | playground 1924 | playhouse 1925 | playmate 1926 | 
plaything 1927 | pleasant 1928 | please 1929 | pleasure 1930 | plenty 1931 | plow 1932 | plug 1933 | plum 1934 | pocket 1935 | pocketbook 1936 | poem 1937 | point 1938 | poison 1939 | poke 1940 | pole 1941 | police 1942 | policeman 1943 | polish 1944 | polite 1945 | pond 1946 | ponies 1947 | pony 1948 | pool 1949 | poor 1950 | pop 1951 | popcorn 1952 | popped 1953 | porch 1954 | pork 1955 | possible 1956 | post 1957 | postage 1958 | postman 1959 | pot 1960 | potato 1961 | potatoes 1962 | pound 1963 | pour 1964 | powder 1965 | power 1966 | powerful 1967 | praise 1968 | pray 1969 | prayer 1970 | prepare 1971 | present 1972 | pretty 1973 | price 1974 | prick 1975 | prince 1976 | princess 1977 | print 1978 | prison 1979 | prize 1980 | promise 1981 | proper 1982 | protect 1983 | proud 1984 | prove 1985 | prune 1986 | public 1987 | puddle 1988 | puff 1989 | pull 1990 | pump 1991 | pumpkin 1992 | punch 1993 | punish 1994 | pup 1995 | pupil 1996 | puppy 1997 | pure 1998 | purple 1999 | purse 2000 | push 2001 | puss 2002 | pussy 2003 | pussycat 2004 | put 2005 | putting 2006 | puzzle 2007 | quack 2008 | quart 2009 | quarter 2010 | queen 2011 | queer 2012 | question 2013 | quick 2014 | quickly 2015 | quiet 2016 | quilt 2017 | quit 2018 | quite 2019 | rabbit 2020 | race 2021 | rack 2022 | radio 2023 | radish 2024 | rag 2025 | rail 2026 | railroad 2027 | railway 2028 | rain 2029 | rainy 2030 | rainbow 2031 | raise 2032 | raisin 2033 | rake 2034 | ram 2035 | ran 2036 | ranch 2037 | rang 2038 | rap 2039 | rapidly 2040 | rat 2041 | rate 2042 | rather 2043 | rattle 2044 | raw 2045 | ray 2046 | reach 2047 | read 2048 | reader 2049 | reading 2050 | ready 2051 | real 2052 | really 2053 | reap 2054 | rear 2055 | reason 2056 | rebuild 2057 | receive 2058 | recess 2059 | record 2060 | red 2061 | redbird 2062 | redbreast 2063 | refuse 2064 | reindeer 2065 | rejoice 2066 | remain 2067 | remember 2068 | remind 2069 | remove 2070 | rent 2071 | repair 2072 | repay 2073 | repeat 2074 | report 2075 | rest 2076 | return 2077 | review 2078 | reward 2079 | rib 2080 | ribbon 2081 | rice 2082 | rich 2083 | rid 2084 | riddle 2085 | ride 2086 | rider 2087 | riding 2088 | right 2089 | rim 2090 | ring 2091 | rip 2092 | ripe 2093 | rise 2094 | rising 2095 | river 2096 | road 2097 | roadside 2098 | roar 2099 | roast 2100 | rob 2101 | robber 2102 | robe 2103 | robin 2104 | rock 2105 | rocky 2106 | rocket 2107 | rode 2108 | roll 2109 | roller 2110 | roof 2111 | room 2112 | rooster 2113 | root 2114 | rope 2115 | rose 2116 | rosebud 2117 | rot 2118 | rotten 2119 | rough 2120 | round 2121 | route 2122 | row 2123 | rowboat 2124 | royal 2125 | rub 2126 | rubbed 2127 | rubber 2128 | rubbish 2129 | rug 2130 | rule 2131 | ruler 2132 | rumble 2133 | run 2134 | rung 2135 | runner 2136 | running 2137 | rush 2138 | rust 2139 | rusty 2140 | rye 2141 | sack 2142 | sad 2143 | saddle 2144 | sadness 2145 | safe 2146 | safety 2147 | said 2148 | sail 2149 | sailboat 2150 | sailor 2151 | saint 2152 | salad 2153 | sale 2154 | salt 2155 | same 2156 | sand 2157 | sandy 2158 | sandwich 2159 | sang 2160 | sank 2161 | sap 2162 | sash 2163 | sat 2164 | satin 2165 | satisfactory 2166 | Saturday 2167 | sausage 2168 | savage 2169 | save 2170 | savings 2171 | saw 2172 | say 2173 | scab 2174 | scales 2175 | scare 2176 | scarf 2177 | school 2178 | schoolboy 2179 | schoolhouse 2180 | schoolmaster 2181 | schoolroom 2182 | scorch 2183 | score 2184 | scrap 2185 | scrape 2186 | scratch 2187 | scream 2188 | screen 2189 | screw 2190 | scrub 2191 | sea 2192 | seal 2193 | 
seam 2194 | search 2195 | season 2196 | seat 2197 | second 2198 | secret 2199 | see 2200 | seeing 2201 | seed 2202 | seek 2203 | seem 2204 | seen 2205 | seesaw 2206 | select 2207 | self 2208 | selfish 2209 | sell 2210 | send 2211 | sense 2212 | sent 2213 | sentence 2214 | separate 2215 | September 2216 | servant 2217 | serve 2218 | service 2219 | set 2220 | setting 2221 | settle 2222 | settlement 2223 | seven 2224 | seventeen 2225 | seventh 2226 | seventy 2227 | several 2228 | sew 2229 | shade 2230 | shadow 2231 | shady 2232 | shake 2233 | shaker 2234 | shaking 2235 | shall 2236 | shame 2237 | shan't 2238 | shape 2239 | share 2240 | sharp 2241 | shave 2242 | she 2243 | she'd 2244 | she'll 2245 | she's 2246 | shear 2247 | shears 2248 | shed 2249 | sheep 2250 | sheet 2251 | shelf 2252 | shell 2253 | shepherd 2254 | shine 2255 | shining 2256 | shiny 2257 | ship 2258 | shirt 2259 | shock 2260 | shoe 2261 | shoemaker 2262 | shone 2263 | shook 2264 | shoot 2265 | shop 2266 | shopping 2267 | shore 2268 | short 2269 | shot 2270 | should 2271 | shoulder 2272 | shouldn't 2273 | shout 2274 | shovel 2275 | show 2276 | shower 2277 | shut 2278 | shy 2279 | sick 2280 | sickness 2281 | side 2282 | sidewalk 2283 | sideways 2284 | sigh 2285 | sight 2286 | sign 2287 | silence 2288 | silent 2289 | silk 2290 | sill 2291 | silly 2292 | silver 2293 | simple 2294 | sin 2295 | since 2296 | sing 2297 | singer 2298 | single 2299 | sink 2300 | sip 2301 | sir 2302 | sis 2303 | sissy 2304 | sister 2305 | sit 2306 | sitting 2307 | six 2308 | sixteen 2309 | sixth 2310 | sixty 2311 | size 2312 | skate 2313 | skater 2314 | ski 2315 | skin 2316 | skip 2317 | skirt 2318 | sky 2319 | slam 2320 | slap 2321 | slate 2322 | slave 2323 | sled 2324 | sleep 2325 | sleepy 2326 | sleeve 2327 | sleigh 2328 | slept 2329 | slice 2330 | slid 2331 | slide 2332 | sling 2333 | slip 2334 | slipped 2335 | slipper 2336 | slippery 2337 | slit 2338 | slow 2339 | slowly 2340 | sly 2341 | smack 2342 | small 2343 | smart 2344 | smell 2345 | smile 2346 | smoke 2347 | smooth 2348 | snail 2349 | snake 2350 | snap 2351 | snapping 2352 | sneeze 2353 | snow 2354 | snowy 2355 | snowball 2356 | snowflake 2357 | snuff 2358 | snug 2359 | so 2360 | soak 2361 | soap 2362 | sob 2363 | socks 2364 | sod 2365 | soda 2366 | sofa 2367 | soft 2368 | soil 2369 | sold 2370 | soldier 2371 | sole 2372 | some 2373 | somebody 2374 | somehow 2375 | someone 2376 | something 2377 | sometime 2378 | sometimes 2379 | somewhere 2380 | son 2381 | song 2382 | soon 2383 | sore 2384 | sorrow 2385 | sorry 2386 | sort 2387 | soul 2388 | sound 2389 | soup 2390 | sour 2391 | south 2392 | southern 2393 | space 2394 | spade 2395 | spank 2396 | sparrow 2397 | speak 2398 | speaker 2399 | spear 2400 | speech 2401 | speed 2402 | spell 2403 | spelling 2404 | spend 2405 | spent 2406 | spider 2407 | spike 2408 | spill 2409 | spin 2410 | spinach 2411 | spirit 2412 | spit 2413 | splash 2414 | spoil 2415 | spoke 2416 | spook 2417 | spoon 2418 | sport 2419 | spot 2420 | spread 2421 | spring 2422 | springtime 2423 | sprinkle 2424 | square 2425 | squash 2426 | squeak 2427 | squeeze 2428 | squirrel 2429 | stable 2430 | stack 2431 | stage 2432 | stair 2433 | stall 2434 | stamp 2435 | stand 2436 | star 2437 | stare 2438 | start 2439 | starve 2440 | state 2441 | station 2442 | stay 2443 | steak 2444 | steal 2445 | steam 2446 | steamboat 2447 | steamer 2448 | steel 2449 | steep 2450 | steeple 2451 | steer 2452 | stem 2453 | step 2454 | stepping 2455 | stick 2456 | sticky 2457 | stiff 2458 | still 2459 | 
stillness 2460 | sting 2461 | stir 2462 | stitch 2463 | stock 2464 | stocking 2465 | stole 2466 | stone 2467 | stood 2468 | stool 2469 | stoop 2470 | stop 2471 | stopped 2472 | stopping 2473 | store 2474 | stork 2475 | stories 2476 | storm 2477 | stormy 2478 | story 2479 | stove 2480 | straight 2481 | strange 2482 | stranger 2483 | strap 2484 | straw 2485 | strawberry 2486 | stream 2487 | street 2488 | stretch 2489 | string 2490 | strip 2491 | stripes 2492 | strong 2493 | stuck 2494 | study 2495 | stuff 2496 | stump 2497 | stung 2498 | subject 2499 | such 2500 | suck 2501 | sudden 2502 | suffer 2503 | sugar 2504 | suit 2505 | sum 2506 | summer 2507 | sun 2508 | Sunday 2509 | sunflower 2510 | sung 2511 | sunk 2512 | sunlight 2513 | sunny 2514 | sunrise 2515 | sunset 2516 | sunshine 2517 | supper 2518 | suppose 2519 | sure 2520 | surely 2521 | surface 2522 | surprise 2523 | swallow 2524 | swam 2525 | swamp 2526 | swan 2527 | swat 2528 | swear 2529 | sweat 2530 | sweater 2531 | sweep 2532 | sweet 2533 | sweetness 2534 | sweetheart 2535 | swell 2536 | swept 2537 | swift 2538 | swim 2539 | swimming 2540 | swing 2541 | switch 2542 | sword 2543 | swore 2544 | table 2545 | tablecloth 2546 | tablespoon 2547 | tablet 2548 | tack 2549 | tag 2550 | tail 2551 | tailor 2552 | take 2553 | taken 2554 | taking 2555 | tale 2556 | talk 2557 | talker 2558 | tall 2559 | tame 2560 | tan 2561 | tank 2562 | tap 2563 | tape 2564 | tar 2565 | tardy 2566 | task 2567 | taste 2568 | taught 2569 | tax 2570 | tea 2571 | teach 2572 | teacher 2573 | team 2574 | tear 2575 | tease 2576 | teaspoon 2577 | teeth 2578 | telephone 2579 | tell 2580 | temper 2581 | ten 2582 | tennis 2583 | tent 2584 | term 2585 | terrible 2586 | test 2587 | than 2588 | thank 2589 | thanks 2590 | thankful 2591 | Thanksgiving 2592 | that 2593 | that's 2594 | the 2595 | theater 2596 | thee 2597 | their 2598 | them 2599 | then 2600 | there 2601 | these 2602 | they 2603 | they'd 2604 | they'll 2605 | they're 2606 | they've 2607 | thick 2608 | thief 2609 | thimble 2610 | thin 2611 | thing 2612 | think 2613 | third 2614 | thirsty 2615 | thirteen 2616 | thirty 2617 | this 2618 | thorn 2619 | those 2620 | though 2621 | thought 2622 | thousand 2623 | thread 2624 | three 2625 | threw 2626 | throat 2627 | throne 2628 | through 2629 | throw 2630 | thrown 2631 | thumb 2632 | thunder 2633 | Thursday 2634 | thy 2635 | tick 2636 | ticket 2637 | tickle 2638 | tie 2639 | tiger 2640 | tight 2641 | till 2642 | time 2643 | tin 2644 | tinkle 2645 | tiny 2646 | tip 2647 | tiptoe 2648 | tire 2649 | tired 2650 | title 2651 | to 2652 | toad 2653 | toadstool 2654 | toast 2655 | tobacco 2656 | today 2657 | toe 2658 | together 2659 | toilet 2660 | told 2661 | tomato 2662 | tomorrow 2663 | ton 2664 | tone 2665 | tongue 2666 | tonight 2667 | too 2668 | took 2669 | tool 2670 | toot 2671 | tooth 2672 | toothbrush 2673 | toothpick 2674 | top 2675 | tore 2676 | torn 2677 | toss 2678 | touch 2679 | tow 2680 | toward 2681 | towards 2682 | towel 2683 | tower 2684 | town 2685 | toy 2686 | trace 2687 | track 2688 | trade 2689 | train 2690 | tramp 2691 | trap 2692 | tray 2693 | treasure 2694 | treat 2695 | tree 2696 | trick 2697 | tricycle 2698 | tried 2699 | trim 2700 | trip 2701 | trolley 2702 | trouble 2703 | truck 2704 | true 2705 | truly 2706 | trunk 2707 | trust 2708 | truth 2709 | try 2710 | tub 2711 | Tuesday 2712 | tug 2713 | tulip 2714 | tumble 2715 | tune 2716 | tunnel 2717 | turkey 2718 | turn 2719 | turtle 2720 | twelve 2721 | twenty 2722 | twice 2723 | twig 2724 | twin 2725 
| two 2726 | ugly 2727 | umbrella 2728 | uncle 2729 | under 2730 | understand 2731 | underwear 2732 | undress 2733 | unfair 2734 | unfinished 2735 | unfold 2736 | unfriendly 2737 | unhappy 2738 | unhurt 2739 | uniform 2740 | United 2741 | States 2742 | unkind 2743 | unknown 2744 | unless 2745 | unpleasant 2746 | until 2747 | unwilling 2748 | up 2749 | upon 2750 | upper 2751 | upset 2752 | upside 2753 | upstairs 2754 | uptown 2755 | upward 2756 | us 2757 | use 2758 | used 2759 | useful 2760 | valentine 2761 | valley 2762 | valuable 2763 | value 2764 | vase 2765 | vegetable 2766 | velvet 2767 | very 2768 | vessel 2769 | victory 2770 | view 2771 | village 2772 | vine 2773 | violet 2774 | visit 2775 | visitor 2776 | voice 2777 | vote 2778 | wag 2779 | wagon 2780 | waist 2781 | wait 2782 | wake 2783 | waken 2784 | walk 2785 | wall 2786 | walnut 2787 | want 2788 | war 2789 | warm 2790 | warn 2791 | was 2792 | wash 2793 | washer 2794 | washtub 2795 | wasn't 2796 | waste 2797 | watch 2798 | watchman 2799 | water 2800 | watermelon 2801 | waterproof 2802 | wave 2803 | wax 2804 | way 2805 | wayside 2806 | we 2807 | weak 2808 | weakness 2809 | weaken 2810 | wealth 2811 | weapon 2812 | wear 2813 | weary 2814 | weather 2815 | weave 2816 | web 2817 | we'd 2818 | wedding 2819 | Wednesday 2820 | wee 2821 | weed 2822 | week 2823 | we'll 2824 | weep 2825 | weigh 2826 | welcome 2827 | well 2828 | went 2829 | were 2830 | we're 2831 | west 2832 | western 2833 | wet 2834 | we've 2835 | whale 2836 | what 2837 | what's 2838 | wheat 2839 | wheel 2840 | when 2841 | whenever 2842 | where 2843 | which 2844 | while 2845 | whip 2846 | whipped 2847 | whirl 2848 | whisky 2849 | whiskey 2850 | whisper 2851 | whistle 2852 | white 2853 | who 2854 | who'd 2855 | whole 2856 | who'll 2857 | whom 2858 | who's 2859 | whose 2860 | why 2861 | wicked 2862 | wide 2863 | wife 2864 | wiggle 2865 | wild 2866 | wildcat 2867 | will 2868 | willing 2869 | willow 2870 | win 2871 | wind 2872 | windy 2873 | windmill 2874 | window 2875 | wine 2876 | wing 2877 | wink 2878 | winner 2879 | winter 2880 | wipe 2881 | wire 2882 | wise 2883 | wish 2884 | wit 2885 | witch 2886 | with 2887 | without 2888 | woke 2889 | wolf 2890 | woman 2891 | women 2892 | won 2893 | wonder 2894 | wonderful 2895 | won't 2896 | wood 2897 | wooden 2898 | woodpecker 2899 | woods 2900 | wool 2901 | woolen 2902 | word 2903 | wore 2904 | work 2905 | worker 2906 | workman 2907 | world 2908 | worm 2909 | worn 2910 | worry 2911 | worse 2912 | worst 2913 | worth 2914 | would 2915 | wouldn't 2916 | wound 2917 | wove 2918 | wrap 2919 | wrapped 2920 | wreck 2921 | wren 2922 | wring 2923 | write 2924 | writing 2925 | written 2926 | wrong 2927 | wrote 2928 | wrung 2929 | yard 2930 | yarn 2931 | year 2932 | yell 2933 | yellow 2934 | yes 2935 | yesterday 2936 | yet 2937 | yolk 2938 | yonder 2939 | you 2940 | you'd 2941 | you'll 2942 | young 2943 | youngster 2944 | your 2945 | yours 2946 | you're 2947 | yourself 2948 | yourselves 2949 | youth 2950 | you've -------------------------------------------------------------------------------- /corpus/common.txt: -------------------------------------------------------------------------------- 1 | then 2 | up 3 | with 4 | many 5 | did 6 | everything 7 | each 8 | 9 | tell 10 | based 11 | if 12 | instantly 13 | probably 14 | the 15 | : 16 | while 17 | one 18 | to 19 | only 20 | going 21 | may 22 | looks 23 | our 24 | best 25 | had 26 | than 27 | we 28 | happens 29 | also 30 | by 31 | ve 32 | them 33 | it 34 | look 35 | ( 36 | ) 37 | ll 38 | which 39 
| about 40 | types 41 | could 42 | this 43 | make 44 | that 45 | just 46 | at 47 | their 48 | is 49 | leave 50 | time 51 | goes 52 | world 53 | got 54 | # 55 | into 56 | has 57 | definitive 58 | me 59 | most 60 | feel 61 | or 62 | should 63 | as 64 | signs 65 | having 66 | things 67 | go 68 | it 69 | his 70 | re 71 | here 72 | happen 73 | those 74 | must 75 | would 76 | relate 77 | now 78 | no 79 | after 80 | way 81 | doing 82 | she 83 | reasons 84 | you 85 | more 86 | when 87 | & 88 | say 89 | be 90 | . 91 | wants 92 | remember 93 | there 94 | ways 95 | did 96 | n't 97 | why 98 | for 99 | before 100 | but 101 | ! 102 | too 103 | become 104 | times 105 | ) 106 | how 107 | want 108 | know 109 | guess 110 | '' 111 | 112 | give 113 | and 114 | can 115 | this 116 | are 117 | us 118 | to 119 | will 120 | who 121 | have 122 | he 123 | ask 124 | on 125 | first 126 | you 127 | being 128 | like 129 | from 130 | was 131 | they 132 | what 133 | do 134 | your 135 | away 136 | of 137 | are 138 | these 139 | , 140 | says 141 | cast 142 | i 143 | how 144 | over 145 | where 146 | perfectly 147 | character 148 | well 149 | match 150 | ' 151 | age 152 | get 153 | the 154 | `` 155 | people 156 | might 157 | thing 158 | which 159 | vs. 160 | need 161 | not 162 | never 163 | time 164 | does 165 | a 166 | can 167 | an 168 | been 169 | my 170 | ? 171 | in 172 | see -------------------------------------------------------------------------------- /corpus/common_phrases.txt: -------------------------------------------------------------------------------- 1 | about the 2 | are you 3 | are you more 4 | are you more like 5 | a single 6 | at the 7 | based on your 8 | but what happened next 9 | can change your life 10 | cannot even handle 11 | can ' t even handle 12 | can ' t handle 13 | can we guess 14 | can we guess what 15 | can we guess your 16 | can you 17 | can you guess 18 | can you guess the 19 | can you match the 20 | cast of 21 | cast of looks 22 | definitive ranking of 23 | doesn ' t want you to see 24 | do you 25 | do you know 26 | do you know the 27 | do you remember 28 | do you remember the 29 | everything you need to 30 | everything you need to know 31 | for a 32 | for the 33 | for the first 34 | for the first time 35 | from 36 | from the 37 | go viral 38 | happens when you 39 | here ' s what 40 | how many of 41 | how many of these 42 | how well 43 | how well do 44 | how well do you 45 | in a 46 | in the 47 | is a 48 | is the 49 | is the best 50 | is the most 51 | is what 52 | is what happens 53 | is what happens when 54 | it ' s like to 55 | it ' s like to be 56 | know about 57 | know the lyrics to 58 | make you 59 | make you want to 60 | many of these 61 | mind blowing 62 | mind blown 63 | need to 64 | need to know 65 | need to know about 66 | need to visit before you die 67 | nothing could prepare me for 68 | of a 69 | of all time 70 | of all-time 71 | of 72 | of looks like now 73 | of the 74 | of the most 75 | one of the 76 | one weird trick 77 | on the 78 | people who 79 | people who are 80 | looks like now 81 | reasons why 82 | reasons why you should 83 | reasons you should 84 | remember the lyrics to 85 | right now 86 | scientific reasons 87 | signs that you are 88 | simple lessons 89 | stop what you ' re doing 90 | stop what you’re doing 91 | stop what youre doing 92 | tell us about 93 | that are 94 | that happen when 95 | that happen when you 96 | that perfectly sum up 97 | that will 98 | that will give you 99 | that will make 100 | that will make you 101 | that will make you rethink 102 | the 
best 103 | the cast of 104 | the cast of 105 | the 106 | the definitive ranking of 107 | the most 108 | the first time 109 | the first time. 110 | the lyrics to 111 | the lyrics to 112 | the most 113 | the world 114 | the world ' s best 115 | things only people who 116 | things that 117 | things that happen 118 | things that happen when 119 | things you 120 | things you need to 121 | this is 122 | this is how 123 | this is the 124 | this is what 125 | this is what happens 126 | this is what the 127 | to a 128 | to be 129 | to be a 130 | to get 131 | to know about 132 | to know about the 133 | to make 134 | totally blew my mind 135 | to the 136 | want to 137 | ways to 138 | we guess your 139 | we guess your age 140 | well do 141 | well do you 142 | well do you know 143 | well do you remember 144 | what happens when 145 | what happens when you 146 | what it ' s like 147 | what it ' s like to 148 | what the cast of 149 | when you 150 | which character 151 | which character are you? 152 | will blow your mind 153 | will change your life forever 154 | will give you 155 | will instantly relate to. 156 | will leave you 157 | will make 158 | will make you 159 | will make you feel 160 | will make you want 161 | with a 162 | won the internet 163 | you can 164 | you didn ' t know exist 165 | you didn ’ t know exist 166 | you didn ' t know existed 167 | you didn ’ t know existed 168 | you guess the 169 | you have 170 | you know 171 | you know the 172 | you know the lyrics 173 | you need 174 | you need to 175 | you need to know 176 | you probably didn ' t know 177 | you remember the 178 | you remember the lyrics 179 | you should 180 | you should never 181 | you want to 182 | you won ' t believe 183 | you won ’ t believe 184 | you wont believe 185 | -------------------------------------------------------------------------------- /corpus/contractions.txt: -------------------------------------------------------------------------------- 1 | aren't 2 | can't 3 | couldn't 4 | could've 5 | didn't 6 | doesn't 7 | don't 8 | hadn't 9 | hasn't 10 | haven't 11 | he'd 12 | he'll 13 | here's 14 | he's 15 | how'd 16 | how'll 17 | how's 18 | i'd 19 | i'll 20 | i'm 21 | isn't 22 | it'd 23 | it'll 24 | it's 25 | i've 26 | mightn't 27 | might've 28 | mustn't 29 | must've 30 | she'd 31 | she'll 32 | she's 33 | shouldn't 34 | should've 35 | that'd 36 | that'll 37 | that's 38 | there's 39 | they'd 40 | they'll 41 | they're 42 | they've 43 | 'tis 44 | wasn't 45 | we'd 46 | we'll 47 | we're 48 | weren't 49 | we've 50 | what'd\ 51 | what'll 52 | what're 53 | what's 54 | when'd 55 | when'll 56 | when's 57 | where'd 58 | where'll 59 | where's 60 | who'd 61 | who'll 62 | who's 63 | why'd 64 | why'll 65 | won't 66 | wouldn't 67 | would've 68 | you'd 69 | you'll 70 | you're 71 | you've 72 | -------------------------------------------------------------------------------- /corpus/hyperbolic.txt: -------------------------------------------------------------------------------- 1 | absolutely 2 | action-packed 3 | adorable 4 | adorably 5 | adventurous 6 | all-star 7 | amazing 8 | amazingly 9 | amusing 10 | artfully 11 | artsy 12 | astonishing 13 | astonishingly 14 | astounding 15 | attraction 16 | avid 17 | awe-inspiring 18 | awesome 19 | ballsy 20 | big-time 21 | bittersweet 22 | bouncy 23 | breathless 24 | breathtaking 25 | breathtakingly 26 | brilliant 27 | brilliantly 28 | captivating 29 | career-defining 30 | certainly 31 | charged 32 | charming 33 | cheeky 34 | cheery 35 | chilling 36 | classy 37 | clever 38 | cleverest 39 | 
cleverly 40 | comforting 41 | comically 42 | commendable 43 | compelling 44 | confident 45 | cool 46 | courageous 47 | crafty 48 | cranky 49 | creative 50 | cuddly 51 | cunning 52 | cute 53 | cuter 54 | dazzling 55 | dead-on 56 | definitive 57 | delectable 58 | deliciously 59 | delight 60 | delightful 61 | delightfully 62 | divine 63 | dominant 64 | down-to-earth 65 | dreamlike 66 | dreamy 67 | eerily 68 | effortless 69 | effortlessly 70 | electric 71 | elegant 72 | elegantly 73 | eloquent 74 | enchanting 75 | encourage 76 | endearing 77 | engaging 78 | enjoy 79 | enjoyable 80 | enormous 81 | entertained 82 | entertaining 83 | enthusiastic 84 | epic 85 | ethereal 86 | ethnic 87 | evangelical 88 | exceedingly 89 | excellent 90 | exceptionally 91 | exciting 92 | excruciatingly 93 | exhilarating 94 | explosive 95 | expressive 96 | exquisite 97 | exquisitely 98 | extraordinary 99 | extravagant 100 | extravagantly 101 | eye-catching 102 | famed 103 | fancy 104 | fantastic 105 | fantastically 106 | fascinating 107 | fast-paced 108 | feel-good 109 | fierce 110 | firmly 111 | first-class 112 | flashy 113 | fondly 114 | fresh 115 | fun 116 | fun-loving 117 | funnier 118 | funniest 119 | funny 120 | gasp-worthy 121 | genius 122 | genuine 123 | genuinely 124 | glamorous 125 | glorious 126 | gloriously 127 | gorgeous 128 | gorgeously 129 | gracefully 130 | grand 131 | great 132 | greatest 133 | gripping 134 | gritty 135 | gutsy 136 | gut-wrenching 137 | happiness 138 | happy 139 | hard-hitting 140 | heartening 141 | heartfelt 142 | heart-felt 143 | heartwarming 144 | heroic 145 | high-octane 146 | hilarious 147 | hilariously 148 | honest 149 | honestly 150 | hope 151 | hot 152 | humorous 153 | hunk 154 | imaginable 155 | imaginative 156 | immensely 157 | impeccable 158 | impossibly 159 | impressed 160 | impressive 161 | impressively 162 | incredible 163 | incredibly 164 | indulgent 165 | infectious 166 | informative 167 | infuriate 168 | ingenious 169 | ingeniously 170 | inspiration 171 | inspirational 172 | inspires 173 | intelligent 174 | interestingly 175 | irresistible 176 | irresistibly 177 | jaw-dropping 178 | jolting 179 | joy 180 | kid-friendly 181 | laid-back 182 | laughing 183 | laugh-out-loud 184 | laughter 185 | laziest 186 | legendary 187 | liberating 188 | life-changing 189 | literally 190 | literate 191 | lovable 192 | love 193 | loveable 194 | loved 195 | love-struck 196 | luckiest 197 | luscious 198 | lush 199 | magical 200 | magnificent 201 | marvelous 202 | masterful 203 | masterpiece 204 | meaty 205 | memorable 206 | mesmerizing 207 | mind-bending 208 | mind-blowing 209 | miraculous 210 | moving 211 | must-see 212 | mystical 213 | neat 214 | nicest 215 | nostalgic 216 | noteworthy 217 | omg 218 | one-of-a-kind 219 | otherworldly 220 | outstanding 221 | overwhelmingly 222 | passionately 223 | perfect 224 | perfection 225 | perfectly 226 | phenomenal 227 | playful 228 | pleasant 229 | pleasantly 230 | pleasing 231 | pleasure 232 | poetic 233 | poignant 234 | powerful 235 | prettiest 236 | pretty 237 | priceless 238 | profound 239 | profoundly 240 | proud 241 | prove 242 | psychological 243 | pure 244 | radiant 245 | rarest 246 | recommended 247 | refreshing 248 | refreshingly 249 | remarkable 250 | remarkably 251 | resourceful 252 | revolutionary 253 | rightly 254 | riveted 255 | riveting 256 | rousing 257 | sassy 258 | satisfied 259 | satisfying 260 | savvy 261 | scenic 262 | sensational 263 | sexy 264 | sharper 265 | shine 266 | shines 267 | shocked 268 | shocking 269 | sincerely 
270 | sleek 271 | slick 272 | smart 273 | smarter 274 | smartest 275 | smile 276 | snazzy 277 | soulful 278 | soul-stirring 279 | sparkles 280 | sparkling 281 | spectacular 282 | spectacularly 283 | spellbinding 284 | spicy 285 | splendid 286 | spontaneous 287 | spot-on 288 | steamy 289 | striking 290 | stronger 291 | stunning 292 | stylishly 293 | successful 294 | sumptuous 295 | super 296 | superb 297 | supremely 298 | surprisingly 299 | sweet 300 | tear-jerking 301 | terrific 302 | terrifically 303 | terrifying 304 | thoughtful 305 | thought-provoking 306 | thrilling 307 | thrills 308 | timeless 309 | tongue-in-cheek 310 | top-notch 311 | totally 312 | touching 313 | touchingly 314 | transcendent 315 | true-blue 316 | truly 317 | ultimate 318 | unbelievable 319 | uncanny 320 | unconditional 321 | unforgettable 322 | unimaginable 323 | unpretentious 324 | upbeat 325 | uplifting 326 | valiant 327 | virtuoso 328 | vivid 329 | warmth 330 | well-rounded 331 | what? 332 | whimsical 333 | whirlwind 334 | whoa 335 | whoah 336 | wicked 337 | winner 338 | witty 339 | wonderful 340 | wonderfully 341 | wondrous 342 | world-class 343 | worst 344 | worthwhile 345 | worthy 346 | wow 347 | -------------------------------------------------------------------------------- /corpus/terrier-stopword.txt: -------------------------------------------------------------------------------- 1 | x 2 | y 3 | your 4 | yours 5 | yourself 6 | yourselves 7 | you 8 | yond 9 | yonder 10 | yon 11 | ye 12 | yet 13 | z 14 | zillion 15 | j 16 | u 17 | umpteen 18 | usually 19 | us 20 | username 21 | uponed 22 | upons 23 | uponing 24 | upon 25 | ups 26 | upping 27 | upped 28 | up 29 | unto 30 | until 31 | unless 32 | unlike 33 | unliker 34 | unlikest 35 | under 36 | underneath 37 | use 38 | used 39 | usedest 40 | r 41 | rath 42 | rather 43 | rathest 44 | rathe 45 | re 46 | relate 47 | related 48 | relatively 49 | regarding 50 | really 51 | res 52 | respecting 53 | respectively 54 | q 55 | quite 56 | que 57 | qua 58 | n 59 | neither 60 | neaths 61 | neath 62 | nethe 63 | nethermost 64 | necessary 65 | necessariest 66 | necessarier 67 | never 68 | nevertheless 69 | nigh 70 | nighest 71 | nigher 72 | nine 73 | noone 74 | nobody 75 | nobodies 76 | nowhere 77 | nowheres 78 | no 79 | noes 80 | nor 81 | nos 82 | no-one 83 | none 84 | not 85 | notwithstanding 86 | nothings 87 | nothing 88 | nathless 89 | natheless 90 | t 91 | ten 92 | tills 93 | till 94 | tilled 95 | tilling 96 | to 97 | towards 98 | toward 99 | towardest 100 | towarder 101 | together 102 | too 103 | thy 104 | thyself 105 | thus 106 | than 107 | that 108 | those 109 | thou 110 | though 111 | thous 112 | thouses 113 | thoroughest 114 | thorougher 115 | thorough 116 | thoroughly 117 | thru 118 | thruer 119 | thruest 120 | thro 121 | through 122 | throughout 123 | throughest 124 | througher 125 | thine 126 | this 127 | thises 128 | they 129 | thee 130 | the 131 | then 132 | thence 133 | thenest 134 | thener 135 | them 136 | themselves 137 | these 138 | therer 139 | there 140 | thereby 141 | therest 142 | thereafter 143 | therein 144 | thereupon 145 | therefore 146 | their 147 | theirs 148 | thing 149 | things 150 | three 151 | two 152 | o 153 | oh 154 | owt 155 | owning 156 | owned 157 | own 158 | owns 159 | others 160 | other 161 | otherwise 162 | otherwisest 163 | otherwiser 164 | of 165 | often 166 | oftener 167 | oftenest 168 | off 169 | offs 170 | offest 171 | one 172 | ought 173 | oughts 174 | our 175 | ours 176 | ourselves 177 | ourself 178 | out 179 | outest 180 | 
outed 181 | outwith 182 | outs 183 | outside 184 | over 185 | overallest 186 | overaller 187 | overalls 188 | overall 189 | overs 190 | or 191 | orer 192 | orest 193 | on 194 | oneself 195 | onest 196 | ons 197 | onto 198 | a 199 | atween 200 | at 201 | athwart 202 | atop 203 | afore 204 | afterward 205 | afterwards 206 | after 207 | afterest 208 | afterer 209 | ain 210 | an 211 | any 212 | anything 213 | anybody 214 | anyone 215 | anyhow 216 | anywhere 217 | anent 218 | anear 219 | and 220 | andor 221 | another 222 | around 223 | ares 224 | are 225 | aest 226 | aer 227 | against 228 | again 229 | accordingly 230 | abaft 231 | abafter 232 | abaftest 233 | abovest 234 | above 235 | abover 236 | abouter 237 | aboutest 238 | about 239 | aid 240 | amidst 241 | amid 242 | among 243 | amongst 244 | apartest 245 | aparter 246 | apart 247 | appeared 248 | appears 249 | appear 250 | appearing 251 | appropriating 252 | appropriate 253 | appropriatest 254 | appropriates 255 | appropriater 256 | appropriated 257 | already 258 | always 259 | also 260 | along 261 | alongside 262 | although 263 | almost 264 | all 265 | allest 266 | aller 267 | allyou 268 | alls 269 | albeit 270 | awfully 271 | as 272 | aside 273 | asides 274 | aslant 275 | ases 276 | astrider 277 | astride 278 | astridest 279 | astraddlest 280 | astraddler 281 | astraddle 282 | availablest 283 | availabler 284 | available 285 | aughts 286 | aught 287 | vs 288 | v 289 | variousest 290 | variouser 291 | various 292 | via 293 | vis-a-vis 294 | vis-a-viser 295 | vis-a-visest 296 | viz 297 | very 298 | veriest 299 | verier 300 | versus 301 | k 302 | g 303 | go 304 | gone 305 | good 306 | got 307 | gotta 308 | gotten 309 | get 310 | gets 311 | getting 312 | b 313 | by 314 | byandby 315 | by-and-by 316 | bist 317 | both 318 | but 319 | buts 320 | be 321 | beyond 322 | because 323 | became 324 | becomes 325 | become 326 | becoming 327 | becomings 328 | becominger 329 | becomingest 330 | behind 331 | behinds 332 | before 333 | beforehand 334 | beforehandest 335 | beforehander 336 | bettered 337 | betters 338 | better 339 | bettering 340 | betwixt 341 | between 342 | beneath 343 | been 344 | below 345 | besides 346 | beside 347 | m 348 | my 349 | myself 350 | mucher 351 | muchest 352 | much 353 | must 354 | musts 355 | musths 356 | musth 357 | main 358 | make 359 | mayest 360 | many 361 | mauger 362 | maugre 363 | me 364 | meanwhiles 365 | meanwhile 366 | mostly 367 | most 368 | moreover 369 | more 370 | might 371 | mights 372 | midst 373 | midsts 374 | h 375 | huh 376 | humph 377 | he 378 | hers 379 | herself 380 | her 381 | hereby 382 | herein 383 | hereafters 384 | hereafter 385 | hereupon 386 | hence 387 | hadst 388 | had 389 | having 390 | haves 391 | have 392 | has 393 | hast 394 | hardly 395 | hae 396 | hath 397 | him 398 | himself 399 | hither 400 | hitherest 401 | hitherer 402 | his 403 | how-do-you-do 404 | however 405 | how 406 | howbeit 407 | howdoyoudo 408 | hoos 409 | hoo 410 | w 411 | woulded 412 | woulding 413 | would 414 | woulds 415 | was 416 | wast 417 | we 418 | wert 419 | were 420 | with 421 | withal 422 | without 423 | within 424 | why 425 | what 426 | whatever 427 | whateverer 428 | whateverest 429 | whatsoeverer 430 | whatsoeverest 431 | whatsoever 432 | whence 433 | whencesoever 434 | whenever 435 | whensoever 436 | when 437 | whenas 438 | whether 439 | wheen 440 | whereto 441 | whereupon 442 | wherever 443 | whereon 444 | whereof 445 | where 446 | whereby 447 | wherewithal 448 | wherewith 449 | whereinto 450 | wherein 451 
| whereafter 452 | whereas 453 | wheresoever 454 | wherefrom 455 | which 456 | whichever 457 | whichsoever 458 | whilst 459 | while 460 | whiles 461 | whithersoever 462 | whither 463 | whoever 464 | whosoever 465 | whoso 466 | whose 467 | whomever 468 | s 469 | syne 470 | syn 471 | shalling 472 | shall 473 | shalled 474 | shalls 475 | shoulding 476 | should 477 | shoulded 478 | shoulds 479 | she 480 | sayyid 481 | sayid 482 | said 483 | saider 484 | saidest 485 | same 486 | samest 487 | sames 488 | samer 489 | saved 490 | sans 491 | sanses 492 | sanserifs 493 | sanserif 494 | so 495 | soer 496 | soest 497 | sobeit 498 | someone 499 | somebody 500 | somehow 501 | some 502 | somewhere 503 | somewhat 504 | something 505 | sometimest 506 | sometimes 507 | sometimer 508 | sometime 509 | several 510 | severaler 511 | severalest 512 | serious 513 | seriousest 514 | seriouser 515 | senza 516 | send 517 | sent 518 | seem 519 | seems 520 | seemed 521 | seemingest 522 | seeminger 523 | seemings 524 | seven 525 | summat 526 | sups 527 | sup 528 | supping 529 | supped 530 | such 531 | since 532 | sine 533 | sines 534 | sith 535 | six 536 | stop 537 | stopped 538 | p 539 | plaintiff 540 | plenty 541 | plenties 542 | please 543 | pleased 544 | pleases 545 | per 546 | perhaps 547 | particulars 548 | particularly 549 | particular 550 | particularest 551 | particularer 552 | pro 553 | providing 554 | provides 555 | provided 556 | provide 557 | probably 558 | l 559 | layabout 560 | layabouts 561 | latter 562 | latterest 563 | latterer 564 | latterly 565 | latters 566 | lots 567 | lotting 568 | lotted 569 | lot 570 | lest 571 | less 572 | ie 573 | ifs 574 | if 575 | i 576 | info 577 | information 578 | itself 579 | its 580 | it 581 | is 582 | idem 583 | idemer 584 | idemest 585 | immediate 586 | immediately 587 | immediatest 588 | immediater 589 | in 590 | inwards 591 | inwardest 592 | inwarder 593 | inward 594 | inasmuch 595 | into 596 | instead 597 | insofar 598 | indicates 599 | indicated 600 | indicate 601 | indicating 602 | indeed 603 | inc 604 | f 605 | fact 606 | facts 607 | fs 608 | figupon 609 | figupons 610 | figuponing 611 | figuponed 612 | few 613 | fewer 614 | fewest 615 | frae 616 | from 617 | failing 618 | failings 619 | five 620 | furthers 621 | furtherer 622 | furthered 623 | furtherest 624 | further 625 | furthering 626 | furthermore 627 | fourscore 628 | followthrough 629 | for 630 | forwhy 631 | fornenst 632 | formerly 633 | former 634 | formerer 635 | formerest 636 | formers 637 | forbye 638 | forby 639 | fore 640 | forever 641 | forer 642 | fores 643 | four 644 | d 645 | ddays 646 | dday 647 | do 648 | doing 649 | doings 650 | doe 651 | does 652 | doth 653 | downwarder 654 | downwardest 655 | downward 656 | downwards 657 | downs 658 | done 659 | doner 660 | dones 661 | donest 662 | dos 663 | dost 664 | did 665 | differentest 666 | differenter 667 | different 668 | describing 669 | describe 670 | describes 671 | described 672 | despiting 673 | despites 674 | despited 675 | despite 676 | during 677 | c 678 | cum 679 | circa 680 | chez 681 | cer 682 | certain 683 | certainest 684 | certainer 685 | cest 686 | canst 687 | cannot 688 | cant 689 | cants 690 | canting 691 | cantest 692 | canted 693 | co 694 | could 695 | couldst 696 | comeon 697 | comeons 698 | come-ons 699 | come-on 700 | concerning 701 | concerninger 702 | concerningest 703 | consequently 704 | considering 705 | e 706 | eg 707 | eight 708 | either 709 | even 710 | evens 711 | evenser 712 | evensest 713 | evened 714 | evenest 
715 | ever 716 | everyone 717 | everything 718 | everybody 719 | everywhere 720 | every 721 | ere 722 | each 723 | et 724 | etc 725 | elsewhere 726 | else 727 | ex 728 | excepted 729 | excepts 730 | except 731 | excepting 732 | exes 733 | enough 734 | -------------------------------------------------------------------------------- /datasets/train.csv: -------------------------------------------------------------------------------- 1 | title,label 2 | 20 BuzzFeed Articles It's Probably OK That I Never Finished,clickbait 3 | Pigskin A Blanket: NFL Conference Championship Picks,clickbait 4 | When Relationship Introductions Get Awkward,clickbait 5 | You Might Be Food Shopping Wrong,clickbait 6 | 27 Underrated Makeup Brands Everyone Should Know About,clickbait 7 | We Know Your Favorite Artist Based On Your Zodiac Sign,clickbait 8 | 17 Things Everyone With Strong Eyebrows Knows To Be True,clickbait 9 | 17 Creepy Videos That Will Send A Chill Down Your Spine,clickbait 10 | "Hunted by Puma, It Flees, but Against Fox, It Fights",not-clickbait 11 | Riots in Nigeria kill nearly 400,not-clickbait 12 | Here Are All The Parts Of A Cat You Need To Know,clickbait 13 | "The One Mind-Blowing Detail You Missed In ""The Little Mermaid""",clickbait 14 | 6 Struggles Only Atheists Understand,clickbait 15 | Decline and Fall of the U.S.: A View From 2089,not-clickbait 16 | Guidant announces more defibrillator problems,not-clickbait 17 | Woman Files Complaint Against Bernie Williams,not-clickbait 18 | 'Very serious': Chinese government releases corruption report,not-clickbait 19 | Head of Joint Chiefs Praises Pakistani Operation,not-clickbait 20 | Many nations offer material aid to hurricane victims; Bush refuses to accept,not-clickbait 21 | Chinese City Bolsters Scant Consumer Spending With Free Vouchers,not-clickbait 22 | A Photo Of Demi Lovato Has Now Become A Huge Meme Called Poot,clickbait 23 | Study raises health concerns about shower curtains,not-clickbait 24 | Watch This Boyfriend Blindly Guess Which Hand Belongs To His Boyfriend,clickbait 25 | McEnroe Wants Academy to Revive Tennis in New York,not-clickbait 26 | French tourists killed in California bus rollover accident,not-clickbait 27 | Final draw sets groups for FIFA World Cup 2010,not-clickbait 28 | Washington Train Crash Prompts Safety Warning,not-clickbait 29 | Kristen Bell And Dax Shepard Dancing Through Africa Will Restore Your Faith In Love,clickbait 30 | 11 Pieces Of Wisdom To Read Whenever You're Feeling Down,clickbait 31 | Oldest living European person reaches age 114,not-clickbait 32 | US retailer Circuit City files for bankruptcy protection,not-clickbait 33 | Lil Mama Has Managed To Become A Hilarious Meme,clickbait 34 | 17 Things You Only See At A Historically Black College Homecoming,clickbait 35 | Which One Direction Music Video Is Your Favorite,clickbait 36 | "What's The Best ""Yahoo Answers"" Question You've Ever Seen",clickbait 37 | 21 Reasons Why No One Should Have New York Values,clickbait 38 | "Tens of thousands of workers demonstrate in Ljubljana, Slovenia",not-clickbait 39 | "Avenged Sevenfold drummer James ""The Rev"" Sullivan found dead at age 28",not-clickbait 40 | Man dies in Serbian enclave; could not call ambulance,not-clickbait 41 | "Speeding ticket paid with 12,000 pennies",not-clickbait 42 | Xbox 360 shortages expected on debut day in Europe,not-clickbait 43 | "In April, Fed Weighed Purchase of More Debt",not-clickbait 44 | St Paul's cathedral to shut down following 'Occupy' protest,not-clickbait 45 | What's The Worst 
Thing You've Ever Smelled,clickbait 46 | "Which Zodiac Should You Date Based On Your Favorite ""Game Of Thrones"" Character",clickbait 47 | Tarja Turunen to perform at Doro Pesch's 25th anniversary concert and record duets with her,not-clickbait 48 | United States Senator Ted Kennedy rushed to the hospital,not-clickbait 49 | 24 Tweets That Will Make Every Nurse Laugh Out Loud,clickbait 50 | 29 Pictures That Will Give You Severe Flashbacks To The Late '90s And Early '00s,clickbait 51 | 16 Black Friday Horror Stories That Prove Retail Employees Are Saints,clickbait 52 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/Embeddings-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "Collapsed": "false" 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd \n", 12 | "import numpy as np\n", 13 | "\n", 14 | "train = pd.read_csv('../datasets/train.csv')\n", 15 | "test = pd.read_csv('../datasets/test.csv')" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "Collapsed": "false" 22 | }, 23 | "source": [ 24 | "## Utility Functions" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "Collapsed": "false" 31 | }, 32 | "source": [ 33 | "Before we start exploring embeddings lets write a couple of helper functions to run Logistic Regression and calculate evaluation metrics\n", 34 | "\n", 35 | "Since we want to optimize our model for F1-Scores, for all models we'll first predict the probability of the positive class. We'll then use these probabilities to get the Precision-Recall curve and from here we can select a threshold value that has the highest F1-score. To predict the labels we can simply use this threshold value." 
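In isolation, that threshold search is only a few lines. The sketch below is a minimal, hedged illustration of the same idea, assuming nothing beyond numpy and scikit-learn and using placeholder names (y_true, y_prob) rather than this notebook's variables; the full helper with plotting and the extra metrics follows in the next cell.

import numpy as np
from sklearn.metrics import precision_recall_curve

def best_f1_threshold(y_true, y_prob):
    # precision and recall have one more entry than thresholds, so drop the last pair
    precision, recall, thresholds = precision_recall_curve(y_true, y_prob, pos_label=1)
    f1 = 2 * precision * recall / (precision + recall + 1e-12)  # small epsilon avoids 0/0
    best = f1[:-1].argmax()
    return thresholds[best], f1[best]

# usage sketch: thr, _ = best_f1_threshold(y_test, y_test_prob)
#               y_pred = (y_test_prob > thr).astype(int)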
36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 8, 41 | "metadata": { 42 | "Collapsed": "false" 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "%matplotlib inline\n", 47 | "import matplotlib.pyplot as plt\n", 48 | "from sklearn.metrics import precision_recall_curve, f1_score, accuracy_score, roc_auc_score, confusion_matrix\n", 49 | "import seaborn as sns\n", 50 | "sns.set_palette(\"muted\")\n", 51 | " \n", 52 | "\n", 53 | "def calc_f1(p_and_r):\n", 54 | " p, r = p_and_r\n", 55 | " return (2*p*r)/(p+r)\n", 56 | "\n", 57 | "\n", 58 | "# Print the F1, Precision, Recall, ROC-AUC, and Accuracy Metrics \n", 59 | "# Since we are optimizing for F1 score - we will first calculate precision and recall and \n", 60 | "# then find the probability threshold value that gives us the best F1 score\n", 61 | "\n", 62 | "def print_model_metrics(y_test, y_test_prob, confusion = False, verbose = True, return_metrics = False):\n", 63 | "\n", 64 | " precision, recall, threshold = precision_recall_curve(y_test, y_test_prob, pos_label = 1)\n", 65 | " \n", 66 | " #Find the threshold value that gives the best F1 Score\n", 67 | " best_f1_index =np.argmax([calc_f1(p_r) for p_r in zip(precision, recall)])\n", 68 | " best_threshold, best_precision, best_recall = threshold[best_f1_index], precision[best_f1_index], recall[best_f1_index]\n", 69 | " \n", 70 | " # Calulcate predictions based on the threshold value\n", 71 | " y_test_pred = np.where(y_test_prob > best_threshold, 1, 0)\n", 72 | " \n", 73 | " # Calculate all metrics\n", 74 | " f1 = f1_score(y_test, y_test_pred, pos_label = 1, average = 'binary')\n", 75 | " roc_auc = roc_auc_score(y_test, y_test_prob)\n", 76 | " acc = accuracy_score(y_test, y_test_pred)\n", 77 | " \n", 78 | " \n", 79 | " if confusion:\n", 80 | " # Calculate and Display the confusion Matrix\n", 81 | " cm = confusion_matrix(y_test, y_test_pred)\n", 82 | "\n", 83 | " plt.title('Confusion Matrix')\n", 84 | " sns.set(font_scale=1.0) #for label size\n", 85 | " sns.heatmap(cm, annot = True, fmt = 'd', xticklabels = ['No Clickbait', 'Clickbait'], yticklabels = ['No Clickbait', 'Clickbait'], annot_kws={\"size\": 14}, cmap = 'Blues')# font size\n", 86 | "\n", 87 | " plt.xlabel('Truth')\n", 88 | " plt.ylabel('Prediction')\n", 89 | " \n", 90 | " if verbose:\n", 91 | " print('F1: {:.3f} | Pr: {:.3f} | Re: {:.3f} | AUC: {:.3f} | Accuracy: {:.3f} \\n'.format(f1, best_precision, best_recall, roc_auc, acc))\n", 92 | " \n", 93 | " if return_metrics:\n", 94 | " return np.array([f1, best_precision, best_recall, roc_auc, acc])" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 9, 100 | "metadata": { 101 | "Collapsed": "false" 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "# Run Simple Log Reg Model and Print metrics\n", 106 | "from sklearn.linear_model import SGDClassifier\n", 107 | "\n", 108 | "# Run log reg 10 times and average the result to reduce predction variance\n", 109 | "def run_log_reg(train_features, test_features, y_train, y_test, alpha = 1e-4, confusion = False, return_f1 = False, verbose = True):\n", 110 | " metrics = np.zeros(5)\n", 111 | " for _ in range(10):\n", 112 | " log_reg = SGDClassifier(loss = 'log', alpha = alpha, n_jobs = -1, penalty = 'l2')\n", 113 | " log_reg.fit(train_features, y_train)\n", 114 | " y_test_prob = log_reg.predict_proba(test_features)[:,1]\n", 115 | " metrics += print_model_metrics(y_test, y_test_prob, confusion = confusion, verbose = False, return_metrics = True)\n", 116 | " metrics /=10\n", 117 | " if verbose:\n", 118 
| " print('F1: {:.3f} | Pr: {:.3f} | Re: {:.3f} | AUC: {:.3f} | Accuracy: {:.3f} \\n'.format(*metrics))\n", 119 | " if return_f1:\n", 120 | " return f1\n", 121 | " return log_reg" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": { 127 | "Collapsed": "false" 128 | }, 129 | "source": [ 130 | "# Bag-of-Words, TF-IDF and Word Embeddings" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 10, 136 | "metadata": { 137 | "Collapsed": "false" 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "y_train = np.where(train.label.values == 'clickbait', 1, 0)\n", 142 | "y_test = np.where(test.label.values == 'clickbait', 1, 0)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": { 148 | "Collapsed": "false" 149 | }, 150 | "source": [ 151 | "## Bag of Words\n", 152 | "Let's start with simple Bag-Of-Words" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 16, 158 | "metadata": { 159 | "Collapsed": "false" 160 | }, 161 | "outputs": [ 162 | { 163 | "name": "stdout", 164 | "output_type": "stream", 165 | "text": [ 166 | "F1: 0.782 | Pr: 0.867 | Re: 0.714 | AUC: 0.837 | Accuracy: 0.801 \n", 167 | "\n" 168 | ] 169 | }, 170 | { 171 | "data": { 172 | "text/plain": [ 173 | "SGDClassifier(alpha=0.0001, average=False, class_weight=None,\n", 174 | " early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,\n", 175 | " l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,\n", 176 | " n_iter_no_change=5, n_jobs=-1, penalty='l2', power_t=0.5,\n", 177 | " random_state=None, shuffle=True, tol=0.001,\n", 178 | " validation_fraction=0.1, verbose=0, warm_start=False)" 179 | ] 180 | }, 181 | "execution_count": 16, 182 | "metadata": {}, 183 | "output_type": "execute_result" 184 | } 185 | ], 186 | "source": [ 187 | "from sklearn.feature_extraction.text import CountVectorizer\n", 188 | "\n", 189 | "bow = CountVectorizer()\n", 190 | "x_train = bow.fit_transform(train.title.values)\n", 191 | "x_test = bow.transform(test.title.values)\n", 192 | "\n", 193 | "run_log_reg(x_train, x_test, y_train, y_test)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": { 199 | "Collapsed": "false" 200 | }, 201 | "source": [ 202 | "## TF-IDF\n", 203 | "\n", 204 | "TFIDF should perform better than BoW since it uses document frequencies to normalize" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 17, 210 | "metadata": { 211 | "Collapsed": "false" 212 | }, 213 | "outputs": [ 214 | { 215 | "name": "stdout", 216 | "output_type": "stream", 217 | "text": [ 218 | "F1: 0.829 | Pr: 0.872 | Re: 0.790 | AUC: 0.896 | Accuracy: 0.837 \n", 219 | "\n" 220 | ] 221 | }, 222 | { 223 | "data": { 224 | "text/plain": [ 225 | "SGDClassifier(alpha=0.0001, average=False, class_weight=None,\n", 226 | " early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,\n", 227 | " l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,\n", 228 | " n_iter_no_change=5, n_jobs=-1, penalty='l2', power_t=0.5,\n", 229 | " random_state=None, shuffle=True, tol=0.001,\n", 230 | " validation_fraction=0.1, verbose=0, warm_start=False)" 231 | ] 232 | }, 233 | "execution_count": 17, 234 | "metadata": {}, 235 | "output_type": "execute_result" 236 | } 237 | ], 238 | "source": [ 239 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 240 | "\n", 241 | "tfidf = TfidfVectorizer()\n", 242 | "x_train = tfidf.fit_transform(train.title.values)\n", 243 | "x_test = tfidf.transform(test.title.values)\n", 
244 | "\n", 245 | "run_log_reg(x_train, x_test, y_train, y_test)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": { 251 | "Collapsed": "false" 252 | }, 253 | "source": [ 254 | "TFIDF performs marginally better than BoW. Although whats impressive here is the fact that we're getting an F1 score of 0.826 with just 50 datapoints. This is why Log Reg + TFIDF is a great baseline for NLP classification tasks.\n", 255 | "\n", 256 | "Next we'll try 100D glove vectors. " 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": { 262 | "Collapsed": "false" 263 | }, 264 | "source": [ 265 | "## GloVe" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 3, 271 | "metadata": { 272 | "Collapsed": "false" 273 | }, 274 | "outputs": [], 275 | "source": [ 276 | "# Load the glove vectors with PyMagnitude\n", 277 | "# PyMagnitude is a fantastic library that handles a lot of word vectorization tasks. \n", 278 | "\n", 279 | "from pymagnitude import *\n", 280 | "glove = Magnitude(\"../vectors/glove.6B.100d.magnitude\")" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 19, 286 | "metadata": { 287 | "Collapsed": "false" 288 | }, 289 | "outputs": [ 290 | { 291 | "data": { 292 | "application/vnd.jupyter.widget-view+json": { 293 | "model_id": "d3a232cdd47d43e98c6ba8119c46f337", 294 | "version_major": 2, 295 | "version_minor": 0 296 | }, 297 | "text/plain": [ 298 | "HBox(children=(IntProgress(value=0, max=50), HTML(value='')))" 299 | ] 300 | }, 301 | "metadata": {}, 302 | "output_type": "display_data" 303 | }, 304 | { 305 | "name": "stdout", 306 | "output_type": "stream", 307 | "text": [ 308 | "\n" 309 | ] 310 | }, 311 | { 312 | "data": { 313 | "application/vnd.jupyter.widget-view+json": { 314 | "model_id": "2dd36b0a7cbf4620863eb049b9c4bcf3", 315 | "version_major": 2, 316 | "version_minor": 0 317 | }, 318 | "text/plain": [ 319 | "HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))" 320 | ] 321 | }, 322 | "metadata": {}, 323 | "output_type": "display_data" 324 | }, 325 | { 326 | "name": "stdout", 327 | "output_type": "stream", 328 | "text": [ 329 | "\n" 330 | ] 331 | } 332 | ], 333 | "source": [ 334 | "# We'll use Average Glove here \n", 335 | "from tqdm import tqdm_notebook\n", 336 | "from nltk import word_tokenize\n", 337 | "\n", 338 | "\n", 339 | "def avg_glove(df):\n", 340 | " vectors = []\n", 341 | " for title in tqdm_notebook(df.title.values):\n", 342 | " vectors.append(np.average(glove.query(word_tokenize(title)), axis = 0))\n", 343 | " return np.array(vectors)\n", 344 | "\n", 345 | "x_train = avg_glove(train)\n", 346 | "x_test = avg_glove(test)" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 22, 352 | "metadata": { 353 | "Collapsed": "false" 354 | }, 355 | "outputs": [ 356 | { 357 | "name": "stdout", 358 | "output_type": "stream", 359 | "text": [ 360 | "F1: 0.929 | Pr: 0.909 | Re: 0.950 | AUC: 0.979 | Accuracy: 0.928 \n", 361 | "\n" 362 | ] 363 | }, 364 | { 365 | "data": { 366 | "text/plain": [ 367 | "SGDClassifier(alpha=0.0001, average=False, class_weight=None,\n", 368 | " early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,\n", 369 | " l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,\n", 370 | " n_iter_no_change=5, n_jobs=-1, penalty='l2', power_t=0.5,\n", 371 | " random_state=None, shuffle=True, tol=0.001,\n", 372 | " validation_fraction=0.1, verbose=0, warm_start=False)" 373 | ] 374 | }, 375 | "execution_count": 22, 376 | "metadata": {}, 377 | 
"output_type": "execute_result" 378 | } 379 | ], 380 | "source": [ 381 | "run_log_reg(x_train, x_test, y_train, y_test)" 382 | ] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "metadata": { 387 | "Collapsed": "false" 388 | }, 389 | "source": [ 390 | "Woah! That's a huge increase in F1 score with just a small change in embedding. The improved performance is justified since W2V are pretrained embeddings that contain a lot of contextual information. This would obviously contribute to the classifiers performance, especially when we have a very limited dataset. " 391 | ] 392 | }, 393 | { 394 | "cell_type": "markdown", 395 | "metadata": { 396 | "Collapsed": "false" 397 | }, 398 | "source": [ 399 | "### IDF-Weighted Glove" 400 | ] 401 | }, 402 | { 403 | "cell_type": "markdown", 404 | "metadata": { 405 | "Collapsed": "false" 406 | }, 407 | "source": [ 408 | "Instead of just taking the average of each word, what if we did weighted average - in particular IDF-Weighted average?" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 8, 414 | "metadata": { 415 | "Collapsed": "false" 416 | }, 417 | "outputs": [ 418 | { 419 | "data": { 420 | "text/plain": [ 421 | "TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n", 422 | " dtype=, encoding='utf-8',\n", 423 | " input='content', lowercase=True, max_df=1.0, max_features=None,\n", 424 | " min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,\n", 425 | " smooth_idf=True, stop_words=None, strip_accents=None,\n", 426 | " sublinear_tf=False, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n", 427 | " tokenizer=None, use_idf=True, vocabulary=None)" 428 | ] 429 | }, 430 | "execution_count": 8, 431 | "metadata": {}, 432 | "output_type": "execute_result" 433 | } 434 | ], 435 | "source": [ 436 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 437 | "\n", 438 | "tfidf = TfidfVectorizer()\n", 439 | "tfidf.fit(train.title.values)" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 9, 445 | "metadata": { 446 | "Collapsed": "false" 447 | }, 448 | "outputs": [], 449 | "source": [ 450 | "# Now lets create a dict so that for every word in our corpus we have a corresponding IDF value\n", 451 | "idf_dict = dict(zip(tfidf.get_feature_names(), tfidf.idf_))" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": 10, 457 | "metadata": { 458 | "Collapsed": "false" 459 | }, 460 | "outputs": [ 461 | { 462 | "data": { 463 | "application/vnd.jupyter.widget-view+json": { 464 | "model_id": "78d7c2c19c284f4ebfe4c84937ac90c8", 465 | "version_major": 2, 466 | "version_minor": 0 467 | }, 468 | "text/plain": [ 469 | "HBox(children=(IntProgress(value=0, max=50), HTML(value='')))" 470 | ] 471 | }, 472 | "metadata": {}, 473 | "output_type": "display_data" 474 | }, 475 | { 476 | "name": "stdout", 477 | "output_type": "stream", 478 | "text": [ 479 | "\n" 480 | ] 481 | }, 482 | { 483 | "data": { 484 | "application/vnd.jupyter.widget-view+json": { 485 | "model_id": "214e6af41d3240578c528df4dae08a08", 486 | "version_major": 2, 487 | "version_minor": 0 488 | }, 489 | "text/plain": [ 490 | "HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))" 491 | ] 492 | }, 493 | "metadata": {}, 494 | "output_type": "display_data" 495 | }, 496 | { 497 | "name": "stdout", 498 | "output_type": "stream", 499 | "text": [ 500 | "\n" 501 | ] 502 | } 503 | ], 504 | "source": [ 505 | "# Same as Avg Glove except instead of doing a regular average, we'll use the IDF values as weights. 
\n", 506 | "\n", 507 | "def tfidf_glove(df):\n", 508 | " vectors = []\n", 509 | " for title in tqdm_notebook(df.title.values):\n", 510 | " glove_vectors = glove.query(word_tokenize(title))\n", 511 | " weights = [idf_dict.get(word, 1) for word in word_tokenize(title)]\n", 512 | " vectors.append(np.average(glove_vectors, axis = 0, weights = weights))\n", 513 | " return np.array(vectors)\n", 514 | "\n", 515 | "x_train = tfidf_glove(train)\n", 516 | "x_test = tfidf_glove(test)" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": 26, 522 | "metadata": { 523 | "Collapsed": "false" 524 | }, 525 | "outputs": [ 526 | { 527 | "name": "stdout", 528 | "output_type": "stream", 529 | "text": [ 530 | "F1: 0.957 | Pr: 0.943 | Re: 0.971 | AUC: 0.989 | Accuracy: 0.956 \n", 531 | "\n" 532 | ] 533 | }, 534 | { 535 | "data": { 536 | "text/plain": [ 537 | "SGDClassifier(alpha=0.0001, average=False, class_weight=None,\n", 538 | " early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,\n", 539 | " l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,\n", 540 | " n_iter_no_change=5, n_jobs=-1, penalty='l2', power_t=0.5,\n", 541 | " random_state=None, shuffle=True, tol=0.001,\n", 542 | " validation_fraction=0.1, verbose=0, warm_start=False)" 543 | ] 544 | }, 545 | "execution_count": 26, 546 | "metadata": {}, 547 | "output_type": "execute_result" 548 | } 549 | ], 550 | "source": [ 551 | "run_log_reg(x_train, x_test, y_train, y_test)" 552 | ] 553 | }, 554 | { 555 | "cell_type": "markdown", 556 | "metadata": { 557 | "Collapsed": "false" 558 | }, 559 | "source": [ 560 | "Our F1 increased by 0.02 points. The increased performance makes sense - commonly occurring words get less weightage while less frequent (and perhaps more important) words have more say in the vector representation for the titles. \n", 561 | "\n", 562 | "Since GloVe worked so well, let's try one last embedding technique - Facebook's InferSent model. This model converts the entire sentence into a vector representation. However, a potential problem here is that the vector representations are 4096 dimensional which might cause our model to overfit easily. 
Let's give it a shot anyway" 563 | ] 564 | }, 565 | { 566 | "cell_type": "markdown", 567 | "metadata": { 568 | "Collapsed": "false" 569 | }, 570 | "source": [ 571 | "## InferSent" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": 27, 577 | "metadata": { 578 | "Collapsed": "false" 579 | }, 580 | "outputs": [ 581 | { 582 | "data": { 583 | "text/plain": [ 584 | "" 585 | ] 586 | }, 587 | "execution_count": 27, 588 | "metadata": {}, 589 | "output_type": "execute_result" 590 | } 591 | ], 592 | "source": [ 593 | "from InferSent.models import InferSent\n", 594 | "import torch\n", 595 | "\n", 596 | "# Uncomment the lines below with the paths to infersent pkl file and glove file\n", 597 | "\n", 598 | "V = 1\n", 599 | "#MODEL_PATH = 'path to infersent pkl file'\n", 600 | "params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,\n", 601 | " 'pool_type': 'max', 'dpout_model': 0.0, 'version': V}\n", 602 | "infersent = InferSent(params_model)\n", 603 | "infersent.load_state_dict(torch.load(MODEL_PATH))\n" 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": 28, 609 | "metadata": { 610 | "Collapsed": "false" 611 | }, 612 | "outputs": [], 613 | "source": [ 614 | "#W2V_PATH = 'path to glove file'\n", 615 | "infersent.set_w2v_path(W2V_PATH)" 616 | ] 617 | }, 618 | { 619 | "cell_type": "code", 620 | "execution_count": 29, 621 | "metadata": { 622 | "Collapsed": "false" 623 | }, 624 | "outputs": [ 625 | { 626 | "name": "stdout", 627 | "output_type": "stream", 628 | "text": [ 629 | "Found 338(/363) words with w2v vectors\n", 630 | "Vocab size : 338\n" 631 | ] 632 | } 633 | ], 634 | "source": [ 635 | "infersent.build_vocab(train.title.values, tokenize= False)" 636 | ] 637 | }, 638 | { 639 | "cell_type": "code", 640 | "execution_count": 30, 641 | "metadata": { 642 | "Collapsed": "false" 643 | }, 644 | "outputs": [], 645 | "source": [ 646 | "x_train = infersent.encode(train.title.values, tokenize= False)\n", 647 | "x_test = infersent.encode(test.title.values, tokenize= False)" 648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": 36, 653 | "metadata": { 654 | "Collapsed": "false" 655 | }, 656 | "outputs": [ 657 | { 658 | "name": "stdout", 659 | "output_type": "stream", 660 | "text": [ 661 | "F1: 0.927 | Pr: 0.912 | Re: 0.946 | AUC: 0.966 | Accuracy: 0.926 \n", 662 | "\n" 663 | ] 664 | }, 665 | { 666 | "data": { 667 | "text/plain": [ 668 | "SGDClassifier(alpha=0.0001, average=False, class_weight=None,\n", 669 | " early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,\n", 670 | " l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,\n", 671 | " n_iter_no_change=5, n_jobs=-1, penalty='l2', power_t=0.5,\n", 672 | " random_state=None, shuffle=True, tol=0.001,\n", 673 | " validation_fraction=0.1, verbose=0, warm_start=False)" 674 | ] 675 | }, 676 | "execution_count": 36, 677 | "metadata": {}, 678 | "output_type": "execute_result" 679 | } 680 | ], 681 | "source": [ 682 | "run_log_reg(x_train, x_test, y_train, y_test, alpha = 1e-4)" 683 | ] 684 | }, 685 | { 686 | "cell_type": "markdown", 687 | "metadata": { 688 | "Collapsed": "false" 689 | }, 690 | "source": [ 691 | "High Dimensional, overfitting, keep for feature selection layer" 692 | ] 693 | }, 694 | { 695 | "cell_type": "markdown", 696 | "metadata": { 697 | "Collapsed": "false" 698 | }, 699 | "source": [ 700 | "## Visualize IDF-Weighted Glove" 701 | ] 702 | }, 703 | { 704 | "cell_type": "code", 705 | "execution_count": 35, 706 | "metadata": { 707 | 
"Collapsed": "false" 708 | }, 709 | "outputs": [ 710 | { 711 | "data": { 712 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAEWCAYAAABv+EDhAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nO3dd3xUVfr48c+TySSTShISOgFEBJQSMGJBBUUEG5a1d/2pa3e/a1nFXVfd1dXVrxVdxXXFgi4KYsUCX1BUEAgYkSrFACGUhJDeJpnz++NOQiqk3ynP+/WaF7ln7sx9ZhLOc+85554jxhiUUkoFnxC7A1BKKWUPTQBKKRWkNAEopVSQ0gSglFJBShOAUkoFKU0ASikVpDQBKOUlIhkicprdcVQTkaki8u+DPH+tiHzfmTG1JxGZISJ/9/58kohstDumYKMJwI+ISFGth0dESmttXyEicSLyHxHZLSKFIvKriNxf6/VGRH4RkZBaZX8XkRnen/t79ymq97ikEz6bEZHDvT8/LCJu72eo/hzTRKRnrf3He7+D2nF+epD3DxORh0Rko4gUi8hOEflCRE7v6M92kJgO+vs0xjxujLnBu2/17ya0nY79jYiUNff762jGmO+MMYPtOn6wapc/JtU5jDHR1T+LSAZwgzFmQa2yN4AoYCiQDxwBDKv3Nr2AS4F3D3KoOGNMZTuF3VqzjDFXiogT63M8AqwUkaONMbu8+2QZY/o08/1mA72Bq4GfvGWnAmcBX7dj3M12qN9nJ7jdGNPkFYYKfHoFEFiOAd41xuw3xniMMRuMMbPr7fNP4JH2OJMUkV4i8omI5IrIZhG5sdZzD4vI+yLylvcsfq2IpLb0GMYYtzFmLXAJkA3c3Yo4TwMmAucaY5YZYyq8jy+NMXc18ZpwEXlORLK8j+dEJNz73HoRObvWvqEiki0io73bx4nIEhHJE5GfRWR8S2P2vs/DIvKOd3Ox998879n68Y3sP0RE5nt/HxtF5OJWHne8iGSKyN0isldEdonIdbWejxCR/xWRbSKSLyLfi0iE97kp3t91nvcqY2it140SkVXev4dZgKv+MWttZ4jIPSKy2nuMWSJSe//7vHFlicgN9a4gzxSRdd7j7BSRe1rzPQQDTQCB5UfgMRG5TkQGNbHPh0ABcG07HO+/QCbWVcWFwOMicmqt56d494kDPgGmtfZAxpgq4GPgpFa8/DRgmTEm85B7HvAgcByQAowExgB/9j73HnBZrX0nATnGmFUi0hv4HPg7kADcA8wRkaRWxF3byd5/44wx0caYpbWfFJEoYD7WlV03rKu8l0XkyFYerwfQBeuq6f8BL4lIvPe5p4GjgROwPuN9gEdEjsD6bv4AJAHzgE+9zW9hwEfA297XfAD87hAxXAxMBgYAI/D+zYrIZOCPWL/Xw4Hx9V73OvB7Y0wM1hXwwhZ/+iChCSCw3AHMBG4H1nnPys+ot48B/gL8xfufsjE53jO46sfQ+juISF9gLPAnY0yZMSYd+DdWE0u1740x87yV99tYFWlbZGFVHtV61YuzqTPeRGB3rdgTvPvni0hZE6+5AnjUGLPXGJON1QR1lfe5d4EpIhLp3b4cq+IDuBKY5/3cHmPMfCANOLOlH7aFzgYyjDFvGGMqjTE/AXOAiw7ymhfqfX9/q/WcG+vzu40x84AiYLBY/UfXA3cZY3YaY6qMMUuMMeVYV2mfG2PmG2PcWIkiAitRHAc4gee87zkbWHGIz/SCMSbLGJMLfIqVjMFKDG8YY9YaY0qAh+u9zg0cKSKx3qvhVYc4TtDSBBBAjDGl3o7Do4GuwPvAByKSUG+/eVhn7r9v4q0SjTFxtR7rG9mnF5BrjCmsVbYN64yx2u5aP5cArjY2PfUGcmttZ9WL831o0LmaDOwDajqQjTG5xpg4rLPY8CaO1cv7eapt85ZhjNkMrAfO8SaBKRzoU+kHXFS7YgVOrH38DtIPOLbeca/AOpNvyp31vr+/1HpuX71+oBIgGiuZuoAtjbxfne/MGOMBdmD93noBO03d2Se3cXD1/36q+0x6ed+3Wu2fwbqyOBPYJiLfNtZcpiyaAAKUMaYAeByrU3hAI7s8CEwFIht5rjmygAQRialVlgzsbOX7HZT3zPMc4LtD7ettIql+bAf+DzhGRJrbYQzW5+tXazvZW1atuhnoXGCdNymAVRm9Xa9ijTLGPNGCYzfmUNP27gC+rXfcaGPMLW08bn05QBkwsJHn6nxnIiJAX6y/iV1Ab29ZteRWxrALqP277Fv7SWPMCmPMuVhNYR9hnQipRmgCCCAi8hcROcbb5uoC7gLygAbjq40x3wBrgGtacyxjzA5gCfAPEXGJyAistuJ3Dv7KlvF2sA7FqnB7AM+0ItavgUXARyJyrPf7cWI1SzTlPeDPIpIkIonAQ9T9bP8FTgduoe6IqnewrgwmiYjD+92Mb2HyaUw24AEOa+L5z4AjROQqEXF6H8c01nzXFt6z+v8Az4g1CMAhIseL1UH+PnCWiEzwfr93A+VYfydLgUrgTm9sF2D1q7TG+8B1IjLUewVWc+Xi/d1eISJdvM1QBVjfm2qEJoDAYoA3sM7SsrBGvpxljClqYv8/U7dNvVr1SJPqxx+beP1lQH/vseYCf23HYYyXiEgR1nDWT7CacY42xmQd/GVNOh+rknwHKyn+htVEMqmJ/f+O1Xa/GvgFWOUtA8A7FHUpVvv2rFrlO7CuCqZiVdo7gHtp4/81b1v3Y8AP3iae4+o9X4iVkC7F+n3sBp6k6SYugGn1fs8rmxnOPVjfyQqsJrkngRBjzEasPpAXsf4GzwHOqR51BVyA1ZGbi9Vf8GEzj1eHMeYL4AWspL4Za/ADWMkGrL6aDBEpAG7G+j2rRoguCKOU8mfeq5w1QLgP3L/iV/QKQCnld0TkfLHu1YjHugL5VCv/ltMEoJTyR78H9mKNRqrC6otRLaRNQEopFaT0CkAppYKUX00Gl5iYaPr37293GEop5VdWrlyZY4xpMB2JXyWA/v37k5aWZncYSinlV0Sk0buutQlIKaWClCYApZQKUpoAlFIqSPlVH4AKXG63m8zMTMrKmpqdWXUkl8tFnz59cDqddoeiOpEmAOUTMjMziYmJoX///tSdMFJ1NGMM+/btIzMzkwEDGps4VgUqbQJSPqGsrIyuXbtq5W8DEaFr16569VVfWT4U7oaS3EPv66c0ASifoZW/ffS7r6cgCz78Pbw4Gv57OezbAgE4a4LtCcA7n/hPIvKZ3bEopRTFOTD39/DrF1BRDNuXwtvnQXG23ZG1O9sTANaiJY0tOahUp4qOjj7o8xkZGQwbNqxF73nttdcye/bstoSlOlt5Ify2uG5Z3nYrGQQYWxOAd5Wks7AWE1dKKXsVZUPOrxDfv265MxKcLltC6kh2XwE8B9zHQZZsE5GbRCRNRNKyswPvEk
z5nqKiIiZMmMDo0aMZPnw4H3/8cc1zlZWVXHHFFQwdOpQLL7yQkpISAFauXMm4ceM4+uijmTRpErt27bIrfNUWpgqWvQqTn4Aw7xWhIwymvACuOHtj6wC2JQARORvYa4w56DJ0xpjpxphUY0xqUlKDuYyUancul4u5c+eyatUqFi1axN133031tOkbN27k1ltvZf369cTGxvLyyy/jdru54447mD17NitXruT666/nwQcftPlTqFYJj4X4frDqTbjiA7jmU7j+S+h/Mjgj7I6u3dl5H8BYYIqInAm4gFgReccYc6WNMSmFMYapU6eyePFiQkJC2LlzJ3v27AGgb9++jB07FoArr7ySF154gcmTJ7NmzRomTpwIQFVVFT179rQtftUGYZFwylRY+jJ8eid0OxIm/g2iu9kdWYewLQEYYx4AHgAQkfHAPVr5K18wc+ZMsrOzWblyJU6nk/79+9eMka8/XFJEMMZw1FFHsXTpUjvCVe0tKgnG3w/H3WKd9YfH2B1Rh7G7D0Apn5Ofn0+3bt1wOp0sWrSIbdsOzKS7ffv2mor+3Xff5cQTT2Tw4MFkZ2fXlLvdbtauXWtL7KqdhIZbZ/0BXPmDjyQAY8w3xpiz7Y5DKYArrriCtLQ0hg8fzltvvcWQIUNqnhs8eDAvvfQSQ4cOZf/+/dxyyy2EhYUxe/Zs/vSnPzFy5EhSUlJYsmSJjZ9AqebxqzWBU1NTjS4IE5jWr1/P0KFD7Q4jqOnvIHCJyEpjTGr9cp+4AlBKKdX5NAEopVSQ0gSglFJBShOAUkoFKU0ASikVpDQBKKVUkNIEoFQTHn74YZ5++mkAHnroIRYsWNDkvjNmzOD2229vUN7S6aCzsrK48MILAUhPT2fevHktjFqp5tM1gZVqhkcffbRTjtOrV6+ahJGenk5aWhpnnnlmpxxbBR+9AlB+6aOfdjL2iYUMuP9zxj6xkI9+2tnm93zrrbcYMWIEI0eO5KqrrqrzXO0z+RUrVnDCCScwcuRIxowZQ2FhYZ19P//8c44//nhycnIAWLBgAampqRxxxBF89pm18F1GRgYnnXQSo0ePZvTo0TV3DlcvOlNRUcFDDz3ErFmzSElJYdasWW3+fErVp1cAyu989NNOHvjwF0rdVQDszCvlgQ9/AeC8Ub1b9Z5r167l73//O0uWLCExMZHc3FxeeOGFBvtVVFRwySWXMGvWLI455hgKCgqIiDgwTfDcuXN55plnmDdvHvHx8YBVqS9fvpwtW7ZwyimnsHnzZrp168b8+fNxuVxs2rSJyy67jNp3uYeFhfHoo4+SlpbGtGnTWvWZlDoUTQDK7zz11caayr9aqbuKp77a2OoEsHDhQi666CISExMBSEhIaHS/jRs30rNnT4455hgAYmNj67xHWloaX3/9dZ3yiy++mJCQEAYNGsRhhx3Ghg0bGDBgALfffjvp6ek4HA5+/fXXVsWtVFtoE5DyO1l5pS0q7ywDBw6ksLCwQWXe2BTSzz77LN27d+fnn38mLS2NioqKzgxVKUATgPJDveIaX5mpqfLmOPXUU/nggw/Yt28fALm5uY3uN3jwYHbt2sWKFSsAKCwspLKyEoB+/foxZ84crr766jrTQX/wwQd4PB62bNnC1q1bGTx4MPn5+fTs2ZOQkBDefvttqqqqGhwrJiamQf+CUu1JE4DyO/dOGkyE01GnLMLp4N5Jg1v9nkcddRQPPvgg48aNY+TIkfzxj39sdL+wsDBmzZrFHXfcwciRI5k4cWLNYjEAQ4YMYebMmVx00UVs2bIFgOTkZMaMGcMZZ5zBK6+8gsvl4tZbb+XNN99k5MiRbNiwgaioqAbHOuWUU1i3bp12AqsOo9NBK5/Q0qmIP/ppJ099tZGsvFJ6xUVw76TBrW7/VxadDjpwNTUdtHYCK7903qjeWuEr1UbaBKSUUkHKtgQgIi4RWS4iP4vIWhF5xK5YlFIqGNnZBFQOnGqMKRIRJ/C9iHxhjPnRxpiUUipo2JYAjNX7XOTddHof/tMjrZRSfs7WPgARcYhIOrAXmG+MWdbIPjeJSJqIpGVnZ3d+kEopFaBsTQDGmCpjTArQBxgjIsMa2We6MSbVGJOalJTU+UEq1YgZM2aQlZXVotfUnlDuhhtuYN26dU3uW3sq6trGjx9PS4ZCp6WlceeddwLwzTff1Ew6pxT4yDBQY0yeiCwCJgNr7I5HqUOZMWMGw4YNo1evXq16/b///e92jqhxqamppKZaw7+/+eYboqOjOeGEEzrl2Mr32TkKKElE4rw/RwATgQ12xaP8zOr34dlh8HCc9e/q99v0dhkZGQwdOpQbb7yRo446itNPP53S0lLS09M57rjjGDFiBOeffz779+9n9uzZpKWlccUVV5CSkkJpacM5iJ588kmGDx/OyJEjuf/++xs8X/tM/ssvv2T06NGMHDmSCRMmNNj3tdde44wzzqg5zttvv01KSgrDhg1j+fLlACxfvpzjjz+eUaNGccIJJ7Bx40bAqvTPPvtsMjIyeOWVV3j22WdJSUnhu+++a9P3pQKDnVcAPYE3RcSBlYjeN8Z8ZmM8yl+sfh8+vRPc3oo3f4e1DTDi4la/7aZNm3jvvfd47bXXuPjii5kzZw7//Oc/efHFFxk3bhwPPfQQjzzyCM899xzTpk3j6aefrjm7ru2LL77g448/ZtmyZURGRjY5rxBAdnY2N954I4sXL2bAgAEN9p02bRrz58/no48+Ijw8HICSkhLS09NZvHgx119/PWvWrGHIkCF89913hIaGsmDBAqZOncqcOXNq3qd///7cfPPNREdHc88997T6O1KBxc5RQKuBUXYdX7WQMVC0B3alQ2gEdBsK0d3sieX/Hj1Q+Vdzl1rlbUgAAwYMICUlBYCjjz6aLVu2kJeXx7hx4wC45ppruOiiiw75PgsWLOC6664jMjISaHpqaYAff/yRk08+mQEDBjTY96233qJv37589NFHOJ3OmvLLLrsMgJNPPpmCggLy8vIoLCzkmmuuYdOmTYgIbre7hZ9eBSO9E1g1T0EWvDIW3r0E3poCM86Cor32xJKf2bLyZqo+wwZwOBzk5eU163XLli0jJSWFlJQUPvnkkzbFUNvw4cPJyMggM7Pu52pseum//OUvnHLKKaxZs4ZPP/20zgR1SjVFE4A6NE8lLHsVinMOlOX8Cr/Z1I7cpU/Lylt7mC5diI+Pr2kvf/vtt2uuBmpP1XzssceSnp5Oeno6U6ZMYeLEibzxxhuUlJQATU8tDXDcccexePFifvvttwb7jho1ildffZUpU6bUGXFUPTPo999/T5cuXejSpQv5+fn07m3NjTRjxoxGj6XTS6v6NAGoQ/NUQUEjZ9cFLRsG2W4mPATOenP/OyOs8nb25ptvcu+99zJixAjS09N56CHrGNdeey0333xzo53AkydPZsqUKaSmppKSktLocM5qSUlJTJ8+nQsuuICRI0dyySWX1Hn+xBNP5Omnn+ass86qWWPY5XIxatQobr75Zl5//XUA7rvvPh544AFGjRpVsz5Bfeeccw5z587VTmBVQ
6eDVs2zYzm8PvHAtsMJt6dBfP92efsWT0W8+n2rzT8/0zrzn/BQm9r/lU4HHch0OmjVNkmD4eqP4bv/tTqBT/0zRHe3L54RF2uFr5pWtAfWfWI1VY66EuIHgCv20K8LMpoAVPO4usBh46FnCogDXDF2R6RU44r2woyzrcofYPl0uPwDOOJ0e+PyQdoHoFomIq7DKn9/ao4MNAH13ednHqj8q33zeN1BDArQBKB8hMvlYt++fYFVEfkJYwz79u3D5XLZHYrqZNoEpHxCnz59yMzMRGd8tYfL5aJPn/YdRmubLn0g8Yi6VwHjp0JUon0x+ShNAMonOJ3OmrthlWqT6G5w7Wew/lPI3gijrmq30WqBRhOAUirwRHeHY26wOwqfpwlAKdU+KoqhNM+6QbBLL3DFQViU3VFByT7I3Qo7VkD/EyGuL0TE2x2VT9AEoJRqu8oK2Px/MPta687xEAdc9CYMmgShYfbFVV4I3z8LS148UDbpMUi9AZza6a2jgJRSbVeaC5/cYVX+YP37yR1WuZ3KC+HHl+uWLfoHlDVvor9ApwlAKdV2HnfDSrV0vzWRoJ2M50BSqlZZak1vrjQBKKXaQWgE9B5dt6xPKoTa3MzijITDT6tbNuIyCIu2Jx4fo30ASqm2i0qES96BLx+EHUuh7/Ew+XH7x95HJsB5r8Dq/8Jvi+GIyXDkuTqViZfOBqqUaj/lhdZooLAoCPehStZTZa0a54yEkOBr+GhqNlA7F4XvKyKLRGSdiKwVkbvsikUp1U7CYyCmh29V/mCNSgqPDsrK/2DsbAKqBO42xqwSkRhgpYjMN8asszEmFWhK9kGV26qQfGFMulI+xLZ0aIzZZYxZ5f25EFgP9LYrHhVgqqpg73p47zL41/HwxZ/sW8NYKR/lE53AItIfGAUsa+S5m4CbAJKTkzs1LuXHSrJhxplQ4h2H/tPb1pDAM5/SKwGlvGxvEBORaGAO8AdjTEH9540x040xqcaY1KSkpM4PUPmn0v0HKv9qGz6zOimVUoDNCUBEnFiV/0xjzId2xqICTHgsSL0/764DIcQnLnqV8gl2jgIS4HVgvTHmGbviUB2vsspDdmEZOYXleDydNOw4PAZOewRErG1XFzjnRfvHpSvlQ+w8HRoLXAX8IiLp3rKpxph5Nsak2tn+4go+Ts9i+uIthDsd3Dd5MCcenkiMy9mxB3bFwtHXwrALrBkqoxIhUit/pWqzLQEYY74HxK7jq86RviOPhz9dW7N9yzur+OoPJzO4RwcnALCSgCvWWiFKKdWA7Z3AKnCVu6uYvTKzQfnX63bbEI1Sqj5NAKrDOB0hHNUrtkH50J4Ny5RSnU8TgOowISHCRal9GNz9wLQAJw1KZFTfOBujUkpV0zFxqkMlxbiYeeOx5JW4CQ0RYiOcJETZuEKUUm1VtBeKc6whxZEJfj2yTBOA6nCJ0eEkRofbHYbvMQaKs61/I+LtXTpRNU/hHnjrHMjeaG33G2stfRntnzepahOQjyl3V7G3oIz80gq7Q1EdqbwINi+AGWfB9JNh6TRr4jrluzweSH/nQOUPsO0H2L7UvpjaSBOAD9lXVM7TX//K+S8v4baZq9iyt4iqzrpxSnWu4r3w7kWQ8ysU7ob/e8RKCMp3mUrYs7Zh+V7/ncBYE4CPKHVXMW3hZl77bis780r5fvM+LnxlCfuKyymvrKKwzG13iKo9bf2m4bq0P8+CMp2ryGc5wmDUVQ3Ljzy382NpJ5oAfERhqZtPV2fVKdtf4mZvQTn//HIDt81cxbxfdrG/RJuGAkLi4IZl3Y+yfw1ddXA9U+CcFyDhMEgaApe+B7H+O4u9dgL7iFBHCH3jI8kpqlvBV3kMb/yQgcfA4k05PHbeMC4dk4wjRG+i9mtJg2HoFFj/ibXddSAcdyuEdsId0qr1IuMh5UoYPBkIsUYAif/+X9QrAB+REBXGY+cPIyrMUVN287jD+H5zDrW7Af7zQ4ZeBQSCqEQ45zm4Mx1uWw7XfQmxPe2OSjWHwwHR3a2RP35c+YNeAfiUQd2jWXjPeHbmlZIYHY4xhlP/99s6+3SJCMXh5390yiuyq/VQyiZ6BeBDnA4H3WNdjE6OJzkhkujwUI4/7EAFERoiTD1rKPF6I5VSqh3oFYAP6xodznOXprBpTyEZ+0o48fBEEmO08lcq4FRVWcuY7l1nXRXG9u6UO4w1Afi46rtojx9odyRKqQ6TlwGvnQJl+db24afB+a92eBLQJiCllLJTeREseORA5Q/WTYF52zr80HoFoFSgKdoLO1dBYZZ1JhnVDZx6f0G7c5dB0R5Y+5E1KdygiRDTo+XvU1UOhTsblhfuaXuMh6AJQKlAUrQX3rkAdv9ibTvC4MZF0GOYvXEForxt8OpJUFlubcclww0LrCGiLRGRAEdfB5lpB8qckdArpf1ibYKtTUAi8h8R2Ssia+yMQ6mAsW/LgcofoKrCmmeorMC+mAJRRQl8+88DlT9A3nbYsazl7yUCg8+07jDuORIOnwg3LoSojp9h1O4rgBnANOAtm+NQKjCUNzKXUHkheKo6Pxawjlu8F3atBlecNYWCn06dXJeBytKGxe7yhmXNEZlgzTM05CxwOMHVpW3hNZOtCcAYs1hE+tsZg1K2K9kH7lIQB0TEgTOi9e/Vc4RVmZTkHigbe6c1hYEd8nfAq+OgLM/a7jUaLp8F0d3siae9hEXBSXfDxnkHJvWLiIf+J7b+PUNCOn1xGbuvAA5JRG4CbgJITk62ORql2lnhHphzA2Qsttp9T3sERlxsJYLWiOoGNy2GH56Dgp1w7M3WBGZ2cJfC4qcOVP4AWatgzzr/TwBgTej3++9g6UtWxX3szX73uXw+ARhjpgPTAVJTU3VyfBU43GXww/NW5Q/gLoEv7oXDT219AggJgbi+MOkf4HFbZ6p2qXJDwa6G5YWNlPmj8GjoMRymvAgSAiGOQ7/Gx+h9AErZpaLIWlGqvj3tsMBIaJi9lT+AKxbG3FS3LNQFA06yJ56O4nD6ZeUPmgCUsk94DAwY17C8ewAN2eyVAhf+B/oeC0dMtoakdsLoFtU8tjYBich7wHggUUQygb8aY163MyalOk1oOJxwO2Svh01fWwnh9Mc7vSOwwxTnwNxboKrMGt3iLgWM9bmVT7B7FNBldh5fKdtFd4MLpntHAYVYI0kCpYIs2gNbF1o/b1ti/fvbYrhkpn2jklQdPt8JrFTAi4i3HoGmsZvPSnPBU9n5sahGaR+AUqpjJBzWsDnrmBut+xSUT9ArAKVUx4hKsjp9v3kS9v8Go6+xJkzz0xEzgUgTgFKqY4SEWBOknfW0NWdOa+9tUB1GE4BSqmM5I9o2vYXqMJoAlFK+w1NlDR/1VFqjoQJlSKyP0gSg/FtFsbWikgCRidq+7M8qK2DnSphzPRRkWVMjX/KO1YykOoSOAvIB+4sr2JlXyq78UgrL3HaH4z+Ks+HLB+D54fDaqdYyehXFdkelWqs0F9692Kr8AXb9DHNvrjuzqWpXTSYAEZmnUzV3vJyicv4w6yfGPrGQsU8s5Jn5v5JbXNHp
MezILWF3fhmlFTbNG99SVZWQ9gasetPqYMzPhPcutaZWVv6pvAjK6907sH2pNamc6hAHuwJ4A/haRB4UEWdnBRRMPB7Dx+k7+fbXHGvbwBs/ZLA1u6jTYsjKK+WK15Zx0j8XMe6pRXywcod/XIWU5cOGz+uWGY911qj8U3h0wwnseh8NDm2p7ihNJgBjzAfAaCAWSBORe0Tkj9WPToswgJVXVrF0S8Mz1pXb9nfK8YvK3fxj3no27in0xuPhr5+sJa/EDxKAM8Ja/KS+roM6PxbVPlxxcPHbB+6K7joQzn8VIrvaG1cAO1RqrQCKgXAgBvB0eERBxOV0MPHIHixYv7dO+djDO2fkQ2mFh58z8+uUGQM780rpmxDZKTG0WlgkjL8ftv8IOb9a8+iccKffLcihanG6YMDJcOuP1lrGoS79fXawJhOAiEwGngE+AUYbY0o6LaogISJMPLI7P2cm80HaDsJDHfzhtEH0ie+cMdPR4aGcNCiRmcu215SFhgj9fL3yrxbbG6793Or4dYRZs2m6Yu2OSrWFwx1APnkAABdBSURBVAkxPeyOImgc7ArgQeAiY8zazgomGCVEhfHgmUO5a4LVdBEX4STc2b5DGas8hn1F5WTll9IlIoz4SCdxkWFEhFkJZ3d+GQs37iUpOpwnfjeCLhF+1OWjZ4hKtVqTCcAYE2DL9viuqPBQosI7rqNre24JF7z8A/u9bfsXp/bhgTOGEh8VRlKMi2cvSaHUXYVDhPhIJw6Hjg5WKhgE/P/0glI3GTnFfL46i63ZRRSU+kEHZzsqLHPz2Ofraip/gPfTMskpKq/ZdoQIxsDugjJyiisoc/vJUFClVJsE9PiqcncVn63exdS5v9SUPXT2kVw2JpmIsOC4Y7S80sP23IbdN3sKyhjUPYYydxULN+zlf2alU+kxhIeG8NrVqYwd2FWvBJQKcAH9Pzyv1M3j89bXKfvnVxso8Idx7u0kLsLJ+aN61ymLcDoY1D0GgPxSN/fPWU2lxwBWwrj7g5/ZV9K5N6MppTqf3WsCTwaeBxzAv40xT7Tn+xtjKCqvu/pQmdtDlbeyCwahjhAuOaYv5ZUeZq/MpEesi0fPPYr4SKujt6LSQ3G9u3+zC8vx6IBfpQKebQlARBzAS8BEIBNYISKfGGPWtdcxXE4Hpw5JYuGG7Jqy4wd2JaKdR9n4uoSocG4dfziXj0nG6QghPiqs5rkIp4NB3aLZtPfA3cdjBiQQHhrQF4dKKextAhoDbDbGbDXGVAD/Bc5tzwPERYbx5O9Gcuv4gYzo04WbTj6MFy5NqVMBBouw0BC6xboafPbEmHDeuO4YTh2SREJUGGcO78ELl44Kyu9IqWAjxtjTHCIiFwKTjTE3eLevAo41xtxeb7+bgJsAkpOTj962bVuLj+Wu9FBYXkl0eChhembbqIJSN2XuKiLDHES7/Og+AKXUIYnISmNMav1yn68NjTHTjTGpxpjUpKSkVr2HMzSEhKgwrfwPIjbCSbdYl1b+SgURO2vEnUDfWtt9vGVKKaU6gZ0JYAUwSEQGiEgYcCnWvENKKaU6gW2jgIwxlSJyO/AV1jDQ/+i8Q0p1IE8VFO2FHcutifN6DNO5lIKcrfcBGGPmAfPsjEGpoJGfCa+eDGV51nbSELjmU00CQUx7RZUKBpUV8MNzByp/gOwNsGOZfTGpQyvaYy19uvBvkLMJytt3zeuAngtI2aOyykN+qZvIMAcRYfon5hM8lVbzT33F2Q3LlG8o2gOvT4L9v1nb3z0DNyywlslsJ3oFoNrVvqJyXvl2C1f/Zzl/+XgtWXmldoekwFpB7fjb6paFuuDwifbEow5tz/oDlT9Ya14vegzKCtrtEHp6ptpNaUUlz//fJt5aat2stzargBUZucy++QSSYsJtjk7R/Si4+mP44XmrE/iUqdr+78s8lY2UVVmJoJ1oAlDtprCskjkrM+uUbdtXQnF5ZdAkgOLySgrLKqnyeIgICyXBl6bUcHWBw8ZDr9EQ4oCwKLsjUgfTYzjE9ITCXQfKTr4PIuLa7RCaAFS7ERGSYsIp3ldSq4yguQM7r6SCGUsyeGnRZtxVhuMOS+DFy0b7XvLTdZN9R1UVVBSCMxJC650sxHSHGxfBT29bI7jG3Ajx/dv18MHxPzMIGWPILizjt5xidueXUlTe8WsgJEaH8ffzhuEIkZqy68cOILoDl7v0Jbvyy3huwSbcVdb8Wj9uzeXNpRlUVOrc2qoRxTnw40vw38vh2yetTt/6YnvCuPvg7OesK4LwmHYNITj+Zwah7bklXP7aMnbmleIIEe6dNJjLxyQT24ELvosIo/vF8919p7BmZz79E6PoFhPeocf0JWt25jcoW/5bLqUVVUFzFaSaqawQvv4z/Pyetb3tB9j2PVzyLkR1bbh/SMf8/WgCCEAFpW4e+XQtO70jcKo8hie+2MDZI3p2eGUcGRZKZFgoveIiOvQ4ncFd5WF/cQUVVR5coQ4SD9GUMzo5vkHZKYO7ERUeXOtPqGZwF8Ev79ct2/4juIuBRhJAB9HTkgBUXlnFxt1FDcp355fZEI1/KndXsWzrPiY9t5gTn1zExa8uJSPn4DfhJMWE8/j5w4h1hRIiMGVkTy5O7UOorq2sGhAIr9cXExJqPTqR/mUGoOhwJ6cOqTu8Lzw0hD4JkTZF5H/ySt38/u2V7C+x+k625hTzP7PSyS1ueq3k2AgnFx7dlwV/HMfSBybw2PnD6RrtYx3AyjdEJMDEv9UtG3tXu7fxH4o2AQWgiDAHd00YRGGZm3m/7KZvQgRP/m4E8UHSFt8eSioqG6yVnJ6ZR+UhFkuuXnlNqYMKDYMjp0DysdbkfD1HQmxvTQCqfSTGhPP384Yx9ayhhCCHbL9WdUWGhRITHkph+YGbcVL7xePsoM44FYRcXaxH4hG2haAJIIBFu5y6wlcrJUQ6mXvbWHbuL+XHrftIy8jlmYuDcz1pFbg0AShVT2WVh83Zxdw/ZzW/5RQz8cjuTLt8NN27aNOOCiyaAJSqJ7e4gkumL6Wg1Gr+mbNqJ44Q4eEpRxGps5uqAKINmkrVs6+4oqbyrzZ/3R6KyhqZnEsdmscD7hIwxu5IVD16OqNUPV0inIjUra8OS4rGqXfztlxRNvzyAWQshiFnwxGTISrR7qiUly1/0SJykYisFRGPiKTaEYNSTYlxhXL/GUOontIoPtLJPy4YTnykdgC3SMk+mHsTfPUAbPwCPr4NvvkHlDe8SVHZw64rgDXABcCrNh1fqSbFuJxcPiaZKSN7UVRWSZdIJ1218m+5ihLYsrBu2aq34OR7ITzanphUHbYkAGPMerAmD1NNyyupICuvjB8255CSHMfApGjfml8+gMW4nMS4nNDF7kj8mIRYj9oLmITqSCpf4vN9ACJyE3ATQHJyss3RdJ6KSg8fp2fx10/W1pRdeVwy900aEjSzayo/Fx4Nqf8PVrx2oGz8VIhoOGmeskeHJQARWQD0aOSpB40xHzf3fYwx04HpAKmpqUEzjKCo3E369v2EOUKoqLLOoN5dtp3bxh+uCeAgjDGUV3o
IDw3RK0y7ubrA+Adg2AWwYwUcNg7i+0Go3pXuKzosARhjTuuo9w5k+aVuft1TyDs/bqNXlwje//1x3P3BarZkF+Ex4NGhdE3aV1TOV2t38+2vOUwY2o0JQ7vRNUorG1tFdYWoE6DfCXZHohrh801Adqus8pBTVMHiTdlEh4dyTP94kmI6ph3TGMOSzTncMnNVTdmnq7N44oIRXPn6Mk4Y2JUIvRGpUfklFTz08Ro+/2U3AF+t3c2FR/fhr+ccabXl+4CCUjfF5ZWUVXqIDnd02N+RUs1lS20iIucDLwJJwOcikm6MmWRHLIeSlV/Gmc9/R5F3UrDkhEjm3HJ8h/znzS2p4KVvNtcpy9xfirvKw9/OPYozhvXUTuAmlFRUMW/N7jplc3/ayT2nD/aJBJBf4ua177fy0qLNGAN94iOYddPx9I73/4VzlP+y5T4AY8xcY0wfY0y4Maa7r1b+FZUe/vXNlprKH6ylFn/cmtshxwtBCGtk8ZCEqDAuP7afzuh5EB5jcNRr8w8NEXylGyC3pJxpCzfX3FyWub+UJ75cX+dvS6nOprc2HoTHGApKGy4Akl/aMQusx0eFcd/kIXUqrSN7xtI7PqLOQuuqofW7CrjiuH51ym44aQCxPnD2D7Ajt7RB2fpdhZTVW3NAqc6kDcoH4XI6uOnkgTXtygARTkeD1bba0/DeXVjwP+P4bHUW/ROjGDswkURdVeqQVm7fz/DesfzrytGs3pHPqOQ44iPDiAjzjfV4B3WLxukQ3FUHOvEnHdWdLjqiS9lIjB+NKklNTTVpaWmdeszCMjeb9xbxyrdbiXWFctsph9M73oXT4RsVi7LsyC1h4rPfkhQTzsDEaHbmlTLjujE+08ZeWlFJ+o48ps5dw+78Mqak9OK+SYN1yUjVKURkpTGmwbQ7mgCaqbSikhARwp1a8fuiisoq9hSU8+7y7YQIXD6mH91iw3H60ILsxhhyiiowGKLDQ3VqadVpmkoA+hfYTDr80reFhTromxDJnyYP6bRj5pe6CQ8NwdXMkwIRIUk78pUP0VpNqRbKK6ng21+z+e/yHfSJj+CuCYPoFRdBiHbUKz+jCUCpFvB4DF+u2c39H/5SU7Zww16++MNJdNMbu5Sf8Z0GUqX8QG5JBW8uzahTtq+4gq3ZxbbEo1RbaAJQqgWcIUJcRMO7sXWCPuWPNAEo1QJdIsN48Kyhde7YHnt4V7pr567yQ9oHoAJelcewr7ic0ooqXE4H8ZFOwkJbP5x3UPdovrl3PMt/y6V3fASHJUbpeH7llzQBqIC3eW8RV/9nGXsKyokJD2XaFaM4bkDXVt/TER7qoFdcBOeN6t3OkSrVubQJSAW0fUXl3PHeKvYUlANQWF7JbTN/Iq8Z8zlZU4GXU1jWMXM/KWU3vQJQAa3SY/h1T1GdsqLySkrdB5+ELbe4nFkrMvnop50kd43gwTOPJDkhUsf6q4CiVwAqoIU5QhidXHcN2sToMCIPMkmcu9LDW0u38eSXG9i4p5D56/Zywb+WkFNU3tHhKtWpNAGogBYfFcYLl6UwOjkOgIFJUbx5/RgSIpteWCevtILZKzPrlOUWV5CVX9ahsSrV2bQJSAW8PvGR/PuaVNxVBkeIHHJ67dCQELrHusjcX3cO/zgd6+/fqiqhJAcqy8AZAZGJEBLckzvacgUgIk+JyAYRWS0ic0Ukzo44VPBIiAqne6yr0cq/qKySrdlFvP7dVr7duBcReHTKUbicB/57XJzah7hITQB+q6oKsn6CV06E50fCq+Ngz1rwo9mQO4It00GLyOnAQmNMpYg8CWCM+dOhXmfndNAqcH2zcS/XzVhRUxecNCiR5y9NwV1l+HVPIT28iSNe12P2X4W74dWToGjvgbL4/vD/vobo7raF1Vl8ajpoY8zXtTZ/BC60Iw6lcorKeXze+jongt9tyiGvxM1hSdF0j9UJ3gJCZXndyh9gf4bVLBTEfKET+HrgC7uDUMHJGENJI+vyVlR5bIhGdZhQF3TpW7csaTA4grtZr8MSgIgsEJE1jTzOrbXPg0AlMPMg73OTiKSJSFp2dnZHhauCVHxUGDeceFidsgGJUXSN0qkdAkpUElw+C7oOtLa7DYVLZkJ0x63v7Q9sWxJSRK4Ffg9MMMaUNOc12gegOsL+kgpW/JbLByszGdIjhiuP66dNP4GqaC943BASBtFJdkfTaXyqD0BEJgP3AeOaW/kr1VHiI8M4/agenDQoEWdoCKEhvtAyWo+7FMoKQNDhi20R5Gf89dn1lz4NiAHmi0i6iLxiUxxK1YgIC/XNyr84BxY+Bi+OhtcmwMYvoLzQ7qhUALBrFNDhdhxXKb/j8cDaj2Dpi9Z2RRG8fyXcvgrCY+yNTfk9HzzdUUrVKC+AdXPrlhkD25faE48KKJoAlPJlzgjomdKwvNvQzo9FBRxNAEr5stBwOOH2uhX+6Gsgrp99MamAoZPBKeXrYnrC1Z9ARbF141JYNETo9FltVuUO+hvBNAEo5Q90+GL7KdoLa+ZC5jIYcTH0GQORCXZHZQtNAEqp4FGcA7OuhB3LrO01c2DCw3D8rVZzW5DRPgClVPCoKDpQ+Vdb8jyU5tkTj800ASilgoc0UuU5wqw7rIOQJgClVPAIi4bBZ9QtO+XPENHVnnhspn0ASqngEZkAU16EzBWwcxUMOdtaGMYRnFVhcH5qpVTwikqCwWdajyCnTUBKKRWkNAEopVSQ0gSglFJBShOAUkoFKU0ASikVpDQBKKVUkNIEEIQKSt0UlrntDkMpZTO7FoX/G3Au4AH2AtcaY7LsiCWYFJW5WZtVwIsLNxPqEO6eeAQDu0UTGaa3gygVjOy6AnjKGDPCGJMCfAY8ZFMcQSVjXwmXTP+R7zfn8M3GbM57eQl7C8rtDkspZRNbEoAxpqDWZhRg7IgjmFR6PLy5JKNOWZXH8FH6TnsCUm1TnAOFe6Bkv92RKD9m27W/iDwGXA3kA6ccZL+bgJsAkpOTOye4ABSC0DMuokF5jy4uG6JRreapgpxN8OENsGcN9DsRzn8FuvSxOzLlhzrsCkBEFojImkYe5wIYYx40xvQFZgK3N/U+xpjpxphUY0xqUlJSR4Ub8EJChMvHJJMUc2DRi35dIzl1iK405VeKc+Dt82D3L2AMZHwHH94IJbl2R6b8UIddARhjTmvmrjOBecBfOyoWZekeG87nd57Imp0FOB3CkB6xdRKC8gPuEijcVbds2xKo0r4c1XJ2jQIaZIzZ5N08F9hgRxzBRkToFuPi1CHa7OO3Ql3WnPYVRQfKEgeBOOyLSfktu0YBPeFtDloNnA7cZVMcyk9VVFaxp6CMrLxScosr7A6n80TGwwWvgTPSu90Vfve6LhqvWsWWKwBjzO/sOK4KDCUVlSz+NZs/zfmF/FI3o5PjefmK0cHRoR3qgoGnwh2rrOagsGgrCSjVCnonsPI7+aVubnv3J/JLrbuZV23fz+Pz1lFcXmlzZJ3E6YLYntB1IMR0D9rVrFTbaQJQficrr4wqT91bR5b/tj94EoBS7UQTgPI7veJcOEKkTtkx/eOJDN
eOUKVaQhOA8jtdXE6evzSFWJfV9DGyTxemnjWU6HCnzZEp5V+08VD5ncjwUE4/sgfH9E+gymNwhYaQEK33MyjVUpoAlF8KCw2he2wQjPpRqgNpE5BSSgUpTQBKKRWkNAEopVSQ0gSglFJBShOAUkoFKU0ASikVpMQY/1mNUUSygW02HT4RyLHp2L5Ov5vG6ffSNP1umtYR300/Y0yDFbX8KgHYSUTSjDGpdsfhi/S7aZx+L03T76ZpnfndaBOQUkoFKU0ASikVpDQBNN90uwPwYfrdNE6/l6bpd9O0TvtutA9AKaWClF4BKKVUkNIEoJRSQUoTQAuJyN0iYkQk0e5YfIWIPCUiG0RktYjMFZE4u2Oym4hMFpGNIrJZRO63Ox5fISJ9RWSRiKwTkbUicpfdMfkaEXGIyE8i8llHH0sTQAuISF/gdGC73bH4mPnAMGPMCOBX4AGb47GViDiAl4AzgCOBy0TkSHuj8hmVwN3GmCOB44Db9Ltp4C5gfWccSBNAyzwL3Adoz3ktxpivjTHVK7L/CPSxMx4fMAbYbIzZaoypAP4LnGtzTD7BGLPLGLPK+3MhVkXX296ofIeI9AHOAv7dGcfTBNBMInIusNMY87Pdsfi464Ev7A7CZr2BHbW2M9FKrgER6Q+MApbZG4lPeQ7rJNPTGQfTJSFrEZEFQI9GnnoQmIrV/BOUDvbdGGM+9u7zINYl/szOjE35HxGJBuYAfzDGFNgdjy8QkbOBvcaYlSIyvjOOqQmgFmPMaY2Vi8hwYADws4iA1cSxSkTGGGN2d2KItmnqu6kmItcCZwMTjN5cshPoW2u7j7dMASLixKr8ZxpjPrQ7Hh8yFpgiImcCLiBWRN4xxlzZUQfUG8FaQUQygFRjjM5miDXiBXgGGGeMybY7HruJSChWZ/gErIp/BXC5MWatrYH5ALHOoN4Eco0xf7A7Hl/lvQK4xxhzdkceR/sAVHuYBsQA80UkXUResTsgO3k7xG8HvsLq5HxfK/8aY4GrgFO9fyvp3jNeZQO9AlBKqSClVwBKKRWkNAEopVSQ0gSglFJBShOAUkoFKU0ASikVpDQBKNVK3pktfxORBO92vHe7v72RKdU8mgCUaiVjzA7gX8AT3qIngOnGmAzbglKqBfQ+AKXawDutwUrgP8CNQIoxxm1vVEo1j84FpFQbGGPcInIv8CVwulb+yp9oE5BSbXcGsAsYZncgSrWEJgCl2kBEUoCJWKtb/Y+I9LQ5JKWaTROAUq3kndnyX1hz2m8HngKetjcqpZpPE4BSrXcjsN0YM9+7/TIwVETG2RiTUs2mo4CUUipI6RWAUkoFKU0ASikVpDQBKKVUkNIEoJRSQUoTgFJKBSlNAEopFaQ0ASilVJD6/0g6HtMvDC+AAAAAAElFTkSuQmCC\n", 713 | "text/plain": [ 714 | "
" 715 | ] 716 | }, 717 | "metadata": { 718 | "needs_background": "light" 719 | }, 720 | "output_type": "display_data" 721 | } 722 | ], 723 | "source": [ 724 | "from MulticoreTSNE import MulticoreTSNE as TSNE\n", 725 | "import seaborn as sns\n", 726 | "import matplotlib.pyplot as plt\n", 727 | "\n", 728 | "\n", 729 | "tsne = TSNE(n_components = 2, n_jobs= -1, verbose = 10, perplexity = 30)\n", 730 | "tsne_data = tsne.fit_transform(x_train)\n", 731 | "\n", 732 | "tsne_data = pd.DataFrame(tsne_data, columns = ['X', 'Y'])\n", 733 | "tsne_data['label'] = train.label.values\n", 734 | "\n", 735 | "sns.scatterplot(x = 'X', y = 'Y', hue = 'label', data = tsne_data)\n", 736 | "plt.title('TSNE on IDF-Glove Title Encodings')\n", 737 | "plt.show()" 738 | ] 739 | }, 740 | { 741 | "cell_type": "markdown", 742 | "metadata": { 743 | "Collapsed": "false" 744 | }, 745 | "source": [ 746 | "This time we see some seperation between the 2 classes in the 2D projection. To some extent this explains the high accuracy we are able to get with simple Log Reg." 747 | ] 748 | }, 749 | { 750 | "cell_type": "markdown", 751 | "metadata": { 752 | "Collapsed": "false" 753 | }, 754 | "source": [ 755 | "To increase performance further, we can add some hand made features. Let's try this in the next section. " 756 | ] 757 | } 758 | ], 759 | "metadata": { 760 | "kernelspec": { 761 | "display_name": "Python 3", 762 | "language": "python", 763 | "name": "python3" 764 | }, 765 | "language_info": { 766 | "codemirror_mode": { 767 | "name": "ipython", 768 | "version": 3 769 | }, 770 | "file_extension": ".py", 771 | "mimetype": "text/x-python", 772 | "name": "python", 773 | "nbconvert_exporter": "python", 774 | "pygments_lexer": "ipython3", 775 | "version": "3.7.3" 776 | } 777 | }, 778 | "nbformat": 4, 779 | "nbformat_minor": 4 780 | } 781 | -------------------------------------------------------------------------------- /notebooks/Embeddings.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "Collapsed": "false" 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd \n", 12 | "import numpy as np\n", 13 | "\n", 14 | "train = pd.read_csv('../datasets/train.csv')\n", 15 | "test = pd.read_csv('../datasets/test.csv')" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "Collapsed": "false" 22 | }, 23 | "source": [ 24 | "## Utility Functions" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "Collapsed": "false" 31 | }, 32 | "source": [ 33 | "Before we start exploring embeddings lets write a couple of helper functions to run Logistic Regression and calculate evaluation metrics\n", 34 | "\n", 35 | "Since we want to optimize our model for F1-Scores, for all models we'll first predict the probability of the positive class. We'll then use these probabilities to get the Precision-Recall curve and from here we can select a threshold value that has the highest F1-score. To predict the labels we can simply use this threshold value." 
36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 8, 41 | "metadata": { 42 | "Collapsed": "false" 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "%matplotlib inline\n", 47 | "import matplotlib.pyplot as plt\n", 48 | "from sklearn.metrics import precision_recall_curve, f1_score, accuracy_score, roc_auc_score, confusion_matrix\n", 49 | "import seaborn as sns\n", 50 | "sns.set_palette(\"muted\")\n", 51 | " \n", 52 | "\n", 53 | "def calc_f1(p_and_r):\n", 54 | " p, r = p_and_r\n", 55 | " return (2*p*r)/(p+r)\n", 56 | "\n", 57 | "\n", 58 | "# Print the F1, Precision, Recall, ROC-AUC, and Accuracy Metrics \n", 59 | "# Since we are optimizing for F1 score - we will first calculate precision and recall and \n", 60 | "# then find the probability threshold value that gives us the best F1 score\n", 61 | "\n", 62 | "def print_model_metrics(y_test, y_test_prob, confusion = False, verbose = True, return_metrics = False):\n", 63 | "\n", 64 | " precision, recall, threshold = precision_recall_curve(y_test, y_test_prob, pos_label = 1)\n", 65 | " \n", 66 | " #Find the threshold value that gives the best F1 Score\n", 67 | " best_f1_index =np.argmax([calc_f1(p_r) for p_r in zip(precision, recall)])\n", 68 | " best_threshold, best_precision, best_recall = threshold[best_f1_index], precision[best_f1_index], recall[best_f1_index]\n", 69 | " \n", 70 | " # Calulcate predictions based on the threshold value\n", 71 | " y_test_pred = np.where(y_test_prob > best_threshold, 1, 0)\n", 72 | " \n", 73 | " # Calculate all metrics\n", 74 | " f1 = f1_score(y_test, y_test_pred, pos_label = 1, average = 'binary')\n", 75 | " roc_auc = roc_auc_score(y_test, y_test_prob)\n", 76 | " acc = accuracy_score(y_test, y_test_pred)\n", 77 | " \n", 78 | " \n", 79 | " if confusion:\n", 80 | " # Calculate and Display the confusion Matrix\n", 81 | " cm = confusion_matrix(y_test, y_test_pred)\n", 82 | "\n", 83 | " plt.title('Confusion Matrix')\n", 84 | " sns.set(font_scale=1.0) #for label size\n", 85 | " sns.heatmap(cm, annot = True, fmt = 'd', xticklabels = ['No Clickbait', 'Clickbait'], yticklabels = ['No Clickbait', 'Clickbait'], annot_kws={\"size\": 14}, cmap = 'Blues')# font size\n", 86 | "\n", 87 | " plt.xlabel('Truth')\n", 88 | " plt.ylabel('Prediction')\n", 89 | " \n", 90 | " if verbose:\n", 91 | " print('F1: {:.3f} | Pr: {:.3f} | Re: {:.3f} | AUC: {:.3f} | Accuracy: {:.3f} \\n'.format(f1, best_precision, best_recall, roc_auc, acc))\n", 92 | " \n", 93 | " if return_metrics:\n", 94 | " return np.array([f1, best_precision, best_recall, roc_auc, acc])" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 9, 100 | "metadata": { 101 | "Collapsed": "false" 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "# Run Simple Log Reg Model and Print metrics\n", 106 | "from sklearn.linear_model import SGDClassifier\n", 107 | "\n", 108 | "# Run log reg 10 times and average the result to reduce predction variance\n", 109 | "def run_log_reg(train_features, test_features, y_train, y_test, alpha = 1e-4, confusion = False, return_f1 = False, verbose = True):\n", 110 | " metrics = np.zeros(5)\n", 111 | " for _ in range(10):\n", 112 | " log_reg = SGDClassifier(loss = 'log', alpha = alpha, n_jobs = -1, penalty = 'l2')\n", 113 | " log_reg.fit(train_features, y_train)\n", 114 | " y_test_prob = log_reg.predict_proba(test_features)[:,1]\n", 115 | " metrics += print_model_metrics(y_test, y_test_prob, confusion = confusion, verbose = False, return_metrics = True)\n", 116 | " metrics /=10\n", 117 | " if verbose:\n", 118 
| " print('F1: {:.3f} | Pr: {:.3f} | Re: {:.3f} | AUC: {:.3f} | Accuracy: {:.3f} \\n'.format(*metrics))\n", 119 | " if return_f1:\n", 120 | " return f1\n", 121 | " return log_reg" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": { 127 | "Collapsed": "false" 128 | }, 129 | "source": [ 130 | "# Bag-of-Words, TF-IDF and Word Embeddings" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 10, 136 | "metadata": { 137 | "Collapsed": "false" 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "y_train = np.where(train.label.values == 'clickbait', 1, 0)\n", 142 | "y_test = np.where(test.label.values == 'clickbait', 1, 0)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": { 148 | "Collapsed": "false" 149 | }, 150 | "source": [ 151 | "## Bag of Words\n", 152 | "Let's start with simple Bag-Of-Words" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 16, 158 | "metadata": { 159 | "Collapsed": "false" 160 | }, 161 | "outputs": [ 162 | { 163 | "name": "stdout", 164 | "output_type": "stream", 165 | "text": [ 166 | "F1: 0.782 | Pr: 0.867 | Re: 0.714 | AUC: 0.837 | Accuracy: 0.801 \n", 167 | "\n" 168 | ] 169 | }, 170 | { 171 | "data": { 172 | "text/plain": [ 173 | "SGDClassifier(alpha=0.0001, average=False, class_weight=None,\n", 174 | " early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,\n", 175 | " l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,\n", 176 | " n_iter_no_change=5, n_jobs=-1, penalty='l2', power_t=0.5,\n", 177 | " random_state=None, shuffle=True, tol=0.001,\n", 178 | " validation_fraction=0.1, verbose=0, warm_start=False)" 179 | ] 180 | }, 181 | "execution_count": 16, 182 | "metadata": {}, 183 | "output_type": "execute_result" 184 | } 185 | ], 186 | "source": [ 187 | "from sklearn.feature_extraction.text import CountVectorizer\n", 188 | "\n", 189 | "bow = CountVectorizer()\n", 190 | "x_train = bow.fit_transform(train.title.values)\n", 191 | "x_test = bow.transform(test.title.values)\n", 192 | "\n", 193 | "run_log_reg(x_train, x_test, y_train, y_test)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": { 199 | "Collapsed": "false" 200 | }, 201 | "source": [ 202 | "## TF-IDF\n", 203 | "\n", 204 | "TFIDF should perform better than BoW since it uses document frequencies to normalize" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 17, 210 | "metadata": { 211 | "Collapsed": "false" 212 | }, 213 | "outputs": [ 214 | { 215 | "name": "stdout", 216 | "output_type": "stream", 217 | "text": [ 218 | "F1: 0.829 | Pr: 0.872 | Re: 0.790 | AUC: 0.896 | Accuracy: 0.837 \n", 219 | "\n" 220 | ] 221 | }, 222 | { 223 | "data": { 224 | "text/plain": [ 225 | "SGDClassifier(alpha=0.0001, average=False, class_weight=None,\n", 226 | " early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,\n", 227 | " l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,\n", 228 | " n_iter_no_change=5, n_jobs=-1, penalty='l2', power_t=0.5,\n", 229 | " random_state=None, shuffle=True, tol=0.001,\n", 230 | " validation_fraction=0.1, verbose=0, warm_start=False)" 231 | ] 232 | }, 233 | "execution_count": 17, 234 | "metadata": {}, 235 | "output_type": "execute_result" 236 | } 237 | ], 238 | "source": [ 239 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 240 | "\n", 241 | "tfidf = TfidfVectorizer()\n", 242 | "x_train = tfidf.fit_transform(train.title.values)\n", 243 | "x_test = tfidf.transform(test.title.values)\n", 
244 | "\n", 245 | "run_log_reg(x_train, x_test, y_train, y_test)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": { 251 | "Collapsed": "false" 252 | }, 253 | "source": [ 254 | "TFIDF performs marginally better than BoW. Although whats impressive here is the fact that we're getting an F1 score of 0.826 with just 50 datapoints. This is why Log Reg + TFIDF is a great baseline for NLP classification tasks.\n", 255 | "\n", 256 | "Next we'll try 100D glove vectors. " 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": { 262 | "Collapsed": "false" 263 | }, 264 | "source": [ 265 | "## GloVe" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 3, 271 | "metadata": { 272 | "Collapsed": "false" 273 | }, 274 | "outputs": [], 275 | "source": [ 276 | "# Load the glove vectors with PyMagnitude\n", 277 | "# PyMagnitude is a fantastic library that handles a lot of word vectorization tasks. \n", 278 | "\n", 279 | "from pymagnitude import *\n", 280 | "glove = Magnitude(\"../vectors/glove.6B.100d.magnitude\")" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 19, 286 | "metadata": { 287 | "Collapsed": "false" 288 | }, 289 | "outputs": [ 290 | { 291 | "data": { 292 | "application/vnd.jupyter.widget-view+json": { 293 | "model_id": "d3a232cdd47d43e98c6ba8119c46f337", 294 | "version_major": 2, 295 | "version_minor": 0 296 | }, 297 | "text/plain": [ 298 | "HBox(children=(IntProgress(value=0, max=50), HTML(value='')))" 299 | ] 300 | }, 301 | "metadata": {}, 302 | "output_type": "display_data" 303 | }, 304 | { 305 | "name": "stdout", 306 | "output_type": "stream", 307 | "text": [ 308 | "\n" 309 | ] 310 | }, 311 | { 312 | "data": { 313 | "application/vnd.jupyter.widget-view+json": { 314 | "model_id": "2dd36b0a7cbf4620863eb049b9c4bcf3", 315 | "version_major": 2, 316 | "version_minor": 0 317 | }, 318 | "text/plain": [ 319 | "HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))" 320 | ] 321 | }, 322 | "metadata": {}, 323 | "output_type": "display_data" 324 | }, 325 | { 326 | "name": "stdout", 327 | "output_type": "stream", 328 | "text": [ 329 | "\n" 330 | ] 331 | } 332 | ], 333 | "source": [ 334 | "# We'll use Average Glove here \n", 335 | "from tqdm import tqdm_notebook\n", 336 | "from nltk import word_tokenize\n", 337 | "\n", 338 | "\n", 339 | "def avg_glove(df):\n", 340 | " vectors = []\n", 341 | " for title in tqdm_notebook(df.title.values):\n", 342 | " vectors.append(np.average(glove.query(word_tokenize(title)), axis = 0))\n", 343 | " return np.array(vectors)\n", 344 | "\n", 345 | "x_train = avg_glove(train)\n", 346 | "x_test = avg_glove(test)" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 22, 352 | "metadata": { 353 | "Collapsed": "false" 354 | }, 355 | "outputs": [ 356 | { 357 | "name": "stdout", 358 | "output_type": "stream", 359 | "text": [ 360 | "F1: 0.929 | Pr: 0.909 | Re: 0.950 | AUC: 0.979 | Accuracy: 0.928 \n", 361 | "\n" 362 | ] 363 | }, 364 | { 365 | "data": { 366 | "text/plain": [ 367 | "SGDClassifier(alpha=0.0001, average=False, class_weight=None,\n", 368 | " early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,\n", 369 | " l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,\n", 370 | " n_iter_no_change=5, n_jobs=-1, penalty='l2', power_t=0.5,\n", 371 | " random_state=None, shuffle=True, tol=0.001,\n", 372 | " validation_fraction=0.1, verbose=0, warm_start=False)" 373 | ] 374 | }, 375 | "execution_count": 22, 376 | "metadata": {}, 377 | 
"output_type": "execute_result" 378 | } 379 | ], 380 | "source": [ 381 | "run_log_reg(x_train, x_test, y_train, y_test)" 382 | ] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "metadata": { 387 | "Collapsed": "false" 388 | }, 389 | "source": [ 390 | "Woah! That's a huge increase in F1 score with just a small change in embedding. The improved performance is justified since W2V are pretrained embeddings that contain a lot of contextual information. This would obviously contribute to the classifiers performance, especially when we have a very limited dataset. " 391 | ] 392 | }, 393 | { 394 | "cell_type": "markdown", 395 | "metadata": { 396 | "Collapsed": "false" 397 | }, 398 | "source": [ 399 | "### IDF-Weighted Glove" 400 | ] 401 | }, 402 | { 403 | "cell_type": "markdown", 404 | "metadata": { 405 | "Collapsed": "false" 406 | }, 407 | "source": [ 408 | "Instead of just taking the average of each word, what if we did weighted average - in particular IDF-Weighted average?" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 8, 414 | "metadata": { 415 | "Collapsed": "false" 416 | }, 417 | "outputs": [ 418 | { 419 | "data": { 420 | "text/plain": [ 421 | "TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n", 422 | " dtype=, encoding='utf-8',\n", 423 | " input='content', lowercase=True, max_df=1.0, max_features=None,\n", 424 | " min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,\n", 425 | " smooth_idf=True, stop_words=None, strip_accents=None,\n", 426 | " sublinear_tf=False, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n", 427 | " tokenizer=None, use_idf=True, vocabulary=None)" 428 | ] 429 | }, 430 | "execution_count": 8, 431 | "metadata": {}, 432 | "output_type": "execute_result" 433 | } 434 | ], 435 | "source": [ 436 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 437 | "\n", 438 | "tfidf = TfidfVectorizer()\n", 439 | "tfidf.fit(train.title.values)" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 9, 445 | "metadata": { 446 | "Collapsed": "false" 447 | }, 448 | "outputs": [], 449 | "source": [ 450 | "# Now lets create a dict so that for every word in our corpus we have a corresponding IDF value\n", 451 | "idf_dict = dict(zip(tfidf.get_feature_names(), tfidf.idf_))" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": 10, 457 | "metadata": { 458 | "Collapsed": "false" 459 | }, 460 | "outputs": [ 461 | { 462 | "data": { 463 | "application/vnd.jupyter.widget-view+json": { 464 | "model_id": "78d7c2c19c284f4ebfe4c84937ac90c8", 465 | "version_major": 2, 466 | "version_minor": 0 467 | }, 468 | "text/plain": [ 469 | "HBox(children=(IntProgress(value=0, max=50), HTML(value='')))" 470 | ] 471 | }, 472 | "metadata": {}, 473 | "output_type": "display_data" 474 | }, 475 | { 476 | "name": "stdout", 477 | "output_type": "stream", 478 | "text": [ 479 | "\n" 480 | ] 481 | }, 482 | { 483 | "data": { 484 | "application/vnd.jupyter.widget-view+json": { 485 | "model_id": "214e6af41d3240578c528df4dae08a08", 486 | "version_major": 2, 487 | "version_minor": 0 488 | }, 489 | "text/plain": [ 490 | "HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))" 491 | ] 492 | }, 493 | "metadata": {}, 494 | "output_type": "display_data" 495 | }, 496 | { 497 | "name": "stdout", 498 | "output_type": "stream", 499 | "text": [ 500 | "\n" 501 | ] 502 | } 503 | ], 504 | "source": [ 505 | "# Same as Avg Glove except instead of doing a regular average, we'll use the IDF values as weights. 
\n", 506 | "\n", 507 | "def tfidf_glove(df):\n", 508 | " vectors = []\n", 509 | " for title in tqdm_notebook(df.title.values):\n", 510 | " glove_vectors = glove.query(word_tokenize(title))\n", 511 | " weights = [idf_dict.get(word, 1) for word in word_tokenize(title)]\n", 512 | " vectors.append(np.average(glove_vectors, axis = 0, weights = weights))\n", 513 | " return np.array(vectors)\n", 514 | "\n", 515 | "x_train = tfidf_glove(train)\n", 516 | "x_test = tfidf_glove(test)" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": 26, 522 | "metadata": { 523 | "Collapsed": "false" 524 | }, 525 | "outputs": [ 526 | { 527 | "name": "stdout", 528 | "output_type": "stream", 529 | "text": [ 530 | "F1: 0.957 | Pr: 0.943 | Re: 0.971 | AUC: 0.989 | Accuracy: 0.956 \n", 531 | "\n" 532 | ] 533 | }, 534 | { 535 | "data": { 536 | "text/plain": [ 537 | "SGDClassifier(alpha=0.0001, average=False, class_weight=None,\n", 538 | " early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,\n", 539 | " l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,\n", 540 | " n_iter_no_change=5, n_jobs=-1, penalty='l2', power_t=0.5,\n", 541 | " random_state=None, shuffle=True, tol=0.001,\n", 542 | " validation_fraction=0.1, verbose=0, warm_start=False)" 543 | ] 544 | }, 545 | "execution_count": 26, 546 | "metadata": {}, 547 | "output_type": "execute_result" 548 | } 549 | ], 550 | "source": [ 551 | "run_log_reg(x_train, x_test, y_train, y_test)" 552 | ] 553 | }, 554 | { 555 | "cell_type": "markdown", 556 | "metadata": { 557 | "Collapsed": "false" 558 | }, 559 | "source": [ 560 | "Our F1 increased by 0.02 points. The increased performance makes sense - commonly occurring words get less weightage while less frequent (and perhaps more important) words have more say in the vector representation for the titles. \n", 561 | "\n", 562 | "Since GloVe worked so well, let's try one last embedding technique - Facebook's InferSent model. This model converts the entire sentence into a vector representation. However, a potential problem here is that the vector representations are 4096 dimensional which might cause our model to overfit easily. 
Let's give it a shot anyway" 563 | ] 564 | }, 565 | { 566 | "cell_type": "markdown", 567 | "metadata": { 568 | "Collapsed": "false" 569 | }, 570 | "source": [ 571 | "## InferSent" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": 27, 577 | "metadata": { 578 | "Collapsed": "false" 579 | }, 580 | "outputs": [ 581 | { 582 | "data": { 583 | "text/plain": [ 584 | "" 585 | ] 586 | }, 587 | "execution_count": 27, 588 | "metadata": {}, 589 | "output_type": "execute_result" 590 | } 591 | ], 592 | "source": [ 593 | "from InferSent.models import InferSent\n", 594 | "import torch\n", 595 | "\n", 596 | "# Uncomment the lines below with the paths to infersent pkl file and glove file\n", 597 | "\n", 598 | "V = 1\n", 599 | "#MODEL_PATH = 'path to infersent pkl file'\n", 600 | "params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,\n", 601 | " 'pool_type': 'max', 'dpout_model': 0.0, 'version': V}\n", 602 | "infersent = InferSent(params_model)\n", 603 | "infersent.load_state_dict(torch.load(MODEL_PATH))\n" 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": 28, 609 | "metadata": { 610 | "Collapsed": "false" 611 | }, 612 | "outputs": [], 613 | "source": [ 614 | "#W2V_PATH = 'path to glove file'\n", 615 | "infersent.set_w2v_path(W2V_PATH)" 616 | ] 617 | }, 618 | { 619 | "cell_type": "code", 620 | "execution_count": 29, 621 | "metadata": { 622 | "Collapsed": "false" 623 | }, 624 | "outputs": [ 625 | { 626 | "name": "stdout", 627 | "output_type": "stream", 628 | "text": [ 629 | "Found 338(/363) words with w2v vectors\n", 630 | "Vocab size : 338\n" 631 | ] 632 | } 633 | ], 634 | "source": [ 635 | "infersent.build_vocab(train.title.values, tokenize= False)" 636 | ] 637 | }, 638 | { 639 | "cell_type": "code", 640 | "execution_count": 30, 641 | "metadata": { 642 | "Collapsed": "false" 643 | }, 644 | "outputs": [], 645 | "source": [ 646 | "x_train = infersent.encode(train.title.values, tokenize= False)\n", 647 | "x_test = infersent.encode(test.title.values, tokenize= False)" 648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": 36, 653 | "metadata": { 654 | "Collapsed": "false" 655 | }, 656 | "outputs": [ 657 | { 658 | "name": "stdout", 659 | "output_type": "stream", 660 | "text": [ 661 | "F1: 0.927 | Pr: 0.912 | Re: 0.946 | AUC: 0.966 | Accuracy: 0.926 \n", 662 | "\n" 663 | ] 664 | }, 665 | { 666 | "data": { 667 | "text/plain": [ 668 | "SGDClassifier(alpha=0.0001, average=False, class_weight=None,\n", 669 | " early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,\n", 670 | " l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,\n", 671 | " n_iter_no_change=5, n_jobs=-1, penalty='l2', power_t=0.5,\n", 672 | " random_state=None, shuffle=True, tol=0.001,\n", 673 | " validation_fraction=0.1, verbose=0, warm_start=False)" 674 | ] 675 | }, 676 | "execution_count": 36, 677 | "metadata": {}, 678 | "output_type": "execute_result" 679 | } 680 | ], 681 | "source": [ 682 | "run_log_reg(x_train, x_test, y_train, y_test, alpha = 1e-4)" 683 | ] 684 | }, 685 | { 686 | "cell_type": "markdown", 687 | "metadata": { 688 | "Collapsed": "false" 689 | }, 690 | "source": [ 691 | "High Dimensional, overfitting, keep for feature selection layer" 692 | ] 693 | }, 694 | { 695 | "cell_type": "markdown", 696 | "metadata": { 697 | "Collapsed": "false" 698 | }, 699 | "source": [ 700 | "## Visualize IDF-Weighted Glove" 701 | ] 702 | }, 703 | { 704 | "cell_type": "code", 705 | "execution_count": 35, 706 | "metadata": { 707 | 
"Collapsed": "false" 708 | }, 709 | "outputs": [ 710 | { 711 | "data": { 712 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAEWCAYAAABv+EDhAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nO3dd3xUVfr48c+TySSTShISOgFEBJQSMGJBBUUEG5a1d/2pa3e/a1nFXVfd1dXVrxVdxXXFgi4KYsUCX1BUEAgYkSrFACGUhJDeJpnz++NOQiqk3ynP+/WaF7ln7sx9ZhLOc+85554jxhiUUkoFnxC7A1BKKWUPTQBKKRWkNAEopVSQ0gSglFJBShOAUkoFKU0ASikVpDQBKOUlIhkicprdcVQTkaki8u+DPH+tiHzfmTG1JxGZISJ/9/58kohstDumYKMJwI+ISFGth0dESmttXyEicSLyHxHZLSKFIvKriNxf6/VGRH4RkZBaZX8XkRnen/t79ymq97ikEz6bEZHDvT8/LCJu72eo/hzTRKRnrf3He7+D2nF+epD3DxORh0Rko4gUi8hOEflCRE7v6M92kJgO+vs0xjxujLnBu2/17ya0nY79jYiUNff762jGmO+MMYPtOn6wapc/JtU5jDHR1T+LSAZwgzFmQa2yN4AoYCiQDxwBDKv3Nr2AS4F3D3KoOGNMZTuF3VqzjDFXiogT63M8AqwUkaONMbu8+2QZY/o08/1mA72Bq4GfvGWnAmcBX7dj3M12qN9nJ7jdGNPkFYYKfHoFEFiOAd41xuw3xniMMRuMMbPr7fNP4JH2OJMUkV4i8omI5IrIZhG5sdZzD4vI+yLylvcsfq2IpLb0GMYYtzFmLXAJkA3c3Yo4TwMmAucaY5YZYyq8jy+NMXc18ZpwEXlORLK8j+dEJNz73HoRObvWvqEiki0io73bx4nIEhHJE5GfRWR8S2P2vs/DIvKOd3Ox998879n68Y3sP0RE5nt/HxtF5OJWHne8iGSKyN0isldEdonIdbWejxCR/xWRbSKSLyLfi0iE97kp3t91nvcqY2it140SkVXev4dZgKv+MWttZ4jIPSKy2nuMWSJSe//7vHFlicgN9a4gzxSRdd7j7BSRe1rzPQQDTQCB5UfgMRG5TkQGNbHPh0ABcG07HO+/QCbWVcWFwOMicmqt56d494kDPgGmtfZAxpgq4GPgpFa8/DRgmTEm85B7HvAgcByQAowExgB/9j73HnBZrX0nATnGmFUi0hv4HPg7kADcA8wRkaRWxF3byd5/44wx0caYpbWfFJEoYD7WlV03rKu8l0XkyFYerwfQBeuq6f8BL4lIvPe5p4GjgROwPuN9gEdEjsD6bv4AJAHzgE+9zW9hwEfA297XfAD87hAxXAxMBgYAI/D+zYrIZOCPWL/Xw4Hx9V73OvB7Y0wM1hXwwhZ/+iChCSCw3AHMBG4H1nnPys+ot48B/gL8xfufsjE53jO46sfQ+juISF9gLPAnY0yZMSYd+DdWE0u1740x87yV99tYFWlbZGFVHtV61YuzqTPeRGB3rdgTvPvni0hZE6+5AnjUGLPXGJON1QR1lfe5d4EpIhLp3b4cq+IDuBKY5/3cHmPMfCANOLOlH7aFzgYyjDFvGGMqjTE/AXOAiw7ymhfqfX9/q/WcG+vzu40x84AiYLBY/UfXA3cZY3YaY6qMMUuMMeVYV2mfG2PmG2PcWIkiAitRHAc4gee87zkbWHGIz/SCMSbLGJMLfIqVjMFKDG8YY9YaY0qAh+u9zg0cKSKx3qvhVYc4TtDSBBBAjDGl3o7Do4GuwPvAByKSUG+/eVhn7r9v4q0SjTFxtR7rG9mnF5BrjCmsVbYN64yx2u5aP5cArjY2PfUGcmttZ9WL831o0LmaDOwDajqQjTG5xpg4rLPY8CaO1cv7eapt85ZhjNkMrAfO8SaBKRzoU+kHXFS7YgVOrH38DtIPOLbeca/AOpNvyp31vr+/1HpuX71+oBIgGiuZuoAtjbxfne/MGOMBdmD93noBO03d2Se3cXD1/36q+0x6ed+3Wu2fwbqyOBPYJiLfNtZcpiyaAAKUMaYAeByrU3hAI7s8CEwFIht5rjmygAQRialVlgzsbOX7HZT3zPMc4LtD7ettIql+bAf+DzhGRJrbYQzW5+tXazvZW1atuhnoXGCdNymAVRm9Xa9ijTLGPNGCYzfmUNP27gC+rXfcaGPMLW08bn05QBkwsJHn6nxnIiJAX6y/iV1Ab29ZteRWxrALqP277Fv7SWPMCmPMuVhNYR9hnQipRmgCCCAi8hcROcbb5uoC7gLygAbjq40x3wBrgGtacyxjzA5gCfAPEXGJyAistuJ3Dv7KlvF2sA7FqnB7AM+0ItavgUXARyJyrPf7cWI1SzTlPeDPIpIkIonAQ9T9bP8FTgduoe6IqnewrgwmiYjD+92Mb2HyaUw24AEOa+L5z4AjROQqEXF6H8c01nzXFt6z+v8Az4g1CMAhIseL1UH+PnCWiEzwfr93A+VYfydLgUrgTm9sF2D1q7TG+8B1IjLUewVWc+Xi/d1eISJdvM1QBVjfm2qEJoDAYoA3sM7SsrBGvpxljClqYv8/U7dNvVr1SJPqxx+beP1lQH/vseYCf23HYYyXiEgR1nDWT7CacY42xmQd/GVNOh+rknwHKyn+htVEMqmJ/f+O1Xa/GvgFWOUtA8A7FHUpVvv2rFrlO7CuCqZiVdo7gHtp4/81b1v3Y8AP3iae4+o9X4iVkC7F+n3sBp6k6SYugGn1fs8rmxnOPVjfyQqsJrkngRBjzEasPpAXsf4GzwHOqR51BVyA1ZGbi9Vf8GEzj1eHMeYL4AWspL4Za/ADWMkGrL6aDBEpAG7G+j2rRoguCKOU8mfeq5w1QLgP3L/iV/QKQCnld0TkfLHu1YjHugL5VCv/ltMEoJTyR78H9mKNRqrC6otRLaRNQEopFaT0CkAppYKUX00Gl5iYaPr37293GEop5VdWrlyZY4xpMB2JXyWA/v37k5aWZncYSinlV0Sk0buutQlIKaWClCYApZQKUpoAlFIqSPlVH4AKXG63m8zMTMrKmpqdWXUkl8tFnz59cDqddoeiOpEmAOUTMjMziYmJoX///tSdMFJ1NGMM+/btIzMzkwEDGps4VgUqbQJSPqGsrIyuXbtq5W8DEaFr16569VVfWT4U7oaS3EPv66c0ASifoZW/ffS7r6cgCz78Pbw4Gv57OezbAgE4a4LtCcA7n/hPIvKZ3bEopRTFOTD39/DrF1BRDNuXwtvnQXG23ZG1O9sTANaiJY0tOahUp4qOjj7o8xkZGQwbNqxF73nttdcye/bstoSlOlt5Ify2uG5Z3nYrGQQYWxOAd5Wks7AWE1dKKXsVZUPOrxDfv265MxKcLltC6kh2XwE8B9zHQZZsE5GbRCRNRNKyswPvEk
z5nqKiIiZMmMDo0aMZPnw4H3/8cc1zlZWVXHHFFQwdOpQLL7yQkpISAFauXMm4ceM4+uijmTRpErt27bIrfNUWpgqWvQqTn4Aw7xWhIwymvACuOHtj6wC2JQARORvYa4w56DJ0xpjpxphUY0xqUlKDuYyUancul4u5c+eyatUqFi1axN133031tOkbN27k1ltvZf369cTGxvLyyy/jdru54447mD17NitXruT666/nwQcftPlTqFYJj4X4frDqTbjiA7jmU7j+S+h/Mjgj7I6u3dl5H8BYYIqInAm4gFgReccYc6WNMSmFMYapU6eyePFiQkJC2LlzJ3v27AGgb9++jB07FoArr7ySF154gcmTJ7NmzRomTpwIQFVVFT179rQtftUGYZFwylRY+jJ8eid0OxIm/g2iu9kdWYewLQEYYx4AHgAQkfHAPVr5K18wc+ZMsrOzWblyJU6nk/79+9eMka8/XFJEMMZw1FFHsXTpUjvCVe0tKgnG3w/H3WKd9YfH2B1Rh7G7D0Apn5Ofn0+3bt1wOp0sWrSIbdsOzKS7ffv2mor+3Xff5cQTT2Tw4MFkZ2fXlLvdbtauXWtL7KqdhIZbZ/0BXPmDjyQAY8w3xpiz7Y5DKYArrriCtLQ0hg8fzltvvcWQIUNqnhs8eDAvvfQSQ4cOZf/+/dxyyy2EhYUxe/Zs/vSnPzFy5EhSUlJYsmSJjZ9AqebxqzWBU1NTjS4IE5jWr1/P0KFD7Q4jqOnvIHCJyEpjTGr9cp+4AlBKKdX5NAEopVSQ0gSglFJBShOAUkoFKU0ASikVpDQBKKVUkNIEoFQTHn74YZ5++mkAHnroIRYsWNDkvjNmzOD2229vUN7S6aCzsrK48MILAUhPT2fevHktjFqp5tM1gZVqhkcffbRTjtOrV6+ahJGenk5aWhpnnnlmpxxbBR+9AlB+6aOfdjL2iYUMuP9zxj6xkI9+2tnm93zrrbcYMWIEI0eO5KqrrqrzXO0z+RUrVnDCCScwcuRIxowZQ2FhYZ19P//8c44//nhycnIAWLBgAampqRxxxBF89pm18F1GRgYnnXQSo0ePZvTo0TV3DlcvOlNRUcFDDz3ErFmzSElJYdasWW3+fErVp1cAyu989NNOHvjwF0rdVQDszCvlgQ9/AeC8Ub1b9Z5r167l73//O0uWLCExMZHc3FxeeOGFBvtVVFRwySWXMGvWLI455hgKCgqIiDgwTfDcuXN55plnmDdvHvHx8YBVqS9fvpwtW7ZwyimnsHnzZrp168b8+fNxuVxs2rSJyy67jNp3uYeFhfHoo4+SlpbGtGnTWvWZlDoUTQDK7zz11caayr9aqbuKp77a2OoEsHDhQi666CISExMBSEhIaHS/jRs30rNnT4455hgAYmNj67xHWloaX3/9dZ3yiy++mJCQEAYNGsRhhx3Ghg0bGDBgALfffjvp6ek4HA5+/fXXVsWtVFtoE5DyO1l5pS0q7ywDBw6ksLCwQWXe2BTSzz77LN27d+fnn38mLS2NioqKzgxVKUATgPJDveIaX5mpqfLmOPXUU/nggw/Yt28fALm5uY3uN3jwYHbt2sWKFSsAKCwspLKyEoB+/foxZ84crr766jrTQX/wwQd4PB62bNnC1q1bGTx4MPn5+fTs2ZOQkBDefvttqqqqGhwrJiamQf+CUu1JE4DyO/dOGkyE01GnLMLp4N5Jg1v9nkcddRQPPvgg48aNY+TIkfzxj39sdL+wsDBmzZrFHXfcwciRI5k4cWLNYjEAQ4YMYebMmVx00UVs2bIFgOTkZMaMGcMZZ5zBK6+8gsvl4tZbb+XNN99k5MiRbNiwgaioqAbHOuWUU1i3bp12AqsOo9NBK5/Q0qmIP/ppJ099tZGsvFJ6xUVw76TBrW7/VxadDjpwNTUdtHYCK7903qjeWuEr1UbaBKSUUkHKtgQgIi4RWS4iP4vIWhF5xK5YlFIqGNnZBFQOnGqMKRIRJ/C9iHxhjPnRxpiUUipo2JYAjNX7XOTddHof/tMjrZRSfs7WPgARcYhIOrAXmG+MWdbIPjeJSJqIpGVnZ3d+kEopFaBsTQDGmCpjTArQBxgjIsMa2We6MSbVGJOalJTU+UEq1YgZM2aQlZXVotfUnlDuhhtuYN26dU3uW3sq6trGjx9PS4ZCp6WlceeddwLwzTff1Ew6pxT4yDBQY0yeiCwCJgNr7I5HqUOZMWMGw4YNo1evXq16/b///e92jqhxqamppKZaw7+/+eYboqOjOeGEEzrl2Mr32TkKKElE4rw/RwATgQ12xaP8zOr34dlh8HCc9e/q99v0dhkZGQwdOpQbb7yRo446itNPP53S0lLS09M57rjjGDFiBOeffz779+9n9uzZpKWlccUVV5CSkkJpacM5iJ588kmGDx/OyJEjuf/++xs8X/tM/ssvv2T06NGMHDmSCRMmNNj3tdde44wzzqg5zttvv01KSgrDhg1j+fLlACxfvpzjjz+eUaNGccIJJ7Bx40bAqvTPPvtsMjIyeOWVV3j22WdJSUnhu+++a9P3pQKDnVcAPYE3RcSBlYjeN8Z8ZmM8yl+sfh8+vRPc3oo3f4e1DTDi4la/7aZNm3jvvfd47bXXuPjii5kzZw7//Oc/efHFFxk3bhwPPfQQjzzyCM899xzTpk3j6aefrjm7ru2LL77g448/ZtmyZURGRjY5rxBAdnY2N954I4sXL2bAgAEN9p02bRrz58/no48+Ijw8HICSkhLS09NZvHgx119/PWvWrGHIkCF89913hIaGsmDBAqZOncqcOXNq3qd///7cfPPNREdHc88997T6O1KBxc5RQKuBUXYdX7WQMVC0B3alQ2gEdBsK0d3sieX/Hj1Q+Vdzl1rlbUgAAwYMICUlBYCjjz6aLVu2kJeXx7hx4wC45ppruOiiiw75PgsWLOC6664jMjISaHpqaYAff/yRk08+mQEDBjTY96233qJv37589NFHOJ3OmvLLLrsMgJNPPpmCggLy8vIoLCzkmmuuYdOmTYgIbre7hZ9eBSO9E1g1T0EWvDIW3r0E3poCM86Cor32xJKf2bLyZqo+wwZwOBzk5eU163XLli0jJSWFlJQUPvnkkzbFUNvw4cPJyMggM7Pu52pseum//OUvnHLKKaxZs4ZPP/20zgR1SjVFE4A6NE8lLHsVinMOlOX8Cr/Z1I7cpU/Lylt7mC5diI+Pr2kvf/vtt2uuBmpP1XzssceSnp5Oeno6U6ZMYeLEibzxxhuUlJQATU8tDXDcccexePFifvvttwb7jho1ildffZUpU6bUGXFUPTPo999/T5cuXejSpQv5+fn07m3NjTRjxoxGj6XTS6v6NAGoQ/NUQUEjZ9cFLRsG2W4mPATOenP/OyOs8nb25ptvcu+99zJixAjS09N56CHrGNdeey0333xzo53AkydPZsqUKaSmppKSktLocM5qSUlJTJ8+nQsuuICRI0dyySWX1Hn+xBNP5Omnn+ass86qWWPY5XIxatQobr75Zl5//XUA7rvvPh544AFGjRpVsz5Bfeeccw5z587VTmBVQ
6eDVs2zYzm8PvHAtsMJt6dBfP92efsWT0W8+n2rzT8/0zrzn/BQm9r/lU4HHch0OmjVNkmD4eqP4bv/tTqBT/0zRHe3L54RF2uFr5pWtAfWfWI1VY66EuIHgCv20K8LMpoAVPO4usBh46FnCogDXDF2R6RU44r2woyzrcofYPl0uPwDOOJ0e+PyQdoHoFomIq7DKn9/ao4MNAH13ednHqj8q33zeN1BDArQBKB8hMvlYt++fYFVEfkJYwz79u3D5XLZHYrqZNoEpHxCnz59yMzMRGd8tYfL5aJPn/YdRmubLn0g8Yi6VwHjp0JUon0x+ShNAMonOJ3OmrthlWqT6G5w7Wew/lPI3gijrmq30WqBRhOAUirwRHeHY26wOwqfpwlAKdU+KoqhNM+6QbBLL3DFQViU3VFByT7I3Qo7VkD/EyGuL0TE2x2VT9AEoJRqu8oK2Px/MPta687xEAdc9CYMmgShYfbFVV4I3z8LS148UDbpMUi9AZza6a2jgJRSbVeaC5/cYVX+YP37yR1WuZ3KC+HHl+uWLfoHlDVvor9ApwlAKdV2HnfDSrV0vzWRoJ2M50BSqlZZak1vrjQBKKXaQWgE9B5dt6xPKoTa3MzijITDT6tbNuIyCIu2Jx4fo30ASqm2i0qES96BLx+EHUuh7/Ew+XH7x95HJsB5r8Dq/8Jvi+GIyXDkuTqViZfOBqqUaj/lhdZooLAoCPehStZTZa0a54yEkOBr+GhqNlA7F4XvKyKLRGSdiKwVkbvsikUp1U7CYyCmh29V/mCNSgqPDsrK/2DsbAKqBO42xqwSkRhgpYjMN8asszEmFWhK9kGV26qQfGFMulI+xLZ0aIzZZYxZ5f25EFgP9LYrHhVgqqpg73p47zL41/HwxZ/sW8NYKR/lE53AItIfGAUsa+S5m4CbAJKTkzs1LuXHSrJhxplQ4h2H/tPb1pDAM5/SKwGlvGxvEBORaGAO8AdjTEH9540x040xqcaY1KSkpM4PUPmn0v0HKv9qGz6zOimVUoDNCUBEnFiV/0xjzId2xqICTHgsSL0/764DIcQnLnqV8gl2jgIS4HVgvTHmGbviUB2vsspDdmEZOYXleDydNOw4PAZOewRErG1XFzjnRfvHpSvlQ+w8HRoLXAX8IiLp3rKpxph5Nsak2tn+4go+Ts9i+uIthDsd3Dd5MCcenkiMy9mxB3bFwtHXwrALrBkqoxIhUit/pWqzLQEYY74HxK7jq86RviOPhz9dW7N9yzur+OoPJzO4RwcnALCSgCvWWiFKKdWA7Z3AKnCVu6uYvTKzQfnX63bbEI1Sqj5NAKrDOB0hHNUrtkH50J4Ny5RSnU8TgOowISHCRal9GNz9wLQAJw1KZFTfOBujUkpV0zFxqkMlxbiYeeOx5JW4CQ0RYiOcJETZuEKUUm1VtBeKc6whxZEJfj2yTBOA6nCJ0eEkRofbHYbvMQaKs61/I+LtXTpRNU/hHnjrHMjeaG33G2stfRntnzepahOQjyl3V7G3oIz80gq7Q1EdqbwINi+AGWfB9JNh6TRr4jrluzweSH/nQOUPsO0H2L7UvpjaSBOAD9lXVM7TX//K+S8v4baZq9iyt4iqzrpxSnWu4r3w7kWQ8ysU7ob/e8RKCMp3mUrYs7Zh+V7/ncBYE4CPKHVXMW3hZl77bis780r5fvM+LnxlCfuKyymvrKKwzG13iKo9bf2m4bq0P8+CMp2ryGc5wmDUVQ3Ljzy382NpJ5oAfERhqZtPV2fVKdtf4mZvQTn//HIDt81cxbxfdrG/RJuGAkLi4IZl3Y+yfw1ddXA9U+CcFyDhMEgaApe+B7H+O4u9dgL7iFBHCH3jI8kpqlvBV3kMb/yQgcfA4k05PHbeMC4dk4wjRG+i9mtJg2HoFFj/ibXddSAcdyuEdsId0qr1IuMh5UoYPBkIsUYAif/+X9QrAB+REBXGY+cPIyrMUVN287jD+H5zDrW7Af7zQ4ZeBQSCqEQ45zm4Mx1uWw7XfQmxPe2OSjWHwwHR3a2RP35c+YNeAfiUQd2jWXjPeHbmlZIYHY4xhlP/99s6+3SJCMXh5390yiuyq/VQyiZ6BeBDnA4H3WNdjE6OJzkhkujwUI4/7EAFERoiTD1rKPF6I5VSqh3oFYAP6xodznOXprBpTyEZ+0o48fBEEmO08lcq4FRVWcuY7l1nXRXG9u6UO4w1Afi46rtojx9odyRKqQ6TlwGvnQJl+db24afB+a92eBLQJiCllLJTeREseORA5Q/WTYF52zr80HoFoFSgKdoLO1dBYZZ1JhnVDZx6f0G7c5dB0R5Y+5E1KdygiRDTo+XvU1UOhTsblhfuaXuMh6AJQKlAUrQX3rkAdv9ibTvC4MZF0GOYvXEForxt8OpJUFlubcclww0LrCGiLRGRAEdfB5lpB8qckdArpf1ibYKtTUAi8h8R2Ssia+yMQ6mAsW/LgcofoKrCmmeorMC+mAJRRQl8+88DlT9A3nbYsazl7yUCg8+07jDuORIOnwg3LoSojp9h1O4rgBnANOAtm+NQKjCUNzKXUHkheKo6Pxawjlu8F3atBlecNYWCn06dXJeBytKGxe7yhmXNEZlgzTM05CxwOMHVpW3hNZOtCcAYs1hE+tsZg1K2K9kH7lIQB0TEgTOi9e/Vc4RVmZTkHigbe6c1hYEd8nfAq+OgLM/a7jUaLp8F0d3siae9hEXBSXfDxnkHJvWLiIf+J7b+PUNCOn1xGbuvAA5JRG4CbgJITk62ORql2lnhHphzA2Qsttp9T3sERlxsJYLWiOoGNy2GH56Dgp1w7M3WBGZ2cJfC4qcOVP4AWatgzzr/TwBgTej3++9g6UtWxX3szX73uXw+ARhjpgPTAVJTU3VyfBU43GXww/NW5Q/gLoEv7oXDT219AggJgbi+MOkf4HFbZ6p2qXJDwa6G5YWNlPmj8GjoMRymvAgSAiGOQ7/Gx+h9AErZpaLIWlGqvj3tsMBIaJi9lT+AKxbG3FS3LNQFA06yJ56O4nD6ZeUPmgCUsk94DAwY17C8ewAN2eyVAhf+B/oeC0dMtoakdsLoFtU8tjYBich7wHggUUQygb8aY163MyalOk1oOJxwO2Svh01fWwnh9Mc7vSOwwxTnwNxboKrMGt3iLgWM9bmVT7B7FNBldh5fKdtFd4MLpntHAYVYI0kCpYIs2gNbF1o/b1ti/fvbYrhkpn2jklQdPt8JrFTAi4i3HoGmsZvPSnPBU9n5sahGaR+AUqpjJBzWsDnrmBut+xSUT9ArAKVUx4hKsjp9v3kS9v8Go6+xJkzz0xEzgUgTgFKqY4SEWBOknfW0NWdOa+9tUB1GE4BSqmM5I9o2vYXqMJoAlFK+w1NlDR/1VFqjoQJlSKyP0gSg/FtFsbWikgCRidq+7M8qK2DnSphzPRRkWVMjX/KO1YykOoSOAvIB+4sr2JlXyq78UgrL3HaH4z+Ks+HLB+D54fDaqdYyehXFdkelWqs0F9692Kr8AXb9DHNvrjuzqWpXTSYAEZmnUzV3vJyicv4w6yfGPrGQsU8s5Jn5v5JbXNHp
MezILWF3fhmlFTbNG99SVZWQ9gasetPqYMzPhPcutaZWVv6pvAjK6907sH2pNamc6hAHuwJ4A/haRB4UEWdnBRRMPB7Dx+k7+fbXHGvbwBs/ZLA1u6jTYsjKK+WK15Zx0j8XMe6pRXywcod/XIWU5cOGz+uWGY911qj8U3h0wwnseh8NDm2p7ihNJgBjzAfAaCAWSBORe0Tkj9WPToswgJVXVrF0S8Mz1pXb9nfK8YvK3fxj3no27in0xuPhr5+sJa/EDxKAM8Ja/KS+roM6PxbVPlxxcPHbB+6K7joQzn8VIrvaG1cAO1RqrQCKgXAgBvB0eERBxOV0MPHIHixYv7dO+djDO2fkQ2mFh58z8+uUGQM780rpmxDZKTG0WlgkjL8ftv8IOb9a8+iccKffLcihanG6YMDJcOuP1lrGoS79fXawJhOAiEwGngE+AUYbY0o6LaogISJMPLI7P2cm80HaDsJDHfzhtEH0ie+cMdPR4aGcNCiRmcu215SFhgj9fL3yrxbbG6793Or4dYRZs2m6Yu2OSrWFwx1APnkAABdBSURBVAkxPeyOImgc7ArgQeAiY8zazgomGCVEhfHgmUO5a4LVdBEX4STc2b5DGas8hn1F5WTll9IlIoz4SCdxkWFEhFkJZ3d+GQs37iUpOpwnfjeCLhF+1OWjZ4hKtVqTCcAYE2DL9viuqPBQosI7rqNre24JF7z8A/u9bfsXp/bhgTOGEh8VRlKMi2cvSaHUXYVDhPhIJw6Hjg5WKhgE/P/0glI3GTnFfL46i63ZRRSU+kEHZzsqLHPz2Ofraip/gPfTMskpKq/ZdoQIxsDugjJyiisoc/vJUFClVJsE9PiqcncVn63exdS5v9SUPXT2kVw2JpmIsOC4Y7S80sP23IbdN3sKyhjUPYYydxULN+zlf2alU+kxhIeG8NrVqYwd2FWvBJQKcAH9Pzyv1M3j89bXKfvnVxso8Idx7u0kLsLJ+aN61ymLcDoY1D0GgPxSN/fPWU2lxwBWwrj7g5/ZV9K5N6MppTqf3WsCTwaeBxzAv40xT7Tn+xtjKCqvu/pQmdtDlbeyCwahjhAuOaYv5ZUeZq/MpEesi0fPPYr4SKujt6LSQ3G9u3+zC8vx6IBfpQKebQlARBzAS8BEIBNYISKfGGPWtdcxXE4Hpw5JYuGG7Jqy4wd2JaKdR9n4uoSocG4dfziXj0nG6QghPiqs5rkIp4NB3aLZtPfA3cdjBiQQHhrQF4dKKextAhoDbDbGbDXGVAD/Bc5tzwPERYbx5O9Gcuv4gYzo04WbTj6MFy5NqVMBBouw0BC6xboafPbEmHDeuO4YTh2SREJUGGcO78ELl44Kyu9IqWAjxtjTHCIiFwKTjTE3eLevAo41xtxeb7+bgJsAkpOTj962bVuLj+Wu9FBYXkl0eChhembbqIJSN2XuKiLDHES7/Og+AKXUIYnISmNMav1yn68NjTHTjTGpxpjUpKSkVr2HMzSEhKgwrfwPIjbCSbdYl1b+SgURO2vEnUDfWtt9vGVKKaU6gZ0JYAUwSEQGiEgYcCnWvENKKaU6gW2jgIwxlSJyO/AV1jDQ/+i8Q0p1IE8VFO2FHcutifN6DNO5lIKcrfcBGGPmAfPsjEGpoJGfCa+eDGV51nbSELjmU00CQUx7RZUKBpUV8MNzByp/gOwNsGOZfTGpQyvaYy19uvBvkLMJytt3zeuAngtI2aOyykN+qZvIMAcRYfon5hM8lVbzT33F2Q3LlG8o2gOvT4L9v1nb3z0DNyywlslsJ3oFoNrVvqJyXvl2C1f/Zzl/+XgtWXmldoekwFpB7fjb6paFuuDwifbEow5tz/oDlT9Ya14vegzKCtrtEHp6ptpNaUUlz//fJt5aat2stzargBUZucy++QSSYsJtjk7R/Si4+mP44XmrE/iUqdr+78s8lY2UVVmJoJ1oAlDtprCskjkrM+uUbdtXQnF5ZdAkgOLySgrLKqnyeIgICyXBl6bUcHWBw8ZDr9EQ4oCwKLsjUgfTYzjE9ITCXQfKTr4PIuLa7RCaAFS7ERGSYsIp3ldSq4yguQM7r6SCGUsyeGnRZtxVhuMOS+DFy0b7XvLTdZN9R1UVVBSCMxJC650sxHSHGxfBT29bI7jG3Ajx/dv18MHxPzMIGWPILizjt5xidueXUlTe8WsgJEaH8ffzhuEIkZqy68cOILoDl7v0Jbvyy3huwSbcVdb8Wj9uzeXNpRlUVOrc2qoRxTnw40vw38vh2yetTt/6YnvCuPvg7OesK4LwmHYNITj+Zwah7bklXP7aMnbmleIIEe6dNJjLxyQT24ELvosIo/vF8919p7BmZz79E6PoFhPeocf0JWt25jcoW/5bLqUVVUFzFaSaqawQvv4z/Pyetb3tB9j2PVzyLkR1bbh/SMf8/WgCCEAFpW4e+XQtO70jcKo8hie+2MDZI3p2eGUcGRZKZFgoveIiOvQ4ncFd5WF/cQUVVR5coQ4SD9GUMzo5vkHZKYO7ERUeXOtPqGZwF8Ev79ct2/4juIuBRhJAB9HTkgBUXlnFxt1FDcp355fZEI1/KndXsWzrPiY9t5gTn1zExa8uJSPn4DfhJMWE8/j5w4h1hRIiMGVkTy5O7UOorq2sGhAIr9cXExJqPTqR/mUGoOhwJ6cOqTu8Lzw0hD4JkTZF5H/ySt38/u2V7C+x+k625hTzP7PSyS1ueq3k2AgnFx7dlwV/HMfSBybw2PnD6RrtYx3AyjdEJMDEv9UtG3tXu7fxH4o2AQWgiDAHd00YRGGZm3m/7KZvQgRP/m4E8UHSFt8eSioqG6yVnJ6ZR+UhFkuuXnlNqYMKDYMjp0DysdbkfD1HQmxvTQCqfSTGhPP384Yx9ayhhCCHbL9WdUWGhRITHkph+YGbcVL7xePsoM44FYRcXaxH4hG2haAJIIBFu5y6wlcrJUQ6mXvbWHbuL+XHrftIy8jlmYuDcz1pFbg0AShVT2WVh83Zxdw/ZzW/5RQz8cjuTLt8NN27aNOOCiyaAJSqJ7e4gkumL6Wg1Gr+mbNqJ44Q4eEpRxGps5uqAKINmkrVs6+4oqbyrzZ/3R6KyhqZnEsdmscD7hIwxu5IVD16OqNUPV0inIjUra8OS4rGqXfztlxRNvzyAWQshiFnwxGTISrR7qiUly1/0SJykYisFRGPiKTaEYNSTYlxhXL/GUOontIoPtLJPy4YTnykdgC3SMk+mHsTfPUAbPwCPr4NvvkHlDe8SVHZw64rgDXABcCrNh1fqSbFuJxcPiaZKSN7UVRWSZdIJ1218m+5ihLYsrBu2aq34OR7ITzanphUHbYkAGPMerAmD1NNyyupICuvjB8255CSHMfApGjfml8+gMW4nMS4nNDF7kj8mIRYj9oLmITqSCpf4vN9ACJyE3ATQHJyss3RdJ6KSg8fp2fx10/W1pRdeVwy900aEjSzayo/Fx4Nqf8PVrx2oGz8VIhoOGmeskeHJQARWQD0aOSpB40xHzf3fYwx04HpAKmpqUEzjKCo3E369v2EOUKoqLLOoN5dtp3bxh+uCeAgjDGUV3o
IDw3RK0y7ubrA+Adg2AWwYwUcNg7i+0Go3pXuKzosARhjTuuo9w5k+aVuft1TyDs/bqNXlwje//1x3P3BarZkF+Ex4NGhdE3aV1TOV2t38+2vOUwY2o0JQ7vRNUorG1tFdYWoE6DfCXZHohrh801Adqus8pBTVMHiTdlEh4dyTP94kmI6ph3TGMOSzTncMnNVTdmnq7N44oIRXPn6Mk4Y2JUIvRGpUfklFTz08Ro+/2U3AF+t3c2FR/fhr+ccabXl+4CCUjfF5ZWUVXqIDnd02N+RUs1lS20iIucDLwJJwOcikm6MmWRHLIeSlV/Gmc9/R5F3UrDkhEjm3HJ8h/znzS2p4KVvNtcpy9xfirvKw9/OPYozhvXUTuAmlFRUMW/N7jplc3/ayT2nD/aJBJBf4ua177fy0qLNGAN94iOYddPx9I73/4VzlP+y5T4AY8xcY0wfY0y4Maa7r1b+FZUe/vXNlprKH6ylFn/cmtshxwtBCGtk8ZCEqDAuP7afzuh5EB5jcNRr8w8NEXylGyC3pJxpCzfX3FyWub+UJ75cX+dvS6nOprc2HoTHGApKGy4Akl/aMQusx0eFcd/kIXUqrSN7xtI7PqLOQuuqofW7CrjiuH51ym44aQCxPnD2D7Ajt7RB2fpdhZTVW3NAqc6kDcoH4XI6uOnkgTXtygARTkeD1bba0/DeXVjwP+P4bHUW/ROjGDswkURdVeqQVm7fz/DesfzrytGs3pHPqOQ44iPDiAjzjfV4B3WLxukQ3FUHOvEnHdWdLjqiS9lIjB+NKklNTTVpaWmdeszCMjeb9xbxyrdbiXWFctsph9M73oXT4RsVi7LsyC1h4rPfkhQTzsDEaHbmlTLjujE+08ZeWlFJ+o48ps5dw+78Mqak9OK+SYN1yUjVKURkpTGmwbQ7mgCaqbSikhARwp1a8fuiisoq9hSU8+7y7YQIXD6mH91iw3H60ILsxhhyiiowGKLDQ3VqadVpmkoA+hfYTDr80reFhTromxDJnyYP6bRj5pe6CQ8NwdXMkwIRIUk78pUP0VpNqRbKK6ng21+z+e/yHfSJj+CuCYPoFRdBiHbUKz+jCUCpFvB4DF+u2c39H/5SU7Zww16++MNJdNMbu5Sf8Z0GUqX8QG5JBW8uzahTtq+4gq3ZxbbEo1RbaAJQqgWcIUJcRMO7sXWCPuWPNAEo1QJdIsN48Kyhde7YHnt4V7pr567yQ9oHoAJelcewr7ic0ooqXE4H8ZFOwkJbP5x3UPdovrl3PMt/y6V3fASHJUbpeH7llzQBqIC3eW8RV/9nGXsKyokJD2XaFaM4bkDXVt/TER7qoFdcBOeN6t3OkSrVubQJSAW0fUXl3PHeKvYUlANQWF7JbTN/Iq8Z8zlZU4GXU1jWMXM/KWU3vQJQAa3SY/h1T1GdsqLySkrdB5+ELbe4nFkrMvnop50kd43gwTOPJDkhUsf6q4CiVwAqoIU5QhidXHcN2sToMCIPMkmcu9LDW0u38eSXG9i4p5D56/Zywb+WkFNU3tHhKtWpNAGogBYfFcYLl6UwOjkOgIFJUbx5/RgSIpteWCevtILZKzPrlOUWV5CVX9ahsSrV2bQJSAW8PvGR/PuaVNxVBkeIHHJ67dCQELrHusjcX3cO/zgd6+/fqiqhJAcqy8AZAZGJEBLckzvacgUgIk+JyAYRWS0ic0Ukzo44VPBIiAqne6yr0cq/qKySrdlFvP7dVr7duBcReHTKUbicB/57XJzah7hITQB+q6oKsn6CV06E50fCq+Ngz1rwo9mQO4It00GLyOnAQmNMpYg8CWCM+dOhXmfndNAqcH2zcS/XzVhRUxecNCiR5y9NwV1l+HVPIT28iSNe12P2X4W74dWToGjvgbL4/vD/vobo7raF1Vl8ajpoY8zXtTZ/BC60Iw6lcorKeXze+jongt9tyiGvxM1hSdF0j9UJ3gJCZXndyh9gf4bVLBTEfKET+HrgC7uDUMHJGENJI+vyVlR5bIhGdZhQF3TpW7csaTA4grtZr8MSgIgsEJE1jTzOrbXPg0AlMPMg73OTiKSJSFp2dnZHhauCVHxUGDeceFidsgGJUXSN0qkdAkpUElw+C7oOtLa7DYVLZkJ0x63v7Q9sWxJSRK4Ffg9MMMaUNOc12gegOsL+kgpW/JbLByszGdIjhiuP66dNP4GqaC943BASBtFJdkfTaXyqD0BEJgP3AeOaW/kr1VHiI8M4/agenDQoEWdoCKEhvtAyWo+7FMoKQNDhi20R5Gf89dn1lz4NiAHmi0i6iLxiUxxK1YgIC/XNyr84BxY+Bi+OhtcmwMYvoLzQ7qhUALBrFNDhdhxXKb/j8cDaj2Dpi9Z2RRG8fyXcvgrCY+yNTfk9HzzdUUrVKC+AdXPrlhkD25faE48KKJoAlPJlzgjomdKwvNvQzo9FBRxNAEr5stBwOOH2uhX+6Gsgrp99MamAoZPBKeXrYnrC1Z9ARbF141JYNETo9FltVuUO+hvBNAEo5Q90+GL7KdoLa+ZC5jIYcTH0GQORCXZHZQtNAEqp4FGcA7OuhB3LrO01c2DCw3D8rVZzW5DRPgClVPCoKDpQ+Vdb8jyU5tkTj800ASilgoc0UuU5wqw7rIOQJgClVPAIi4bBZ9QtO+XPENHVnnhspn0ASqngEZkAU16EzBWwcxUMOdtaGMYRnFVhcH5qpVTwikqCwWdajyCnTUBKKRWkNAEopVSQ0gSglFJBShOAUkoFKU0ASikVpDQBKKVUkNIEEIQKSt0UlrntDkMpZTO7FoX/G3Au4AH2AtcaY7LsiCWYFJW5WZtVwIsLNxPqEO6eeAQDu0UTGaa3gygVjOy6AnjKGDPCGJMCfAY8ZFMcQSVjXwmXTP+R7zfn8M3GbM57eQl7C8rtDkspZRNbEoAxpqDWZhRg7IgjmFR6PLy5JKNOWZXH8FH6TnsCUm1TnAOFe6Bkv92RKD9m27W/iDwGXA3kA6ccZL+bgJsAkpOTOye4ABSC0DMuokF5jy4uG6JRreapgpxN8OENsGcN9DsRzn8FuvSxOzLlhzrsCkBEFojImkYe5wIYYx40xvQFZgK3N/U+xpjpxphUY0xqUlJSR4Ub8EJChMvHJJMUc2DRi35dIzl1iK405VeKc+Dt82D3L2AMZHwHH94IJbl2R6b8UIddARhjTmvmrjOBecBfOyoWZekeG87nd57Imp0FOB3CkB6xdRKC8gPuEijcVbds2xKo0r4c1XJ2jQIaZIzZ5N08F9hgRxzBRkToFuPi1CHa7OO3Ql3WnPYVRQfKEgeBOOyLSfktu0YBPeFtDloNnA7cZVMcyk9VVFaxp6CMrLxScosr7A6n80TGwwWvgTPSu90Vfve6LhqvWsWWKwBjzO/sOK4KDCUVlSz+NZs/zfmF/FI3o5PjefmK0cHRoR3qgoGnwh2rrOagsGgrCSjVCnonsPI7+aVubnv3J/JLrbuZV23fz+Pz1lFcXmlzZJ3E6YLYntB1IMR0D9rVrFTbaQJQficrr4wqT91bR5b/tj94EoBS7UQTgPI7veJcOEKkTtkx/eOJDN
eOUKVaQhOA8jtdXE6evzSFWJfV9DGyTxemnjWU6HCnzZEp5V+08VD5ncjwUE4/sgfH9E+gymNwhYaQEK33MyjVUpoAlF8KCw2he2wQjPpRqgNpE5BSSgUpTQBKKRWkNAEopVSQ0gSglFJBShOAUkoFKU0ASikVpMQY/1mNUUSygW02HT4RyLHp2L5Ov5vG6ffSNP1umtYR300/Y0yDFbX8KgHYSUTSjDGpdsfhi/S7aZx+L03T76ZpnfndaBOQUkoFKU0ASikVpDQBNN90uwPwYfrdNE6/l6bpd9O0TvtutA9AKaWClF4BKKVUkNIEoJRSQUoTQAuJyN0iYkQk0e5YfIWIPCUiG0RktYjMFZE4u2Oym4hMFpGNIrJZRO63Ox5fISJ9RWSRiKwTkbUicpfdMfkaEXGIyE8i8llHH0sTQAuISF/gdGC73bH4mPnAMGPMCOBX4AGb47GViDiAl4AzgCOBy0TkSHuj8hmVwN3GmCOB44Db9Ltp4C5gfWccSBNAyzwL3Adoz3ktxpivjTHVK7L/CPSxMx4fMAbYbIzZaoypAP4LnGtzTD7BGLPLGLPK+3MhVkXX296ofIeI9AHOAv7dGcfTBNBMInIusNMY87Pdsfi464Ev7A7CZr2BHbW2M9FKrgER6Q+MApbZG4lPeQ7rJNPTGQfTJSFrEZEFQI9GnnoQmIrV/BOUDvbdGGM+9u7zINYl/szOjE35HxGJBuYAfzDGFNgdjy8QkbOBvcaYlSIyvjOOqQmgFmPMaY2Vi8hwYADws4iA1cSxSkTGGGN2d2KItmnqu6kmItcCZwMTjN5cshPoW2u7j7dMASLixKr8ZxpjPrQ7Hh8yFpgiImcCLiBWRN4xxlzZUQfUG8FaQUQygFRjjM5miDXiBXgGGGeMybY7HruJSChWZ/gErIp/BXC5MWatrYH5ALHOoN4Eco0xf7A7Hl/lvQK4xxhzdkceR/sAVHuYBsQA80UkXUResTsgO3k7xG8HvsLq5HxfK/8aY4GrgFO9fyvp3jNeZQO9AlBKqSClVwBKKRWkNAEopVSQ0gSglFJBShOAUkoFKU0ASikVpDQBKNVK3pktfxORBO92vHe7v72RKdU8mgCUaiVjzA7gX8AT3qIngOnGmAzbglKqBfQ+AKXawDutwUrgP8CNQIoxxm1vVEo1j84FpFQbGGPcInIv8CVwulb+yp9oE5BSbXcGsAsYZncgSrWEJgCl2kBEUoCJWKtb/Y+I9LQ5JKWaTROAUq3kndnyX1hz2m8HngKetjcqpZpPE4BSrXcjsN0YM9+7/TIwVETG2RiTUs2mo4CUUipI6RWAUkoFKU0ASikVpDQBKKVUkNIEoJRSQUoTgFJKBSlNAEopFaQ0ASilVJD6/0g6HtMvDC+AAAAAAElFTkSuQmCC\n", 713 | "text/plain": [ 714 | "
" 715 | ] 716 | }, 717 | "metadata": { 718 | "needs_background": "light" 719 | }, 720 | "output_type": "display_data" 721 | } 722 | ], 723 | "source": [ 724 | "from MulticoreTSNE import MulticoreTSNE as TSNE\n", 725 | "import seaborn as sns\n", 726 | "import matplotlib.pyplot as plt\n", 727 | "\n", 728 | "\n", 729 | "tsne = TSNE(n_components = 2, n_jobs= -1, verbose = 10, perplexity = 30)\n", 730 | "tsne_data = tsne.fit_transform(x_train)\n", 731 | "\n", 732 | "tsne_data = pd.DataFrame(tsne_data, columns = ['X', 'Y'])\n", 733 | "tsne_data['label'] = train.label.values\n", 734 | "\n", 735 | "sns.scatterplot(x = 'X', y = 'Y', hue = 'label', data = tsne_data)\n", 736 | "plt.title('TSNE on IDF-Glove Title Encodings')\n", 737 | "plt.show()" 738 | ] 739 | }, 740 | { 741 | "cell_type": "markdown", 742 | "metadata": { 743 | "Collapsed": "false" 744 | }, 745 | "source": [ 746 | "This time we see some seperation between the 2 classes in the 2D projection. To some extent this explains the high accuracy we are able to get with simple Log Reg." 747 | ] 748 | }, 749 | { 750 | "cell_type": "markdown", 751 | "metadata": { 752 | "Collapsed": "false" 753 | }, 754 | "source": [ 755 | "To increase performance further, we can add some hand made features. Let's try this in the next section. " 756 | ] 757 | } 758 | ], 759 | "metadata": { 760 | "kernelspec": { 761 | "display_name": "Python 3", 762 | "language": "python", 763 | "name": "python3" 764 | }, 765 | "language_info": { 766 | "codemirror_mode": { 767 | "name": "ipython", 768 | "version": 3 769 | }, 770 | "file_extension": ".py", 771 | "mimetype": "text/x-python", 772 | "name": "python", 773 | "nbconvert_exporter": "python", 774 | "pygments_lexer": "ipython3", 775 | "version": "3.7.3" 776 | } 777 | }, 778 | "nbformat": 4, 779 | "nbformat_minor": 4 780 | } 781 | -------------------------------------------------------------------------------- /notebooks/__pycache__/featurization.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anirudhshenoy/text-classification-small-datasets/0e1ceb90addd2c2ec8644de3ae8bb0b6e2ec04ab/notebooks/__pycache__/featurization.cpython-37.pyc -------------------------------------------------------------------------------- /notebooks/__pycache__/utility.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anirudhshenoy/text-classification-small-datasets/0e1ceb90addd2c2ec8644de3ae8bb0b6e2ec04ab/notebooks/__pycache__/utility.cpython-37.pyc -------------------------------------------------------------------------------- /notebooks/featurization.py: -------------------------------------------------------------------------------- 1 | from nltk import word_tokenize 2 | import re 3 | from nltk.sentiment.vader import SentimentIntensityAnalyzer 4 | import textstat 5 | import numpy as np 6 | import multiprocessing 7 | import pandas as pd 8 | import os 9 | from tqdm import tqdm_notebook 10 | from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 11 | from numpy import hstack 12 | from scipy import sparse 13 | import string 14 | from sklearn.preprocessing import Normalizer, StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler 15 | from pymagnitude import * 16 | from nltk.corpus import stopwords 17 | 18 | 19 | 20 | 21 | def starts_with_number(df): 22 | is_number = [] 23 | for title in df.title.values: 24 | if re.findall('[\d*]',' '.join(title.split()[0:2])): 25 | 
is_number.append(1) 26 | else: 27 | is_number.append(0) 28 | return np.array(is_number).reshape(-1,1) 29 | 30 | def click_bait_phrases(df): 31 | clickbait_phrases = { 32 | "A Single" : "A", 33 | "Absolutely" : "Moderately", 34 | "Amazing" : "Barely Noticeable", 35 | "Awesome" : "Probably Slightly Less Boring Than Working", 36 | "Best" : "Most Unexceptional", 37 | "Breathtaking" : "Fleetingly Inspirational", 38 | "But what happened next" : "And As You Expect It", 39 | "Can change your life" : "Will Not Change Your Life in ANY Meaningful Way", 40 | "Can't Even Handle" : "Can Totally Handle Without Any Significant Issue", 41 | "Can't Handle" : "Can Totally Handle Without Any Significant Issue", 42 | "Cannot Even Handle" : "Can Probably Totally Handle", 43 | "Doesn't want you to see" : "Doesn't Really Care If You See", 44 | "Epic" : "Mundane", 45 | "Everything You Need To Know" : "Something You Don't Need To Know", 46 | "Gasp-Worthy" : "Yawn-Worthy", 47 | "Go Viral" : "Be Overused So Much That You'll Silently Pray for the Sweet Release of Death to Make it Stop", 48 | "Greatest" : "Average", 49 | "Incredible" : "Painfully Ordinary", 50 | "Infuriate" : "Mildly Annoy", 51 | "Literally" : "Figuratively", 52 | "Mind Blowing" : "Mind-Numbingly Ordinary", 53 | "Mind-Blowing" : "Painfully Ordinary", 54 | "Mind BLOWN" : "Meh", 55 | "Mind Blown" : "Meh", 56 | "Need To Visit Before You Die" : "May Enjoy If You Get Around To It", 57 | "Nothing Could Prepare Me For" : "Does ANYONE Fucking Care About", 58 | "Of All Time" : "For Now", 59 | "Of All Time" : "Of The Last 30 Seconds", 60 | "Of All-Time" : "For Now", 61 | "OMG" : "*yawn*", 62 | "OMG" : "No One Cares. At All", 63 | "One Weird Trick" : "One Piece of Completely Anecdotal Horseshit", 64 | "Perfection" : "Mediocrity", 65 | "Priceless" : "Painfully Ordinary", 66 | "Prove" : "Suggest", 67 | "Right Now" : "Eventually", 68 | "Scientific Reasons" : "Vaguely Science-y Reasons", 69 | "Shocked" : "Vaguely Surprised", 70 | "Shocking" : "Barely Noticeable", 71 | "Simple Lessons" : "Inane Pieces of Bullshit Advice", 72 | "Stop What You're Doing" : "Bookmark Now and Later Completely Forget About", 73 | "Stop What You’re Doing" : "Bookmark Now and Later Completely Forget About", 74 | "Stop What You’re Doing" : "Bookmark Now and Later Completely Forget About", 75 | "TERRIFYING" : "MODERATELY UNCOMFORTABLE", 76 | "Terrifying" : "Thoroughly Banal", 77 | "That Will Make You Rethink" : "That You May Find Vaguely Interesting But Won't Change Your Life in Any Way", 78 | "The World's Best" : "An Adequate", 79 | "This Is What Happens" : "This Is Our Bullshit Clickbait Version Of What Happens", 80 | "Totally blew my mind" : "Bored Me To Tears", 81 | "Unbelievable" : "Painfully Ordinary", 82 | "Unimaginable" : "Actually Kind of Droll", 83 | "WHAT?" : "Some Other Crap", 84 | "Whoa" : "*yawn*", 85 | "WHOA" : "Zzzzzzzzzzz", 86 | "Whoah" : "*yawn*", 87 | "Will Blow Your Mind" : "Might Perhaps Mildly Entertain You For a Moment", 88 | "Will Change Your Life Forever" : "Will Not Change Your Life in ANY Meaningful or Lasting Way", 89 | "Won the Internet" : "Seemed Pretty Cool", 90 | "Wonderful" : "Mildly Decent", 91 | "Worst" : "Vaguely Unpleasant", 92 | "Wow" : "Oh GOD This is SO Boring. 
Please Kill Me", 93 | "WOW" : "Zzzzzzzzzzz", 94 | "You Didn't Know Exist" : "No One Gives a Shit About", 95 | "You Didn't Know Existed" : "No One Gives a Shit About", 96 | "You Didn’t Know Exist" : "No One Gives a Shit About", 97 | "You Didn’t Know Existed" : "No One Gives a Shit About", 98 | "You Didn’t Know Exist" : "No One Gives a Shit About", 99 | "You Didn’t Know Existed" : "No One Gives a Shit About", 100 | "You Won't Believe" : "In All Likelihood, You'll Believe", 101 | "You Won’t Believe" : "In All Likelihood, You'll Believe", 102 | "You Won’t Believe" : "In All Likelihood, You'll Believe", 103 | "You Wont Believe" : "In All Likelihood, You'll Believe", 104 | "Have To See To Believe": "Might Have Trouble Picturing" 105 | } 106 | clickbait_phrases = [phrase.lower() for phrase in list(clickbait_phrases.keys())] 107 | 108 | with open('../corpus/common_phrases.txt') as f: 109 | common_phrases = [line.rstrip(' \n') for line in f] 110 | 111 | 112 | clickbait_phrases += common_phrases 113 | 114 | is_click = [] 115 | for title in df.title.values: 116 | no_of_phrases = 0 117 | for phrase in clickbait_phrases: 118 | if title.find(phrase) != -1: 119 | no_of_phrases = 1 120 | break 121 | is_click.append(no_of_phrases) 122 | return np.array(is_click).reshape(-1,1) 123 | 124 | 125 | def click_bait_re(df): 126 | clickbait_re = ['\b^(Is|Can|Do|Will)(.*)\?\B', 127 | '\b[Rr]estored [Mm]y [Ff]aith [Ii]n [Hh]umanity\b', 128 | '\b[Rr]estored [Oo]ur [Ff]aith [Ii]n [Hh]umanity\b', 129 | 'The Best(\s\w+)+\s', 130 | '\b([Rr]easons\s|[Ww]hy\s|[Hh]ow\s|[Ww]hat\s[Yy]ou\s[Ss]hould\s[Kk]now\s[Aa]bout\s)(.*)\b'] 131 | is_click = [] 132 | for title in df.title.values: 133 | no_of_phrases = 0 134 | for re_patten in clickbait_re: 135 | if re.findall(re_patten, title): 136 | no_of_phrases = 1 137 | break 138 | is_click.append(no_of_phrases) 139 | return np.array(is_click).reshape(-1,1) 140 | 141 | def at_mentions(df): 142 | is_click = [] 143 | for title in df.title.values: 144 | is_click.append(title.count('@')) 145 | return np.array(is_click).reshape(-1,1) 146 | 147 | def num_dots(df): 148 | num_dots = [] 149 | for title in df.title.values: 150 | num_dots.append(title.count('.')) 151 | return np.array(num_dots).reshape(-1,1) 152 | 153 | def readability_scores_mp(data): 154 | result_dict, idx, text = data 155 | 156 | # flesch_reading_ease = textstat.flesch_reading_ease(text) 157 | flesch_kincaid_grade = textstat.flesch_kincaid_grade(text) 158 | dale_chall_readability_score = textstat.dale_chall_readability_score(text) 159 | 160 | result_dict[idx] = [flesch_kincaid_grade, dale_chall_readability_score] 161 | 162 | def calc_readability_scores(df): 163 | manager = multiprocessing.Manager() 164 | result_dict = manager.dict() 165 | mp_list = [(result_dict, idx, title) for idx, title in enumerate(df.title.values)] 166 | 167 | with multiprocessing.Pool(os.cpu_count()) as p: 168 | r = list(tqdm_notebook(p.imap(readability_scores_mp, mp_list), total=len(mp_list))) 169 | rows = [result_dict[idx] for idx in range(df.title.values.shape[0])] 170 | return pd.DataFrame(rows).values 171 | 172 | def text_features(df): 173 | longest_word_length = [] 174 | mean_word_length = [] 175 | length_in_chars = [] 176 | 177 | for title in df.title.values: 178 | length_in_chars.append(len(title)) 179 | longest_word_length.append(len(max(title.split(), key = len))) 180 | mean_word_length.append(np.mean([len(word) for word in title.split()])) 181 | 182 | longest_word_length = np.array(longest_word_length).reshape(-1,1) 183 | mean_word_length = 
np.array(mean_word_length).reshape(-1,1) 184 | length_in_chars = np.array(length_in_chars).reshape(-1,1) 185 | 186 | return np.concatenate([longest_word_length, mean_word_length, length_in_chars], axis = 1) 187 | 188 | def count_punctuations(df): 189 | puncts = [] 190 | punctuations = set(string.punctuation) 191 | count = lambda l1,l2: sum([1 for x in l1 if x in l2]) 192 | for title in df.title.values: 193 | puncts.append(count(title,punctuations)) 194 | return np.array(puncts).reshape(-1,1) 195 | 196 | 197 | 198 | def word_ratio(df): 199 | with open('../corpus/DaleChallEasyWordList.txt') as f: 200 | easy_words_list = [line.rstrip(' \n') for line in f] 201 | 202 | with open('../corpus/terrier-stopword.txt') as f: 203 | terrier_stopword_list = [line.rstrip(' \n') for line in f] 204 | 205 | terrier_stopword_list += stopwords.words('english') 206 | 207 | with open('../corpus/common.txt') as f: 208 | common = [line.rstrip(' \n') for line in f] 209 | 210 | terrier_stopword_list += common 211 | 212 | with open('../corpus/contractions.txt') as f: 213 | contractions_list = [line.rstrip(' \n') for line in f] 214 | 215 | with open('../corpus/hyperbolic.txt') as f: 216 | hyperbolic_list = [line.rstrip(' \n') for line in f] 217 | 218 | clickbait_subjects = ['dog', 'everyone', 'girl', 'girls', 'guy', 'guys', 'he', 'here', 'i', 'it', 'kid', 'kids', 'man', 'men', 'mom', 'one', 'parent', 'people', 'photos', 'reasons', 'she', 'signs', 'something', 'that', 'they', 'thing', 'things', 'this', 'thoughts', 'times', 'video', 'ways', 'we', 'what', 'who', 'woman', 'women', 'you'] 219 | 220 | non_clickbait_subjects = ['bomb', 'court', 'crash', 'earthquake', 'explosion', 'fire', 'government', 'group', 'house', 'u.s.', 'china', 'india', 'iran', 'israel', 'korea', 'leader', 'obama', 'police', 'president', 'senate'] 221 | 222 | easy_words_ratio = [] 223 | stop_words_ratio = [] 224 | contractions_ratio = [] 225 | hyperbolic_ratio = [] 226 | clickbait_subs_ratio = [] 227 | non_clickbait_subs_ratio = [] 228 | for title in df.title.values: 229 | easy_words = 0 230 | stop_words = 0 231 | total_words = 0 232 | contracted_words = 0 233 | hyperbolic_words = 0 234 | clickbait_subs = 0 235 | nonclickbait_subs = 0 236 | 237 | for word in title.split(): 238 | if word.lower() in easy_words_list: 239 | easy_words += 1 240 | if word.lower() in terrier_stopword_list: 241 | stop_words += 1 242 | if word.lower() in contractions_list: 243 | contracted_words += 1 244 | if word.lower() in hyperbolic_list: 245 | hyperbolic_words += 1 246 | if word.lower() in clickbait_subjects: 247 | clickbait_subs += 1 248 | if word.lower() in non_clickbait_subjects: 249 | nonclickbait_subs += 1 250 | total_words += 1 251 | 252 | easy_words_ratio.append(easy_words/total_words) 253 | stop_words_ratio.append(stop_words/total_words) 254 | contractions_ratio.append(contracted_words/total_words) 255 | hyperbolic_ratio.append(hyperbolic_words/total_words) 256 | clickbait_subs_ratio.append(clickbait_subs/total_words) 257 | non_clickbait_subs_ratio.append(nonclickbait_subs/total_words) 258 | 259 | easy_words_ratio = np.array(easy_words_ratio).reshape(-1,1) 260 | stop_words_ratio = np.array(stop_words_ratio).reshape(-1,1) 261 | contractions_ratio = np.array(contractions_ratio).reshape(-1,1) 262 | hyperbolic_ratio = np.array(hyperbolic_ratio).reshape(-1,1) 263 | clickbait_subs_ratio = np.array(clickbait_subs_ratio).reshape(-1,1) 264 | non_clickbait_subs_ratio = np.array(non_clickbait_subs_ratio).reshape(-1,1) 265 | 266 | return np.concatenate([easy_words_ratio, 
stop_words_ratio, contractions_ratio, hyperbolic_ratio, clickbait_subs_ratio, non_clickbait_subs_ratio], axis = 1) 267 | 268 | def num_hashtags(df): 269 | return np.array([title.count('#') for title in df.title.values]).reshape(-1,1) 270 | 271 | 272 | def calc_sentiment_scores(df): 273 | sid = SentimentIntensityAnalyzer() 274 | neg = [] 275 | neu = [] 276 | pos = [] 277 | compound = [] 278 | 279 | for title in df.title.values: 280 | sentiments = sid.polarity_scores(title) 281 | neg.append(sentiments['neg']) 282 | neu.append(sentiments['neu']) 283 | pos.append(sentiments['pos']) 284 | compound.append(sentiments['compound']) 285 | 286 | neg = np.array(neg).reshape(-1,1) 287 | neu = np.array(neu).reshape(-1,1) 288 | pos = np.array(pos).reshape(-1,1) 289 | compound = np.array(compound).reshape(-1,1) 290 | return np.concatenate([neg, pos, compound], axis = 1) 291 | 292 | def get_glove_vectors(df, glove): 293 | vectors = [] 294 | for title in tqdm_notebook(df.title.values): 295 | vectors.append(np.average(glove.query(word_tokenize(title)), axis = 0)) 296 | return np.array(vectors) 297 | 298 | 299 | def tfidf_w2v(df, idf_dict, glove): 300 | vectors = [] 301 | for title in tqdm_notebook(df.title.values): 302 | w2v_vectors = glove.query(word_tokenize(title)) 303 | weights = [idf_dict.get(word, 1) for word in word_tokenize(title)] 304 | vectors.append(np.average(w2v_vectors, axis = 0, weights = weights)) 305 | return np.array(vectors) 306 | 307 | 308 | def featurize(train_df, test_df, embedding_type): 309 | 310 | print('Starts with number....') 311 | 312 | 313 | train_starts_with_number = starts_with_number(train_df) 314 | test_starts_with_number = starts_with_number(test_df) 315 | 316 | print('Clickbait Phrases....') 317 | train_cb_phrases = click_bait_phrases(train_df) 318 | test_cb_phrases = click_bait_phrases(test_df) 319 | 320 | 321 | print('Clickbait re....') 322 | train_cb_re = click_bait_re(train_df) 323 | test_cb_re = click_bait_re(test_df) 324 | 325 | 326 | print('Num dots....') 327 | train_num_dots = num_dots(train_df) 328 | test_num_dots = num_dots(test_df) 329 | 330 | 331 | print('Text Features....') 332 | train_text_features = text_features(train_df) 333 | test_text_features = text_features(test_df) 334 | 335 | 336 | print('Punctuation....') 337 | train_num_punctuations = count_punctuations(train_df) 338 | test_num_punctuations = count_punctuations(test_df) 339 | 340 | 341 | print('Word ratios....') 342 | train_word_ratio = word_ratio(train_df) 343 | test_word_ratio = word_ratio(test_df) 344 | 345 | 346 | print('Sentiment Scores....') 347 | train_sentiment = calc_sentiment_scores(train_df) 348 | test_sentiment = calc_sentiment_scores(test_df) 349 | 350 | print('Readability Scores....') 351 | 352 | train_readability_scores = calc_readability_scores(train_df) 353 | test_readability_scores = calc_readability_scores(test_df) 354 | 355 | if embedding_type == 'tfidf': 356 | print('TFIDF Title....') 357 | 358 | tfidf_word = TfidfVectorizer() 359 | 360 | print('TFIDF Word....') 361 | train_word_features = tfidf_word.fit_transform(train_df.title.values) 362 | test_word_features = tfidf_word.transform(test_df.title.values) 363 | 364 | 365 | normalizer_tfidf = MinMaxScaler() 366 | train_embedding_features = sparse.csr_matrix(normalizer_tfidf.fit_transform(train_word_features.todense())) 367 | 368 | 369 | test_embedding_features = sparse.csr_matrix(normalizer_tfidf.fit_transform(test_word_features.todense())) 370 | 371 | elif embedding_type == 'glove': 372 | print('Glove.....') 373 | glove = 
Magnitude("../vectors/glove.6B.100d.magnitude") 374 | train_glove = get_glove_vectors(train_df, glove) 375 | test_glove = get_glove_vectors(test_df, glove) 376 | 377 | normalizer_glove = MinMaxScaler() 378 | train_glove = normalizer_glove.fit_transform(train_glove) 379 | test_glove = normalizer_glove.transform(test_glove) 380 | 381 | 382 | train_embedding_features = sparse.csr_matrix(train_glove) 383 | test_embedding_features = sparse.csr_matrix(test_glove) 384 | 385 | elif embedding_type == 'tfidf_glove': 386 | print('Glove.....') 387 | 388 | glove = Magnitude("../vectors/glove.6B.100d.magnitude") 389 | 390 | tfidf = TfidfVectorizer() 391 | tfidf.fit(train_df.title.values) 392 | idf_dict = dict(zip(tfidf.get_feature_names(), tfidf.idf_)) 393 | 394 | train_glove = tfidf_w2v(train_df, idf_dict, glove) 395 | test_glove = tfidf_w2v(test_df, idf_dict, glove) 396 | 397 | normalizer_glove = MinMaxScaler() 398 | train_glove = normalizer_glove.fit_transform(train_glove) 399 | test_glove = normalizer_glove.transform(test_glove) 400 | 401 | 402 | train_embedding_features = sparse.csr_matrix(train_glove) 403 | test_embedding_features = sparse.csr_matrix(test_glove) 404 | 405 | 406 | train_features = hstack((train_starts_with_number, 407 | train_cb_phrases, 408 | train_cb_re, 409 | train_num_dots, 410 | train_text_features, 411 | train_word_ratio, 412 | train_sentiment, 413 | train_readability_scores, 414 | train_num_punctuations)) 415 | 416 | normalizer = MinMaxScaler() 417 | train_features = normalizer.fit_transform(train_features) 418 | 419 | train_features = sparse.csr_matrix(train_features) 420 | 421 | train_features = sparse.hstack(( 422 | train_features, 423 | train_embedding_features 424 | )) 425 | 426 | 427 | 428 | 429 | test_features = hstack((test_starts_with_number, 430 | test_cb_phrases, 431 | test_cb_re, 432 | test_num_dots, 433 | test_text_features, 434 | test_word_ratio, 435 | test_sentiment, 436 | test_readability_scores, 437 | test_num_punctuations)) 438 | test_features = normalizer.transform(test_features) 439 | 440 | test_features = sparse.csr_matrix(test_features) 441 | test_features = sparse.hstack(( 442 | test_features, 443 | test_embedding_features 444 | )) 445 | 446 | 447 | 448 | 449 | 450 | 451 | 452 | feature_names = ['starts_with_number', 453 | 'clickbait_phrases', 454 | 'clickbait_re', 455 | 'num_dots', 456 | 'longest_word_length', 457 | 'mean_word_length', 458 | 'length_in_chars', 459 | 'easy_words_ratio', 460 | 'stop_words_ratio', 461 | 'contractions_ratio', 462 | 'hyperbolic_ratio', 463 | 'clickbait_subs_ratio', 464 | 'nonclickbait_subs_ratio', 465 | 'sentiment_neg', 466 | 'senitment_pos', 467 | 'sentiment_compound', 468 | 'flesch_kincaid_grade', 469 | 'dale_chall_readability_score', 470 | 'num_punctuations' 471 | ] 472 | 473 | if embedding_type == 'tfidf': 474 | feature_names = feature_names + ['tfidf_word_' + col for col in tfidf_word.get_feature_names()] 475 | else: 476 | 477 | feature_names = feature_names + ['glove_' + str(col) for col in range(100)] 478 | print('DONE!') 479 | 480 | return train_features, test_features, feature_names 481 | -------------------------------------------------------------------------------- /notebooks/models_ensembles_tuning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "Collapsed": "false" 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline\n", 12 | "from utility import *" 13 | ] 
14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": { 18 | "Collapsed": "false" 19 | }, 20 | "source": [ 21 | "# Models" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": { 27 | "Collapsed": "false" 28 | }, 29 | "source": [ 30 | "In this section, we'll use the features we created in the previous section, along with IDF-weighted embeddings, and try them on different models. \n", 31 | "\n", 32 | "As mentioned earlier, when dealing with small datasets, low-complexity models like Logistic Regression, SVMs and Naive Bayes will generalize the best. We'll try these models along with non-parametric models like KNN and non-linear models like Random Forest, XGBoost etc. \n", 33 | "\n", 34 | "We'll also try bootstrap-aggregating or bagging with the best-performing classifier along with stacking using VotingClassifier. Let's get started!" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 2, 40 | "metadata": { 41 | "Collapsed": "false" 42 | }, 43 | "outputs": [ 44 | { 45 | "name": "stdout", 46 | "output_type": "stream", 47 | "text": [ 48 | "Starts with number....\n", 49 | "Clickbait Phrases....\n", 50 | "Clickbait re....\n", 51 | "Num dots....\n", 52 | "Text Features....\n", 53 | "Punctuation....\n", 54 | "Word ratios....\n", 55 | "Sentiment Scores....\n", 56 | "Readability Scores....\n" 57 | ] 58 | }, 59 | { 60 | "data": { 61 | "application/vnd.jupyter.widget-view+json": { 62 | "model_id": "fd1f983853cd4c4d9c8ae54571d7e7a2", 63 | "version_major": 2, 64 | "version_minor": 0 65 | }, 66 | "text/plain": [ 67 | "HBox(children=(IntProgress(value=0, max=50), HTML(value='')))" 68 | ] 69 | }, 70 | "metadata": {}, 71 | "output_type": "display_data" 72 | }, 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "\n" 78 | ] 79 | }, 80 | { 81 | "data": { 82 | "application/vnd.jupyter.widget-view+json": { 83 | "model_id": "98d332e04c154470addc73538472c536", 84 | "version_major": 2, 85 | "version_minor": 0 86 | }, 87 | "text/plain": [ 88 | "HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))" 89 | ] 90 | }, 91 | "metadata": {}, 92 | "output_type": "display_data" 93 | }, 94 | { 95 | "name": "stdout", 96 | "output_type": "stream", 97 | "text": [ 98 | "\n", 99 | "Glove.....\n" 100 | ] 101 | }, 102 | { 103 | "data": { 104 | "application/vnd.jupyter.widget-view+json": { 105 | "model_id": "f89e4bcff37a48b58b4fe0595dbc18bd", 106 | "version_major": 2, 107 | "version_minor": 0 108 | }, 109 | "text/plain": [ 110 | "HBox(children=(IntProgress(value=0, max=50), HTML(value='')))" 111 | ] 112 | }, 113 | "metadata": {}, 114 | "output_type": "display_data" 115 | }, 116 | { 117 | "name": "stdout", 118 | "output_type": "stream", 119 | "text": [ 120 | "\n" 121 | ] 122 | }, 123 | { 124 | "data": { 125 | "application/vnd.jupyter.widget-view+json": { 126 | "model_id": "31fc2e9717a34b0e9457f419952ac01e", 127 | "version_major": 2, 128 | "version_minor": 0 129 | }, 130 | "text/plain": [ 131 | "HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))" 132 | ] 133 | }, 134 | "metadata": {}, 135 | "output_type": "display_data" 136 | }, 137 | { 138 | "name": "stdout", 139 | "output_type": "stream", 140 | "text": [ 141 | "\n", 142 | "DONE!\n" 143 | ] 144 | } 145 | ], 146 | "source": [ 147 | "from featurization import *\n", 148 | "import pandas as pd \n", 149 | "import numpy as np\n", 150 | "\n", 151 | "\n", 152 | "train = pd.read_csv('../datasets/train.csv')\n", 153 | "test = pd.read_csv('../datasets/test.csv')\n", 154 | "train_features, 
test_features, feature_names = featurize(train, test, 'tfidf_glove')" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 3, 160 | "metadata": { 161 | "Collapsed": "false" 162 | }, 163 | "outputs": [], 164 | "source": [ 165 | "y_train = np.where(train.label.values == 'clickbait', 1, 0)\n", 166 | "y_test = np.where(test.label.values == 'clickbait', 1, 0)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": { 172 | "Collapsed": "false" 173 | }, 174 | "source": [ 175 | "GridSearchCV HelperFunction" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 4, 181 | "metadata": { 182 | "Collapsed": "false" 183 | }, 184 | "outputs": [], 185 | "source": [ 186 | "from sklearn.model_selection import GridSearchCV, PredefinedSplit\n", 187 | "from sklearn.metrics import make_scorer\n", 188 | "from scipy import sparse\n", 189 | "\n", 190 | "\n", 191 | "def warn(*args, **kwargs):\n", 192 | " pass\n", 193 | "import warnings\n", 194 | "warnings.warn = warn\n", 195 | "\n", 196 | "\n", 197 | "def adjusted_f1(y_true, y_prob):\n", 198 | " f1 = print_model_metrics(y_true, y_prob, verbose = 0, return_metrics = True)[0]\n", 199 | " return f1\n", 200 | "\n", 201 | "score = make_scorer(adjusted_f1, greater_is_better = True, needs_proba = True)\n", 202 | "\n", 203 | "\n", 204 | "\n", 205 | "# Since we want to use a predefined Test/Val set, we'll use PredefinedSplit and pass it as the CV parameter\n", 206 | "# We need to merge both the datasets and label 0 for test and -1 for the train set\n", 207 | "\n", 208 | "X = sparse.vstack((train_features, test_features))\n", 209 | "test_fold = [-1 for _ in range(train_features.shape[0])] + [0 for _ in range(test_features.shape[0])]\n", 210 | "y = np.concatenate([y_train, y_test])\n", 211 | "ps = PredefinedSplit(test_fold)\n", 212 | "\n", 213 | "def run_grid_search(model, params, x_train, y_train):\n", 214 | " grid = GridSearchCV(model, params, cv = ps, n_jobs = -1, scoring = score, verbose = 0, refit = False)\n", 215 | " grid.fit(x_train, y_train)\n", 216 | " return (grid.best_params_, grid.best_score_)" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 5, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "# Run log reg n times and average the metrics\n", 226 | "def fit_n_times(model, x_train, y_train, x_test, y_test, n_iters = 10):\n", 227 | " metrics = np.zeros(5)\n", 228 | " for _ in range(n_iters):\n", 229 | " model.fit(x_train, y_train)\n", 230 | " y_test_prob = model.predict_proba(x_test)[:,1]\n", 231 | " metrics += print_model_metrics(y_test, y_test_prob, verbose = False, return_metrics = True)\n", 232 | " metrics /=10\n", 233 | " print('F1: {:.3f} | Pr: {:.3f} | Re: {:.3f} | AUC: {:.3f} | Accuracy: {:.3f} \\n'.format(*metrics))" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": { 239 | "Collapsed": "false" 240 | }, 241 | "source": [ 242 | "## Log Reg " 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 47, 248 | "metadata": { 249 | "Collapsed": "false" 250 | }, 251 | "outputs": [ 252 | { 253 | "name": "stdout", 254 | "output_type": "stream", 255 | "text": [ 256 | "Best Parameters : {'alpha': 0.1, 'l1_ratio': 0.15, 'penalty': 'elasticnet'}\n", 257 | "F1: 0.966 | Pr: 0.959 | Re: 0.974 | AUC: 0.994 | Accuracy: 0.966 \n", 258 | "\n" 259 | ] 260 | } 261 | ], 262 | "source": [ 263 | "from sklearn.linear_model import SGDClassifier\n", 264 | "\n", 265 | "lr = SGDClassifier(loss = 'log')\n", 266 | "lr_params = {'alpha' 
: [10**(-x) for x in range(7)],\n", 267 | " 'penalty' : ['l1', 'l2', 'elasticnet'],\n", 268 | " 'l1_ratio' : [0.15, 0.25, 0.5, 0.75]}\n", 269 | "\n", 270 | "best_params, best_f1 = run_grid_search(lr, lr_params, X, y)\n", 271 | "\n", 272 | "print('Best Parameters : {}'.format(best_params))\n", 273 | "\n", 274 | "lr = SGDClassifier(loss = 'log', \n", 275 | " alpha = best_params['alpha'], \n", 276 | " penalty = best_params['penalty'], \n", 277 | " l1_ratio = best_params['l1_ratio'])\n", 278 | "fit_n_times(lr, train_features, y_train, test_features, y_test)" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": { 284 | "Collapsed": "false" 285 | }, 286 | "source": [ 287 | "## SVM" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 48, 293 | "metadata": { 294 | "Collapsed": "false" 295 | }, 296 | "outputs": [ 297 | { 298 | "name": "stdout", 299 | "output_type": "stream", 300 | "text": [ 301 | "Best Parameters : {'C': 10, 'degree': 2, 'kernel': 'poly'}\n", 302 | "Best F1 : 0.9683981828955164\n", 303 | "F1: 0.968 | Pr: 0.956 | Re: 0.981 | AUC: 0.994 | Accuracy: 0.968 \n", 304 | "\n" 305 | ] 306 | } 307 | ], 308 | "source": [ 309 | "from sklearn.svm import SVC\n", 310 | "\n", 311 | "svm = SVC(probability = True)\n", 312 | "svm_params = {'C' : [10**(x) for x in range(-1,4)],\n", 313 | " 'kernel' : ['poly', 'rbf', 'linear'],\n", 314 | " 'degree' : [2, 3]}\n", 315 | "\n", 316 | "best_params, best_f1 = run_grid_search(svm, svm_params, X, y)\n", 317 | "\n", 318 | "print('Best Parameters : {}'.format(best_params))\n", 319 | "print('Best F1 : {}'.format(best_f1))\n", 320 | "\n", 321 | "svm = SVC(C = best_params['C'], kernel = best_params['kernel'], degree = best_params['degree'], probability = True)\n", 322 | "fit_n_times(svm, train_features, y_train, test_features, y_test)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": { 328 | "Collapsed": "false" 329 | }, 330 | "source": [ 331 | "## Naive Bayes" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 49, 337 | "metadata": { 338 | "Collapsed": "false", 339 | "scrolled": true 340 | }, 341 | "outputs": [ 342 | { 343 | "name": "stdout", 344 | "output_type": "stream", 345 | "text": [ 346 | "Best Parameters : {'alpha': 10000}\n", 347 | "Best F1 : 0.9634676145339652\n", 348 | "F1: 0.963 | Pr: 0.951 | Re: 0.976 | AUC: 0.993 | Accuracy: 0.963 \n", 349 | "\n" 350 | ] 351 | } 352 | ], 353 | "source": [ 354 | "from sklearn.naive_bayes import MultinomialNB\n", 355 | "\n", 356 | "nb = MultinomialNB(class_prior = [0.5, 0.5])\n", 357 | "nb_params = {'alpha' : [10**(x) for x in range(6)]}\n", 358 | "\n", 359 | "\n", 360 | "best_params, best_f1 = run_grid_search(nb, nb_params, X, y)\n", 361 | "\n", 362 | "print('Best Parameters : {}'.format(best_params))\n", 363 | "print('Best F1 : {}'.format(best_f1))\n", 364 | "\n", 365 | "nb = MultinomialNB(alpha = best_params['alpha'], class_prior = [0.5, 0.5])\n", 366 | "\n", 367 | "fit_n_times(nb, train_features, y_train, test_features, y_test)" 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "metadata": { 373 | "Collapsed": "false" 374 | }, 375 | "source": [ 376 | "## KNN" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 51, 382 | "metadata": { 383 | "Collapsed": "false", 384 | "scrolled": true 385 | }, 386 | "outputs": [ 387 | { 388 | "name": "stdout", 389 | "output_type": "stream", 390 | "text": [ 391 | "Best Parameters : {'n_neighbors': 7, 'weights': 'distance'}\n", 392 | "F1: 0.962 | Pr: 
0.955 | Re: 0.970 | AUC: 0.992 | Accuracy: 0.962 \n", 393 | "\n" 394 | ] 395 | } 396 | ], 397 | "source": [ 398 | "from sklearn.neighbors import KNeighborsClassifier\n", 399 | "\n", 400 | "knn = KNeighborsClassifier(n_jobs = -1)\n", 401 | "\n", 402 | "knn_params = { 'n_neighbors' : [3, 5, 7, 9, 15, 31], \n", 403 | " 'weights' : ['uniform', 'distance']\n", 404 | "}\n", 405 | "\n", 406 | "best_params, best_f1 = run_grid_search(knn, knn_params, X, y)\n", 407 | "print('Best Parameters : {}'.format(best_params))\n", 408 | "\n", 409 | "knn = KNeighborsClassifier(n_neighbors = best_params['n_neighbors'], weights = best_params['weights'], n_jobs = -1)\n", 410 | "\n", 411 | "fit_n_times(knn, train_features, y_train, test_features, y_test)" 412 | ] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "metadata": { 417 | "Collapsed": "false" 418 | }, 419 | "source": [ 420 | "## Random Forest" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 53, 426 | "metadata": { 427 | "Collapsed": "false" 428 | }, 429 | "outputs": [ 430 | { 431 | "name": "stdout", 432 | "output_type": "stream", 433 | "text": [ 434 | "Best Parameters : {'max_depth': 15, 'min_samples_split': 5, 'n_estimators': 250}\n", 435 | "F1: 0.957 | Pr: 0.950 | Re: 0.964 | AUC: 0.991 | Accuracy: 0.956 \n", 436 | "\n" 437 | ] 438 | } 439 | ], 440 | "source": [ 441 | "from sklearn.ensemble import RandomForestClassifier\n", 442 | "\n", 443 | "rf = RandomForestClassifier(n_jobs = -1)\n", 444 | "\n", 445 | "rf_params = { 'n_estimators' : [10, 100, 250, 500, 1000], \n", 446 | " 'max_depth' : [None, 3, 7, 15],\n", 447 | " 'min_samples_split' : [2, 5, 15]\n", 448 | "}\n", 449 | "\n", 450 | "best_params, best_f1 = run_grid_search(rf, rf_params, X, y)\n", 451 | "\n", 452 | "print('Best Parameters : {}'.format(best_params))\n", 453 | "rf = RandomForestClassifier(n_estimators = best_params['n_estimators'],\n", 454 | " min_samples_split = best_params['min_samples_split'],\n", 455 | " max_depth = best_params['max_depth'], \n", 456 | " n_jobs = -1)\n", 457 | "fit_n_times(rf, train_features, y_train, test_features, y_test)" 458 | ] 459 | }, 460 | { 461 | "cell_type": "markdown", 462 | "metadata": { 463 | "Collapsed": "false" 464 | }, 465 | "source": [ 466 | "## XGBoost" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": 55, 472 | "metadata": { 473 | "Collapsed": "false" 474 | }, 475 | "outputs": [ 476 | { 477 | "name": "stdout", 478 | "output_type": "stream", 479 | "text": [ 480 | "Best Parameters : {'learning_rate': 0.3, 'max_depth': 1, 'n_estimators': 100, 'reg_alpha': 0}\n", 481 | "F1: 0.955 | Pr: 0.947 | Re: 0.964 | AUC: 0.990 | Accuracy: 0.955 \n", 482 | "\n" 483 | ] 484 | } 485 | ], 486 | "source": [ 487 | "from xgboost import XGBClassifier\n", 488 | "\n", 489 | "xgb = XGBClassifier(n_jobs = -1)\n", 490 | "\n", 491 | "xgb_params = { 'n_estimators' : [10, 100, 200, 500], \n", 492 | " 'max_depth' : [1, 2, 3, 7],\n", 493 | " 'learning_rate' : [0.1, 0.2, 0.01, 0.3],\n", 494 | " 'reg_alpha' : [0, 0.1, 0.2]\n", 495 | "}\n", 496 | "\n", 497 | "best_params, best_f1 = run_grid_search(xgb, xgb_params, X, y)\n", 498 | "\n", 499 | "print('Best Parameters : {}'.format(best_params))\n", 500 | "xgb = XGBClassifier(n_estimators = best_params['n_estimators'],\n", 501 | " learning_rate = best_params['learning_rate'],\n", 502 | " max_depth = best_params['max_depth'], \n", 503 | " reg_alpha = best_params['reg_alpha'], \n", 504 | " n_jobs = -1)\n", 505 | "fit_n_times(xgb, train_features, y_train, test_features, y_test)" 
506 | ] 507 | }, 508 | { 509 | "cell_type": "markdown", 510 | "metadata": { 511 | "Collapsed": "false" 512 | }, 513 | "source": [ 514 | "## DL Tabular Data" 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": null, 520 | "metadata": { 521 | "Collapsed": "false" 522 | }, 523 | "outputs": [], 524 | "source": [ 525 | "from tensorflow.keras.models import Sequential\n", 526 | "from tensorflow.keras.layers import Dense, Dropout\n", 527 | "from tensorflow.keras.optimizers import RMSprop, Adam\n", 528 | "from tensorflow.keras.callbacks import ModelCheckpoint\n", 529 | "\n", 530 | "batch_size = 128\n", 531 | "epochs = 40\n", 532 | "\n", 533 | "simple_nn = Sequential()\n", 534 | "simple_nn.add(Dense(150, activation='relu', input_shape=(119,)))\n", 535 | "simple_nn.add(Dropout(0.2))\n", 536 | "simple_nn.add(Dense(100, activation='relu'))\n", 537 | "simple_nn.add(Dropout(0.2))\n", 538 | "simple_nn.add(Dense(1, activation='sigmoid'))\n", 539 | "\n", 540 | "simple_nn.summary()\n", 541 | "\n", 542 | "simple_nn.compile(loss='binary_crossentropy',\n", 543 | " optimizer=Adam(),\n", 544 | " metrics=['accuracy'])\n", 545 | "\n", 546 | "checkpoint = ModelCheckpoint('./saved_models', monitor = 'val_accuracy', verbose = 1, save_best_only=True)\n", 547 | "history = simple_nn.fit(train_features.todense(), y_train,\n", 548 | " batch_size=batch_size,\n", 549 | " epochs=epochs,\n", 550 | " verbose=1,\n", 551 | " callbacks = [checkpoint],\n", 552 | " validation_data=(test_features.todense(), y_test))" 553 | ] 554 | }, 555 | { 556 | "cell_type": "code", 557 | "execution_count": 4, 558 | "metadata": {}, 559 | "outputs": [], 560 | "source": [ 561 | "from tensorflow.keras.models import load_model\n", 562 | "\n", 563 | "simple_nn = load_model('./saved_models')" 564 | ] 565 | }, 566 | { 567 | "cell_type": "code", 568 | "execution_count": 70, 569 | "metadata": { 570 | "Collapsed": "false" 571 | }, 572 | "outputs": [ 573 | { 574 | "name": "stdout", 575 | "output_type": "stream", 576 | "text": [ 577 | "F1: 0.961 | Pr: 0.952 | Re: 0.970 | AUC: 0.992 | Accuracy: 0.960 \n", 578 | "\n" 579 | ] 580 | } 581 | ], 582 | "source": [ 583 | "y_pred_prob = simple_nn.predict(test_features.todense())\n", 584 | "print_model_metrics(y_test, y_pred_prob)" 585 | ] 586 | }, 587 | { 588 | "cell_type": "markdown", 589 | "metadata": { 590 | "Collapsed": "false" 591 | }, 592 | "source": [ 593 | "# Bagging" 594 | ] 595 | }, 596 | { 597 | "cell_type": "markdown", 598 | "metadata": { 599 | "Collapsed": "false" 600 | }, 601 | "source": [ 602 | "Since SVM worked so well, we can try a bagging classifier by using SVM as a base estimator. This should improve the variance of the base model and reduce overfitting. 
" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": 7, 608 | "metadata": { 609 | "Collapsed": "false" 610 | }, 611 | "outputs": [ 612 | { 613 | "name": "stdout", 614 | "output_type": "stream", 615 | "text": [ 616 | "F1: 0.968 | Pr: 0.961 | Re: 0.975 | AUC: 0.995 | Accuracy: 0.968 \n", 617 | "\n" 618 | ] 619 | } 620 | ], 621 | "source": [ 622 | "from sklearn.ensemble import BaggingClassifier\n", 623 | "from sklearn.svm import SVC\n", 624 | "from sklearn.model_selection import RandomizedSearchCV\n", 625 | "\n", 626 | "svm = SVC(C = 10, kernel = 'poly', degree = 2, probability = True, verbose = 0)\n", 627 | "\n", 628 | "svm_bag = BaggingClassifier(svm, n_estimators = 200, max_features = 0.9, max_samples = 1.0, bootstrap_features = False, bootstrap = True, n_jobs = 1, verbose = 0)\n", 629 | "\n", 630 | "svm_bag.fit(train_features, y_train)\n", 631 | "y_test_prob = svm_bag.predict_proba(test_features)[:,1]\n", 632 | "print_model_metrics(y_test, y_test_prob)" 633 | ] 634 | }, 635 | { 636 | "cell_type": "markdown", 637 | "metadata": { 638 | "Collapsed": "false" 639 | }, 640 | "source": [ 641 | "# Stacking Classifier" 642 | ] 643 | }, 644 | { 645 | "cell_type": "markdown", 646 | "metadata": { 647 | "Collapsed": "false" 648 | }, 649 | "source": [ 650 | "Finally, one last thing we can try is the Stacking Classifier = basically a weighted average of the predictions of different models. Since we are using the fast ai tabular learner we wont be able to use Sklearn's `VotingClassifier` instead we'll just run a simple loop that gets the predictions of each model and runs a weighted average. " 651 | ] 652 | }, 653 | { 654 | "cell_type": "code", 655 | "execution_count": 8, 656 | "metadata": { 657 | "Collapsed": "false" 658 | }, 659 | "outputs": [ 660 | { 661 | "name": "stdout", 662 | "output_type": "stream", 663 | "text": [ 664 | "Training LR\n", 665 | "Training SVM\n", 666 | "Training NB\n", 667 | "Training KNN\n", 668 | "Training RF\n" 669 | ] 670 | }, 671 | { 672 | "name": "stderr", 673 | "output_type": "stream", 674 | "text": [ 675 | "/home/anirudh/.local/lib/python3.7/site-packages/sklearn/svm/base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. 
Set gamma explicitly to 'auto' or 'scale' to avoid this warning.\n", 676 | " \"avoid this warning.\", FutureWarning)\n" 677 | ] 678 | }, 679 | { 680 | "name": "stdout", 681 | "output_type": "stream", 682 | "text": [ 683 | "Training XGB\n", 684 | "F1: 0.970 | Pr: 0.967 | Re: 0.972 | AUC: 0.995 | Accuracy: 0.970 \n", 685 | "\n" 686 | ] 687 | } 688 | ], 689 | "source": [ 690 | "# Define all models \n", 691 | "from sklearn.ensemble import RandomForestClassifier\n", 692 | "from sklearn.neighbors import KNeighborsClassifier\n", 693 | "from sklearn.naive_bayes import MultinomialNB\n", 694 | "from xgboost import XGBClassifier\n", 695 | "from sklearn.svm import SVC\n", 696 | "from sklearn.linear_model import SGDClassifier\n", 697 | "\n", 698 | "lr = SGDClassifier(loss = 'log', alpha = 0.1, penalty = 'elasticnet')\n", 699 | "svm = SVC(C = 10, kernel = 'poly', degree = 2, probability = True)\n", 700 | "nb = MultinomialNB(alpha = 10000, class_prior = [0.5, 0.5])\n", 701 | "knn = KNeighborsClassifier(n_neighbors = 7, weights = 'distance', n_jobs = -1)\n", 702 | "rf = RandomForestClassifier(n_estimators = 250, min_samples_split = 5, max_depth = 15, n_jobs = -1)\n", 703 | "xgb = XGBClassifier(n_estimators = 100, learning_rate = 0.3, max_depth = 1, n_jobs = -1)\n", 704 | "\n", 705 | "model_dict = dict(zip(['LR', 'SVM', 'NB', 'KNN', 'RF', 'XGB'], [lr, svm, nb, knn, rf, xgb]))\n", 706 | "\n", 707 | "for model_name, model in model_dict.items():\n", 708 | " print('Training {}'.format(model_name))\n", 709 | " model.fit(train_features, y_train)\n", 710 | "\n", 711 | "model_weights = { 'LR' : 0.9,\n", 712 | " 'SVM' : 0.9,\n", 713 | " 'NB' : 0.8,\n", 714 | " 'KNN' : 0.75,\n", 715 | " 'RF' : 0.75,\n", 716 | " 'XGB' : 0.6,\n", 717 | " 'simple_nn' : 0.7\n", 718 | "}\n", 719 | "\n", 720 | "y_pred_prob = 0\n", 721 | "\n", 722 | "for model_name, model in model_dict.items():\n", 723 | " y_pred_prob += (model.predict_proba(test_features)[:,1] * model_weights[model_name])\n", 724 | "\n", 725 | "y_pred_prob += (simple_nn.predict(test_features.todense()).ravel() * model_weights['simple_nn'])\n", 726 | "y_pred_prob /= sum(model_weights.values())\n", 727 | "\n", 728 | "print_model_metrics(y_test, y_pred_prob)" 729 | ] 730 | }, 731 | { 732 | "cell_type": "code", 733 | "execution_count": 75, 734 | "metadata": { 735 | "Collapsed": "false" 736 | }, 737 | "outputs": [], 738 | "source": [ 739 | "def run_voting_clf(model_weights):\n", 740 | " #result_list, model_weights = data\n", 741 | " \n", 742 | " y_pred_prob = 0\n", 743 | "\n", 744 | " for model_name, model in model_dict.items():\n", 745 | " y_pred_prob += (model.predict_proba(test_features)[:,1] * model_weights[model_name])\n", 746 | "\n", 747 | " #y_pred_prob += (simple_nn.get_preds(ds_type = DatasetType.Valid)[0].numpy()[:,0] * model_weights['simple_nn'])\n", 748 | " y_pred_prob += (simple_nn.predict(test_features.todense()).ravel() * model_weights['simple_nn'])\n", 749 | " y_pred_prob /= sum(model_weights.values())\n", 750 | " f1 = print_model_metrics(y_test, y_pred_prob, return_metrics = True, verbose = 0)[0]\n", 751 | " return {'loss' : -f1, 'status' : STATUS_OK}" 752 | ] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": 76, 757 | "metadata": { 758 | "Collapsed": "false" 759 | }, 760 | "outputs": [ 761 | { 762 | "name": "stdout", 763 | "output_type": "stream", 764 | "text": [ 765 | "100%|██████████| 500/500 [08:45<00:00, 1.05s/it, best loss: -0.9708506841165973]\n" 766 | ] 767 | } 768 | ], 769 | "source": [ 770 | "from hyperopt import fmin, tpe, hp, 
STATUS_OK, Trials\n", 771 | "\n", 772 | "trials = Trials()\n", 773 | "model_weights = fmin(run_voting_clf,\n", 774 | " space= {\n", 775 | " 'LR' : hp.uniform('LR', 0, 1),\n", 776 | " 'SVM' : hp.uniform('SVM', 0, 1),\n", 777 | " 'NB' : hp.uniform('NB', 0, 1),\n", 778 | " 'KNN' : hp.uniform('KNN', 0, 1),\n", 779 | " 'RF' : hp.uniform('RF', 0, 1),\n", 780 | " 'XGB' : hp.uniform('XGB', 0, 1),\n", 781 | " 'simple_nn' : hp.uniform('simple_nn', 0, 1),\n", 782 | "\n", 783 | "\n", 784 | " },\n", 785 | " algo=tpe.suggest,\n", 786 | " max_evals=500,\n", 787 | " trials = trials)" 788 | ] 789 | }, 790 | { 791 | "cell_type": "code", 792 | "execution_count": 9, 793 | "metadata": { 794 | "Collapsed": "false" 795 | }, 796 | "outputs": [], 797 | "source": [ 798 | "model_weights = {'KNN': 0.7866810233035141,\n", 799 | " 'LR': 0.8036572275670447,\n", 800 | " 'NB': 0.9102009774357307,\n", 801 | " 'RF': 0.1559824350958057,\n", 802 | " 'SVM': 0.9355079606348642,\n", 803 | " 'XGB': 0.33469066125332436,\n", 804 | " 'simple_nn': 0.000545264707939086}" 805 | ] 806 | }, 807 | { 808 | "cell_type": "code", 809 | "execution_count": 10, 810 | "metadata": { 811 | "Collapsed": "false" 812 | }, 813 | "outputs": [ 814 | { 815 | "name": "stdout", 816 | "output_type": "stream", 817 | "text": [ 818 | "F1: 0.971 | Pr: 0.963 | Re: 0.980 | AUC: 0.995 | Accuracy: 0.971 \n", 819 | "\n" 820 | ] 821 | }, 822 | { 823 | "data": { 824 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAEWCAYAAABollyxAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nO3dd5wV1fnH8c+zEKRIxwaoYGFsgMESVKzB2Im9/LChBoloRGPsHQuiKBG7UbFHTSRG0VhjRaMgiFiOoBQVlI7ShX1+f8wsXtbdZe+ys2X2+/Z1X9x7zsycM7vrs2fPnHnG3B0REcmugurugIiIpEuBXkQk4xToRUQyToFeRCTjFOhFRDJOgV5EJOMU6GWtmVkjM3vWzBaY2VNrcZzeZvZSZfatOpjZC2Z2UnX3Q6SIAn0dYmb/Z2ajzWyhmc1IAlKPSjj0kcAGQGt3P6qiB3H3R939d5XQn9WY2V5m5mY2olh516T89XIe50oze2RN27n7Ae7+YAW7K1LpFOjrCDM7FxgKXEcclDcB7gB+XwmH3xT4wt1XVMKx0jIL2MXMWueUnQR8UVkNWEz/T0mNox/KOsDMmgNXA/3d/Wl3X+TuP7n7s+7+l2SbdcxsqJlNT15DzWydpG4vM/vGzP5sZjOTvwb6JHVXAZcDxyR/KZxafORrZh2SkXP95PPJZvaVmf1oZpPNrHdO+ds5++1qZh8kU0IfmNmuOXWvm9lAM3snOc5LZtamjC/DcuBfwLHJ/vWAY4BHi32t/mpmX5vZD2Y2xsx2T8r3By7OOc+PcvpxrZm9AywGNkvKTkvq7zSzf+Yc/wYze9XMrNzfQJG1pEBfN+wCNARGlLHNJUB3YHugK7AzcGlO/YZAc6AdcCpwu5m1dPcriP9KeMLd13X3+8rqiJk1AW4FDnD3psCuwLgStmsFjEy2bQ3cDIwsNiL/P6APsD7QADivrLaBh4ATk/f7AROA6cW2+YD4a9AKeAx4yswauvt/ip1n15x9TgD6Ak2BqcWO92egc/JLbHfir91JrtwjUoUU6OuG1sDsNUyt9AaudveZ7j4LuIo4gBX5Kan/yd2fBxYCUQX7UwhsZ2aN3H2Gu39SwjYHARPd/WF3X+HujwOfA4fkbPOAu3/h7kuAJ4kDdKncfRTQyswi4oD/UAnbPOLuc5I2hwDrsObzHO7unyT7/FTseIuJv443A48AZ7n7N2s4nkilUqCvG+YAbYqmTkrRltVHo1OTslXHKPaLYjGwbr4dcfdFxFMm/YAZZjbSzLYqR3+K+tQu5/N3FejPw8CZwN6U8BeOmZ1nZp8l00Xzif+KKWtKCODrsird/X/AV4AR/0ISqVIK9HXDu8Ay4NAytplOfFG1yCb8clqjvBYBjXM+b5hb6e4vuvu+wEbEo/R7y9Gfoj59W8E+FXkYOAN4Phltr5JMrZwPHA20dPcWwALiAA1Q2nRLmdMwZtaf+C+D6cnxRaqUAn0d4O4LiC+Y3m5mh5pZYzP7lZkdYGaDk80eBy41s/WSi5qXE081VMQ4YA8z2yS5EHxRUYWZbWBmv0/m6pcRTwEVlnCM54FOyZLQ+mZ2DLAN8FwF+wSAu08G9iS+JlFcU2AF8Qqd+mZ2OdAsp/57oEM+K2vMrBNwDXA88RTO+WZW5hSTSGVToK8jkvnmc4kvsM4inm44k3glCsTBaDQwHvgY+DApq0hbLwNPJMcaw+rBuSDpx3RgLnHQ/WMJx5gDHEx8MXMO8Uj4YHefXZE+FTv22+5e0l8rLwL/IV5yORVYyurTMkU3g80xsw/X1E4yVfYIcIO7f+TuE4lX7jxctKJJpCqYLv6LiGSbRvQiIhmnQC8iknEK9CIiGadALyKScWXdQFOtGu10rq4Syy/MfmdIdXdBaqAmDdY+d1CjX59Z7pizZOxttSpXkUb0IiIZV2NH9CIiVSrlDNNRFF0BXAl0DiFMiKLoFOAcYCXxjXrnhBDeSrbtDtwNNAKmAMeHEGauqa40GtGLiAAU1Cv/K09RFHUjzg47Nfncmvj5ED1DCNsTpxG/O6krIL7Rrn8IoRPwJjBoTXVl0YheRAQgj2n+KIpaAC1KqJofQphfbNt1gNuB44DXi1pLX
k2JU2u0AIqymu4ALA0hFD2b4S7ikfspa6grlUb0IiIQT92U9wUDgMklvAaUcOSrgUdCCFOKCkIIs4HTgQ+jKJpG/KyDM5LqTcjJ3JpsWxBFUas11JVKI3oREchrRE887TK8hPLio/ldgB2BC4uVNyPONbVTCCFEUXQ0MCKKoi75dKK8FOhFRCCvi7HJ9Mz8NW4YJ+3bGpgcRRFAe+LkeecQT/OE5HhPRlE0nPjZB9PISdEdRVEboDCEMDcZ/ZdYV1YnNHUjIgLxiL68r3IKIQwKIbQNIXQIIXQgnoffD/gS6BZF0foAURTtDfwAzCbO+NooiqIeyWH68XPm1LLqSqURvYgIVGg1TUWFEMZEUTQYeCOKouXEz2Y4MoTggEdRdAJwdxRFDUmWUCb7FZZWV5Yam6ZYd8ZKSXRnrJSkUu6M7XFZ+e+MfXtgrbozViN6ERHI92JsraJALyICqd8ZW50U6EVEQIFeRCTz6lXdxdiqpkAvIgKaoxcRyTxN3YiIZJxG9CIiGacRvYhIxmlELyKScVWYAqGqKdCLiICmbkREMk9TNyIiGacRvYhIxinQi4hknC7GiohknOboRUQyTlM3IiIZpxG9iEi2mQK9iEi2KdCLiGScFSjQi4hkmkb0IiIZp0AvIpJxCvQiIlmX3TivQC8iAhrRi4hkXkGB7owVEcm0tEf0URRdAVwJdA4hTIiiqDtwN9AImAIcH0KYmWxbobrSZPdXmIhIPiyPV56iKOoGdAemJp8LgEeA/iGETsCbwKC1qSuLRvQiIuQ3oo+iqAXQooSq+SGE+cW2XQe4HTgOeD0p3gFYGkJ4O/l8F/Ho/JS1qCuVRvQiIsSBvrwvYAAwuYTXgBIOfTXwSAhhSk7ZJiSje4AQwmygIIqiVmtRVyqN6EVEyDsFwlBgeAnlxUfzuwA7AhdWuGOVQIFeRIT8pm6S6Zn5a9wQ9gS2BiZHUQTQHngRuBXYtGijKIraAIUhhLlRFE2rSF1ZndDUjYgIeU/dlEsIYVAIoW0IoUMIoQPwDbAfcCPQKIqiHsmm/YCnkvdjKlhXKgV6ERHSCfSlCSEUAicAd0ZRNJF45H/h2tSVRVM3IiJUzZ2xyai+6P0ooHMp21WorjQK9CIioFw3IiJZpxQIIiIZp6RmUmnOO/m3DOx/EHc9+Tbn3Pg0AE0aNWBg/4PotVdnWjVvwtffz+Nv/xzFsMffXLXfKYd15+jfdaNr1I4WTRsR9RrItBnzVtWbGU/e1IeundqxXst1mffjEl7/YCKXDnuO6bMWVPl5Sv7GjP6Ahx+8n88+/YRZM2dy5cDr6HXo4avqFy9exLChN/PfV19hwYL5bLjRRhxx1LEcf+LJq7b5+utpDL1pMGPHjuGn5cvZdbfdOf+iS2ndpk01nFEtk904r1U3VWnn7Tbl1EO7M/6L6auV33DO79m/xzaccsVjbH/0IG64/xUGnnkwxx2ww6ptGjdswCv/C1x774ulHv+N0ZM4/qKH6HrkIP7vguF0bNeKJ27sk9r5SOVasngxm2+xJX+54GIaNmz4i/ohgwfx9ptvMPD6G/jnMyM59Q/9GDZ0CM89+8yq/fv3PRV35+6/Def+hx7jp59+YsBZf6SwsLCqT6fWqcpVN1UttUBvZkeVp6yuaNakIQ8M7M3pA59g/o+LV6vr3qUDjz0/mjfHTGLajHk89vxo3p8wlZ23W3VfBLc9/iY3DX+VUeO+KvH47s5tj7/J+xOmMu27ebw3fgo3PfgaO267Ces00B9utUGPPfbkrLPPpefv9sfsl/9rjv9oHAce0ouddu5O23btObjXoXTu0pUJ4z8CYNy4D/n222+4cuB1bNkpYstOEVddO4hPP5nAB/97r6pPp9ZRoK+Yi8pZVifcfslRjHh1PG+OmfSLulHjJnPg7tvSfoM4R1L3Lh3o0qktL737eYXba9msMcfu3433P57KsuUrKnwcqTm2/3U33nrjdb77bgYAH437kC/C5+zaY3cAli9fjpnRYJ11Vu2zzjrrUFBQwNixY6qlz7VJlgN9pQ/1zOwA4ECgnZndmlPVDKiTEafPod3ZrH0b+lz2aIn1f75pBLddfBQTn7ucn1asBODcG5/mhbc/zbuta848mH5H70aTRuvwv/FTOPycv61V36XmOP+iS7jmqis4cN+9qV+/flJ2KXvsuTcAXbpsT+PGjRk6ZDBnn3MeALcOHcLKlSuZPWtWtfW7tsgz102tksaIfjowGlhKfLtu0evfxLf+lsrM+prZaDMbvWLW+BS6VvW23HQ9rjrjQE6+7BFWrCx5nvSMY3rQvUsHjjj3b+x6ws2cf/MzXH92L/bdZau827vl4f/S/fibOaj/XawsLOT+gb3X9hSkhvj7Y48w/qOx3DLsDh75+z859y8XcstNg3nn7bcAaNmqFTcMGcqot9+iR/cd2GPXnfjxxx/ZauttMr10sLJoRJ8Hd/8I+MjMHnX3vEbw7n4PcA9Ao53O9cruW3X4TecOrNdyXT78+/mryurXr0ePX2/GaYfvQruel3F1/4PofeGDPP9WPIKfMGkGXTq1ZcDxe/FyntM3cxYsYs6CRUyaNosw5XsmjbyC3bbvyDvjJlfqeUnVWrp0KcOG3sLgIUPZc699AOgURXwRPufh4fezWzJ9s8uuPfj3Cy8zb9486terR9Nmzdh3rx60a79xdXa/VqiNAby80pi6edLdjwbGmtkvgrW7d6nsNmuyZ1//mB0++3q1snsuP5ZJ02YzePgrADT4VX1WFq7+pVpZ6BSs5Q9e0f4NdDG21luxYgUrVvxEQb3VR+YFBQUU+i//UmzZsiUA7//vPebOncOee+1dJf2szTIc51NZR3928u/BKRy71lmwcCkLFn63WtmiJcuZ98NiPv0yLn9zzCQGnnkQCxcvY9p389i92+b0PnBHLhn27Kp9NmjdlA1aN2XLTdYHYOuOG9KiaSO+/m4+835YzG86b8r2W7Vn1LjJLPhxCR3bt+aKfgcw5ds5jNJovlZYvHgRX0+bBoB7Id/NmEH4/DOaNW/ORhu1ZYcdd2LY0Jtp3LgxG23UjjGj32fks8+smo8HeGbEP+nQcTNatW7N+HHjuOmGa+l9wkl06LhZdZ1WrZHlEb2518wZkqxM3ZTkxbvO4NMvv1t1w9QGrZtydf+D6PmbiJbNGjPtu7kMf+Z/DH3k9VX7XPKH/bi07y8vcfzhqsd55LkP6NKpLYPPOZTtttiIJo0a8N3sH3jp3c8Z/MArfDszOzdMzX5nSHV3ITWjP/gffU856Rflh/Q6lKuuHcTs2bMYNvRm3nv3HX5YsICNNmrLoUccyQknnbIqSN16yxCefWYECxYsoG27thx51LH0PvHkTAcxgCYN1v4EowteLHfMCTfsV6u+oKkFejPrDgwjTrrfAKgHLHL3ZuXZP8uBXiouy4FeKq4yAv1WF5Y/0H8+qHYF+jQnb28DjiVOir8jcCLQKcX2REQqrEDLKyvG3ScB9dx9pbs/AOyfZnsiIhVlVv5XbZPmiH6xmTUAxpnZYGAGyq0jIjVUlq9jpBl4T0iO
fyawCNgYOCLF9kREKkwj+gpw96nJiL4D8DQQ3H15Wu2JiKyNLN89nFqgN7ODgLuAL4kzPXc0s9Pd/YW02hQRqajaOFIvrzTn6IcAeycXZDGzzYGRgAK9iNQ4WZ6jTzPQ/1gU5BNfAT+m2J6ISIVlOM6nkuum6Nlno83seeBJwIGjgA8quz0RkcqgEX1+Dsl5/z2wZ/J+FvDL56OJiNQAGY7zqaQp7gNgZq3cfW5unZl1rOz2REQqg+6MrZhnzWxVXhsz2xp4toztRUSqTZYfPJJmoL+OONiva2Y7AP8Ajk+xPRGRCtMNUxXg7iPN7FfAS0BT4DB3/yKt9kRE1kZaI/Uoiv4FdAQKgYXAWcDXwMPA5sByYCJweghhVrJPd+BuoBEwBTg+hDBzTXWlqfQRvZkNM7NbkweD7wM0ByYDZxZ7WLiISI2R4oj+pBBC1xDCr4GbgPuJVyIODiFEIYTOxDeWDgKIoqgAeAToH0LoBLxZnrqypDGiH13s85gU2hARqVT5XIyNoqgF0KKEqvkhhPm5BSGE3Cf/NAcKQwhzgddzyt8D/pi83wFYGkJ4O/l8F/HI/ZQ11JUqjVU3DwKYWRNgqbuvTD7XA9ap7PZERCpDnlM3A4ArSii/CriyeGEURX8DfkecDmb/YnUFxEH+30nRJsDUovoQwuwoigqiKGpVVl3yy6NEad4Z+yrQk3hOCuL5pJeAXVNsU0SkQvIM9EOB4SWUzy+hjBDCaQBRFJ0A3AgcmFM9jDhO3pZPB/KRZqBv6O5FQR53X2hmjVNsT0SkwvKJ88n0TIlBfQ37PRxF0T1RFLUOIcyJougmYEvgkBBCYbLZNGDTon2iKGpDMt0TRVGpdWW1m+byykVm1q3oQ7LEckmK7YmIVFga6+ijKFo3iqKNcz4fAswF5kZRdB3xnPuhIYRlObuNARpFUdQj+dyP+JGsa6orVZoj+gHAU2Y2nXheakPgmBTbExGpsJRWVzYBnoqiqAmwkjjIHwJsA1wEfAGMiqIIYHII4bAQQmEyxXN3FEUNSZZQApRVV5Y019F/YGZbAVFSFNz9p7TaExFZG2mkQAghfA90L6W61AZDCKOAzvnWlSaN7JX7uPtrOVksi3QyM9z96cpuU0RkbRXUxlteyymNEf2ewGusnsWyiBM/VlBEpEbJcJxPZR39Fcm/fSr72CIiaamNycrKK42pm3PLqnf3myu7TRGRtZXhLMWpTN00TeGYIiKpynI++jSmbq6q7GOKiKTNSl8EU+ulkb3yRjM7vYTy081sjVnWRESqQ4GV/1XbpHFn7D7APSWU3wscnEJ7IiJrLctPmEpjjn4dd/fihe5eaLXxKyQidUKWo1MaI/olZrZl8cKkTLluRKRGKjAr96u2SWNEfznwgpldw88PHdmROK/DgBTaExFZa1p1kwd3f8HMDgX+QvxsRIAJwBHu/nFltyciUhlq4UC93FJJaubuE4CT0ji2iEgaauOUTHmlmaZYRKTWyG6YV6AXEQHqeK4bM+tO/BDcTZPtDXB375Ry30REqkyGr8WWa0T/AHA+8QqaleU9sJm1J37obQ/i9MRvAWe7+zcV6KeISKrq+qqbH9z92Qoc+wHgMeCo5PPxSdm+FTiWiEiq6uTUjZl1Sd6+ZmbXEz8wZNUDbN19/BqOvZ67P5DzebiZaR29iNRIGR7Qlzmiv73Y5x457x3YYw3HnmNmxwOPJ5+PA+bk1z0RkapRJ0f07r47gJlt6u5Tc+vMbNNyHPsU4jn6W4h/MYwC9NQpEamRshvmyzdHPwLoVo6y1SS/HHpVsF8iIlWqXobnbsqao+8EbA00N7PcgN0MaFjGfpeX0Z67+8C8eykikrI6OXUDbAscDrTg55UzAD8Cv3iwSI5FJZQ1AU4FWgMK9CJS42Q4zpc5Rz8CGGFmPdz97fIe0N2HFL03s6bA2cRz838HhpS2n4hIdarruW5OMrMTixe6e9/SdjCzVsC5QG/gQaCbu8+rcC9FRFKW4ThfrkD/Ss77hsBhwNelbWxmNxJP+dwDdHb3hRXp2Lx3b67IbpJxLXc6s7q7IDXQkrG3rfUx6uocPQDu/kTuZzN7GChrKufPxDdWXQpckvPFK8qR06xiXRURSU+9FAJ9FEWtgYeBzYHlwETg9BDCrJxt7iee3m4aQliYlB0C3Egco8cAfUIIi9dUV5qKPEqwI7BBaZXuXuDujdy9qbs3y3k1VZAXkZqqwMr/yoMDg0MIUQihM/AlMKioMgnaqz1jO4qidYF7gUNCCFsQL4A5b011ZZ7bmjYws3lmNjd5zQdeJn4soIhIZqQR6EMIc0MIr+cUvUecCbhotH8F8fXMXAcAo0MIE5PPdwHHlKOuVGVO3Vg879IV+DYpKnR3L2MXEZFaKZ85+iiKWhAvPS9ufghhfin7FAB/BP6dFN0OXBFCWBBFUe6mmwC52QimARuXo65UZY7ok6D+vLuvTF4K8iKSSXmO6AcAk0t4lZW4cRiwELgtiqKjgeUhhJGpnlSiPKtuxpnZr919bOq9ERGpJnleix0KDC+hvLTR/E3AlsRz64VRFO0F7BNF0ZSczT6JougA4lH63jnlm/DzSsey6kpVVgqE+u6+Avg18IGZfUl812vR6pkyc92IiNQm9fOI9Mn0TIlBvbgoiq4DdgAOCiEsS/Y/AzgjZxsHtg0hLIyi6GviUf+WyVx8P+DJZNP/lFFX+rmVUfc+ceIyJSYTkcxLYxl9FEXbEi9e+QIYlczFTw4hHFbaPiGEH6Mo6gs8F0VRPWAscYaBMuvKUlagNwB3/7J8pyQiUnulkQIhhPAJ5ciAHEKwYp+fAZ4pZdtS60pTVqBfz8yKL/tZxd1166qIZEaGb4wtM9DXA9Yl2/n4RUSAuvsowRnufnWV9UREpBrVyQePoJG8iNQhGY7zZQb631ZZL0REqplleGxb1oNH5lZlR0REqlNdHdGLiNQZCvQiIhlXpx88IiJSF9SryNM5agkFehER9HBwEZHM0xy9iEjGZXhAr0AvIgJQUBfX0YuI1CUa0YuIZFz9DE/SK9CLiKARvYhI5ml5pYhIxmU4zivQi4gAZPjGWAV6ERHQ1I2ISOYp0IuIZFx2w7wCvYgIoIuxIiKZp3z0IiIZp1U3IiIZp4uxIiIZp6kbEZGM09SNiEjGpTGij6LoJuAIoAPQOYQwISlvCNwC9ASWAu+GEPomdZ2AB4HWwBzgxBDCxDXVlSXLv8RERMrN8njl4V/AHsDUYuWDiQN8pxBCZ+CynLq7gNtDCJ2A24G7y1lXKo3oRUSAenmM6KMoagG0KKFqfghhftGHEMLbyfa5+64LnAi0DyF4st33Sd36QDdg32Tzx4Hboihaj/h3TIl1IYRZZfVXI3oREeIbpsr7AgYAk0t4DShHU5sTT7tcEUXR6CiKXo+iqEdStzHwbQhhJUDy7/SkvKy6MmlELyICWH6TMkOB4SWUzy+hrLh6wGbA2BDCX6Io+g3wbBRFW+TTgXwo0IuIkF8KhGR6pjxBvSTTgBXEUy+EEP4XRdFsoFNS1y6KonohhJVRFNUD2gJ
fE0/dlFZXJk3diIgABVi5X2sjhDAb+C/JXHuykmZ9YFIIYSYwDjgu2fw44pH/rLLq1tSmRvQiIqST1CyKoluBw4ENgVeiKJoTQtgW6AfcH0XREOAn4ISci7j9gAejKLocmEd84ZZy1JXK3L1STqiyLV1BzeyYVKuWO51Z3V2QGmjJ2NvWOky//NnscsecfbduU6tuo9WIXkQEKKhVoTs/CvQiIuS96qZW0cXYajBm9Af8qX8/eu69O123jXhmxNOr1bs7d94+jJ579WDnbl049eQTmDTp57ucv/32G6647GIO3O+37NytCwfu91v+essQli5dWtWnIpXkvFN+x5Kxt3HLBUetKmvSqAE3X3AUk/4zkLnv3sxHIy7jrN57r7Zfx/ZteGLIH5j22vV8/9aNPHLDKazfqukvjt9zl615/cE/M2fUzcx4czAv3H1W6udU2+S5jr5WUaCvBosXL2aLLTtxwYWX0LBhw1/UP3DfvTw0/H4uvPgyHn3iH7Rq1Yp+p/Vh0aKFAEz56isKVxZyyWVX8vQzI7nw4st49t//YvD111b1qUgl2LlzB049fFfGf/HNauU3/PkI9u+xLadc+hDbH34NN9z3IgP/1IvjDtoJgMYNG/DcHf0xgwP6DmOfPrfQ4Ff1+OdfT18tb8she3XhoUF9eOy59+l+3CD2OmkID/7r3So9x9rA8vivtlGgrwa777EnfxpwLvvutz9mq38L3J1HH36IU07rS8/f7ceWW3Zi4HU3sGjRIp4f+RwAu+2+BwOvG8RuPXan/cYbs8eee/GHvv145eWXquN0ZC00W7chD1x7Eqdf+Sjzf1iyWl33rh15bOT7vDl6ItNmzOWx597n/Y+nsPN2HQDYZfvN6NCuNX2veIRPJk3nk0nTOe3yh+m2zSbstXMnAAoKjCHnH8klQ//FPU+9xcSpMwmTv+fvL4yu6lOt8Qqs/K/aRoG+hvn2m2+YPXsWu+y626qyhg0bssOOO/HR2LGl7rdw4SKaNWtWFV2USnT7pccx4pVxvDn6lwkIR437igP36Ez7DeKUKt27dqRLp/a8NOozANZpUB93WLp8xap9li5bQWGhs+v2mwPQbetN2HijVixfsZJRj13A5Jev49k7+tM1al8FZ1e7FJiV+1XbpBrozezh8pTJz2bPju99aN26zWrlrVq3Zvbs2SXuM336tzw0/D6OPvb/Uu+fVJ4+h+3KZhuvx5V3PFti/Z9veIqPv/iWif+5hh/e/ysv3TuAS299hhfemgDA+x9PYeHiZVw/4FAaN2xA44YNGHTuYdSvX48N28S/9Du2j3+OLv/jQdx4/4sc/qc7+fb7+bx479mrtpFYStkra4S0R/Tb5n4ws3rADqVtbGZ9zWy0mY2+7957Uu5aNsyZPZszTj+N7rvsxgknnVzd3ZFy2nLT9bnqrEM4+eLhrFhRWOI2Zxy3J927duSIs+9i1943cP6Qf3L9OYex765bAzB73kJ6n38fv9ttG2a9cxPfv3UjzddtxIefTqMwuT+maPR5w99eZMQr4xj72df0v+Zxfli4hN4H71w1J1tLZHlEn8rySjO7CLgYaGRmPxQVA8uBUiO4u99TVF9Xb5hq02Y9AObMmc1GbduuKp87Zw5t2qw+yp89axZ/OOUktthiS64dNDjTj0LLmt906ch6LZvy4T8uWVVWv349enTbnNOO7EG7vS/g6rN60fv8+3j+zXgEP2HidLpE7Rlw4m95OZm+efW9z9m211W0btGEFSsKWbBwCZNfvo4pL44BYMbsBQB8/tWMVe2sXFnIpGmzaL9hq6o63Vohy//3pBLo3f164Hozu97dL0qjjaxq1749bdqsx3vvjmK7zl0AWLZsGR+OGc05552/aoHRa2UAAA2eSURBVLtZs2ZyWp8T2XzzLRl0483Ur69bImqTZ/87nh2OXH2V1D1XHc+kabMYfN+LADT4VX1Wrlx9vLNyZWGJI8o58xcBsOdOnVi/1bo898bHAIz97GuWLvuJLTtswKhxXwHxk5Q227gNr7z7WaWfV62W4Uif1oh+K3f/HHjKzLoVr3f3D9Not7ZYvGgR06ZNA8C9kBkzpvP5Z5/RvHlzNmrblt4nnMh9995Nh46bsWmHDtx79500btyYAw86GICZM7/n1JNPZP311+f8Cy9m/rx5q47dslUr6tWrVy3nJeW3YOESFixcfZXNoiXLmbdgEZ9+GY++3xw9kYF/6sXCxcuYNmMuu++wBb0P3plL/vrMqn1O6NWdL6Z8z8y5P/KbLh256S9HMuzR/zJx6kwAfly0lL/9420u63cg334/n6nT5/DHY/ekRdPGPDby/ao74VqgNk7JlFdaw8Bzgb7AkBLqHNgnpXZrhU8+mcBpfX7ORXTn7cO48/Zh9Pr9YQy8bhB9Tv0Dy5Yt4/prruaHHxbQuUtX7rz3fpo0WReAd995h2lTpzBt6hT267nXasd+/qVXaddOKyqy4MQL7+fqs37P8OtOomWzxkybMZer7xjJnX9/Y9U2nTqsz9Vn9aJV88ZMnT6Xwfe9yK2PvLbacS4aOoLlP63g3qtPoHHDXzHu82/Yv+9f+W72D8WbrNOyG+aV1ExqGSU1k5JURlKzDyYvKHfM2alj81r1eyH1iV0z2w7YBlh1C6i7P5R2uyIi+aiNd7yWV6qB3syuAPYiDvTPAwcAbwMK9CJSo2R4ij71dfRHAr8FvnP3PkBXoHnKbYqI5C3LN0ylPXWzxN0LzWyFmTUDZlKOJ5aLiFS1LN+HknagH21mLYB7gTHAQkBp80SkxslwnE830Lv7Gcnbu8zsP0Azdx+fZpsiIhWR4ThfJatuDgd6EK+ffxtQoBeRmifDkT7tVTd3AFsAjydFp5tZT3fvn2a7IiL50vLKitsH2NqTu7LM7EHgk5TbFBHJW5bn6NNeXjkJ2CTn88ZJmYhIjZLlZ8amldTsWeI5+abAZ2b2fvL5N4AyKYlIjaOpm/zdlNJxRURSURtH6uWVVj76NwDM7AB3fyG3zsz6AW+UuKOISDXJcJxPfY7+MjNblZLYzM4Hfp9ymyIi+ctwDoS0V930Ap4zs78A+wNboUAvIjVQWg8eiaLoYGAgP/+auCqE8HQURZ2AB4HWwBzgxBDCxGSfUusqItURvbvPJg72twNtgSPdfXmabYqIVEQaA/ooigx4GDghhLA9cALwYBRFBcBdwO0hhE7EMfLunF3LqstbWqtufiReZWPJvw2AzYAjzczdvVka7YqIVFgeETyKohZAixKq5ocQ5hcrK+TnrL0tgBlAG6AbsG9S/jhwWxRF6yU9KbEuhDCr/L38WSojendv6u7Ncv5t6O7rFn1Oo00RkbVhefwHDAAml/AakHvMEIIDRwPPRFE0FfgXcCLxPUXfhhBWJtutBKYn5WXVVUiqUzdmdpiZNc/53MLMDk2zTRGRisjzhqmhQMcSXkNzjxlFUX3gIuD3IYRNgUOAJ4F1q+7M0r8Ye4W7jyj64O7zk6dO/SvldkVE8pLP3HsyPVN8iqYk2wNtQwjvJPu9E0XRImAp0C6KonohhJVRFN
Ujvo75ddKV0uoqJO3llSUdP/WMmSIi+TKzcr/y8A3QPoqiCCCKoq2BDYCJwDjguGS744CxIYRZIYSZpdVV9NzSDvSjzexmM9s8ed1M/AASEZEaJY1cNyGE74A/Av+Iougj4O/AKSGEuUA/4Kwoir4Azko+FymrLv9zSxJLpsLMmgCXAT2TopeBa9x90Zr2XbqC9DomtVbLnc6s7i5IDbRk7G1rvQh+yuyl5Y45Hdo0rFW3TaX9hKlFwIVptiEiUilqVejOT1rr6Ie6+4CcLJarcfdeabQrIlJRyl6Zv4eTf5XFUkRqBWWvzJO7j0n+VZZKEakVChTo82NmH1PClE0Rd++SRrsiIhWX3Uif1tTN4cRrRYsv8N8Y+C6lNkVEKizLUzdpraO/BVjg7lNzX8CCpE5EpEbJcDr61Eb0G7j7x8UL3f1jM+uQUpsiIhWW5RF9WoG+pPSdRRql1KaISIXlmdqgVklr6ma0mf2heKGZnYZSIIhIDaSpm/wNAEaYWW9+Duw7Ej+A5LCU2hQRqbAMD+hTW0f/PbCrme0NbJcUj3T319JoT0RkbenO2Apy9/8C/02zDRGRSpHdOK/c8CIikOk4r0AvIgJQkOFJegV6ERGyfTE27SdMiYhINdOIXkSEbI/oFehFRNDyShGRzNOIXkQk4xToRUQyTlM3IiIZpxG9iEjGZTjOK9CLiACZjvQK9CIiZDsFgrl7dfdB1sDM+rr7PdXdD6lZ9HMh5aUUCLVD3+rugNRI+rmQclGgFxHJOAV6EZGMU6CvHTQPKyXRz4WUiy7GiohknEb0IiIZp0AvIpJxCvR5MjM3syE5n88zsyvzPMYBZjbazD41s7FFxzOzK83svOT91WbWs4xjnGxmt5VQPtzMjsyjL23N7B/J++3N7MB8zkXKz8w2NLO/m9mXZjbGzJ43s05mNiGp39HMbl3DMRaWULaXmT2XZ1/+ZmbbJO8vzmdfqX0U6PO3DDjczNpUZGcz2w64DTje3bcBdgQmFd/O3S9391fWqqfl4O7T3b3oF8P2gAJ9CszMgBHA6+6+ubvvAFwEbFC0jbuPdvc/VUV/3P00d/80+ahAn3EK9PlbQbza4ZziFWbWwcxeM7PxZvaqmW1Swv7nA9e6++cA7r7S3e8s4VirRuZmtpOZjTKzj8zsfTNrWmzbg8zs3ZxfPj2Tvxi+MLODc/r2lpl9mLx2zSmfYGYNgKuBY8xsnJkdU+GvkJRkb+And7+rqMDdPwK+LvqcOzI3s3XN7AEz+zj5eToi92Bm1ib5nh+UFDUzs5FmFszsLjMrSLa7M/lZ+MTMrsrZ//XkL4hBQKPke/5oeqcv1UmBvmJuB3qbWfNi5cOAB929C/AoUNKf4dsBY8rbUBKAnwDOdveuQE9gSU79YcCFwIHuPjsp7gDsDBwE3GVmDYGZwL7u3g04pnjf3H05cDnwhLtv7+5PlLePUi55fd+By4AF7t45+Xl6rajCzDYARgKXu/vIpHhn4CxgG2Bz4PCk/BJ33xHoAuxpZl1yG3H3C4Elyfe8dwXOS2oBBfoKcPcfgIeA4n9m7wI8lrx/GOhRCc1FwAx3/6CobXdfkdTtA1wAHOTu83L2edLdC919IvAVsBXwK+BeM/sYeIo4IEjN1ZN4QAFAzvf3V8CrwPnu/nLO9u+7+1fuvhJ4nJ9/9o42sw+BscC26PteJynQV9xQ4FSgSZ77fQLsUEl9+BJoCnQqVl785ggnnmr6HuhKfF2gQSX1Qcqnsr7vK4j/MtivWPkvvudm1hE4D/ht8lfBSKBhJfRBahkF+gpy97nAk8TBvsgo4NjkfW/grRJ2vRG42Mw6AZhZgZn1K6OpAGxkZjsl2zc1s6L00lOBI4CHzGzbnH2OSo67ObBZcozmxH8ZFAInAPVKaOtH4l8cUvleA9Yxs1WJyJJplI1L2f5loH/Oti2Ttw6cAmxlZhfkbL+zmXVM5uaPAd4GmgGLgAXJdM8BpbT1k5n9qgLnJLWEAv3aGQLkrr45C+hjZuOJg+nZxXdw9/HAAOBxM/sMmEAcjEuUzJ0fAwwzs4+IA0DDnPrPiX+pPJUEdoBpwPvAC0A/d18K3AGclBxjK+IAUNx/gW10MbbyeXwL+mHEF8q/NLNPgOuB70rZ5RqgZXKh/CPii7lFx1oJHAfsY2ZnJMUfEK/m+gyYDIxILvaOBT4nnlJ8p5S27gHG62JsdikFgohIxmlELyKScQr0IiIZp0AvIpJxCvQiIhmnQC8iknEK9FKpzGxlsjxzgpk9ZWaN1+JYublfepnZhWVs2yJnqeFqWTlF6joFeqlsRXlTtgOWA6vdDGaxvH/u3P3f7j6ojE1aAGfkbJ+blVOkTlOglzS9BWyRZMgMZvYQ8Q1iG5vZ75Lsix8mI/91AcxsfzP7PMnPUpSYa7X8+2a2gZmNSLJ5fpRk4hwEbJ78NXFjUVbOZPuGOZkgx5rZ3jnHfNrM/mNmE81scNV+eUSqhgK9pCJJ03AA8HFStCVwh7tvS3xX7qVAzySb5mjg3CTL5r3AIcR5YTYs5fC3Am8k2Ty7EeeRuRD4Mvlr4i/Ftu9PfHNqZ+I7Sh9M2oI4B/8xQGfiFM2lpSQQqbUU6KWyNTKzccTBexpwX1I+1d3fS953J86i+E6y7UnApsSpGSa7+8QkZcAjpbSxD3AnrMrnv2ANfepRdKwkZcRUfk4E96q7L0jSRHya9EMkU+qveRORvCxx9+1zC8wMVs+tY8DL7n5cse1W26+KLMt5vxL9PyEZpBG9VIf3gN3MbAsAM2uSZPP8HOiQk5ztuFL2fxX4Y7JvveQBMGVl3nyLOPEbSTubEGf0FKkTFOilyrn7LOBk4gye44F3ga2S6ZO+wMjkYuzMUg5xNrB38hCVMcA27j6HeCpogpndWGz7O4CCZPsngJPdfRkidYSyV4qIZJxG9CIiGadALyKScQr0IiIZp0AvIpJxCvQiIhmnQC8iknEK9CIiGff/6izg14albO0AAAAASUVORK5CYII=\n", 825 | "text/plain": [ 826 | "
" 827 | ] 828 | }, 829 | "metadata": { 830 | "needs_background": "light" 831 | }, 832 | "output_type": "display_data" 833 | } 834 | ], 835 | "source": [ 836 | "y_pred_prob = 0\n", 837 | "for model_name, model in model_dict.items():\n", 838 | " y_pred_prob += (model.predict_proba(test_features)[:,1] * model_weights[model_name])\n", 839 | "\n", 840 | "y_pred_prob += (simple_nn.predict(test_features.todense()).ravel() * model_weights['simple_nn'])\n", 841 | "y_pred_prob /= sum(model_weights.values())\n", 842 | "print_model_metrics(y_test, y_pred_prob, confusion = True)" 843 | ] 844 | }, 845 | { 846 | "cell_type": "code", 847 | "execution_count": 11, 848 | "metadata": {}, 849 | "outputs": [], 850 | "source": [ 851 | "precision, recall, threshold = precision_recall_curve(y_test, y_pred_prob, pos_label = 1)\n", 852 | " \n", 853 | "#Find the threshold value that gives the best F1 Score\n", 854 | "best_f1_index =np.argmax([calc_f1(p_r) for p_r in zip(precision, recall)])\n", 855 | "best_threshold, best_precision, best_recall = threshold[best_f1_index], precision[best_f1_index], recall[best_f1_index]\n", 856 | "\n", 857 | "# Calulcate predictions based on the threshold value\n", 858 | "y_test_pred = np.where(y_test_prob > best_threshold, 1, 0)" 859 | ] 860 | }, 861 | { 862 | "cell_type": "code", 863 | "execution_count": 12, 864 | "metadata": {}, 865 | "outputs": [], 866 | "source": [ 867 | "misclassified_idx = y_test != y_test_pred\n", 868 | "high_confidence_indices = np.argsort(y_test_prob[misclassified_idx])[-10:]" 869 | ] 870 | }, 871 | { 872 | "cell_type": "code", 873 | "execution_count": 13, 874 | "metadata": {}, 875 | "outputs": [ 876 | { 877 | "name": "stdout", 878 | "output_type": "stream", 879 | "text": [ 880 | "Title : Those Who Lost Savings Find Little Comfort\n", 881 | "Label : not-clickbait\n", 882 | "Predicted Probability : 0.8737945792317662\n", 883 | "----------\n", 884 | "Title : Smartphone From Dell? 
Just Maybe\n", 885 | "Label : not-clickbait\n", 886 | "Predicted Probability : 0.8881080427675058\n", 887 | "----------\n", 888 | "Title : Male models win The Amazing Race\n", 889 | "Label : not-clickbait\n", 890 | "Predicted Probability : 0.8909699961891452\n", 891 | "----------\n", 892 | "Title : Ainge Has Heart Attack After Celtics Say Garnett May Miss Playoffs\n", 893 | "Label : not-clickbait\n", 894 | "Predicted Probability : 0.8916277123562639\n", 895 | "----------\n", 896 | "Title : If Vick Is Sincere, Give Him a Chance to Prove It\n", 897 | "Label : not-clickbait\n", 898 | "Predicted Probability : 0.894014300180017\n", 899 | "----------\n", 900 | "Title : Cellphone Abilities That Go Untapped\n", 901 | "Label : not-clickbait\n", 902 | "Predicted Probability : 0.8969317223056715\n", 903 | "----------\n", 904 | "Title : A Peaking Tiger Woods\n", 905 | "Label : not-clickbait\n", 906 | "Predicted Probability : 0.9113495803235656\n", 907 | "----------\n", 908 | "Title : A Little Rugby With Your Cross-Dressing?\n", 909 | "Label : not-clickbait\n", 910 | "Predicted Probability : 0.9120835562001224\n", 911 | "----------\n", 912 | "Title : Woods Returns as He Left: A Winner\n", 913 | "Label : not-clickbait\n", 914 | "Predicted Probability : 0.930506343172195\n", 915 | "----------\n", 916 | "Title : Darwinism Must Die So That Evolution May Live\n", 917 | "Label : not-clickbait\n", 918 | "Predicted Probability : 0.9334297085656929\n", 919 | "----------\n" 920 | ] 921 | } 922 | ], 923 | "source": [ 924 | "for idx in high_confidence_indices:\n", 925 | " print('Title : {}'.format(test[misclassified_idx].title.values[idx]))\n", 926 | " print('Label : {}'.format(test[misclassified_idx].label.values[idx]))\n", 927 | " print('Predicted Probability : {}'.format(y_test_prob[misclassified_idx][idx]))\n", 928 | " print('-' * 10)" 929 | ] 930 | }, 931 | { 932 | "cell_type": "code", 933 | "execution_count": 14, 934 | "metadata": {}, 935 | "outputs": [ 936 | { 937 | "data": { 938 | "text/plain": [ 939 | "array(['Insurgents Are Said to Capture Somali Town',\n", 940 | " 'Abducted teen in Florida found',\n", 941 | " 'As Iraq Stabilizes, China Eyes Its Oil Fields',\n", 942 | " 'Paramilitary group calls for end to rioting in Northern Ireland',\n", 943 | " 'Finding Your Way Through a Maze of Smartphones',\n", 944 | " 'Thousands demand climate change action',\n", 945 | " 'Paternity Makes Punch Line of Paraguay President',\n", 946 | " 'Comcast and NFL Network Continue to Haggle',\n", 947 | " 'Constant Fear and Mob Rule in South Africa Slum',\n", 948 | " 'Sebastian Vettel wins 2010 Japanese Grand Prix'], dtype=object)" 949 | ] 950 | }, 951 | "execution_count": 14, 952 | "metadata": {}, 953 | "output_type": "execute_result" 954 | } 955 | ], 956 | "source": [ 957 | "test[test.label.values == 'not-clickbait'].sample(10).title.values" 958 | ] 959 | }, 960 | { 961 | "cell_type": "code", 962 | "execution_count": null, 963 | "metadata": {}, 964 | "outputs": [], 965 | "source": [] 966 | } 967 | ], 968 | "metadata": { 969 | "kernelspec": { 970 | "display_name": "Python 3", 971 | "language": "python", 972 | "name": "python3" 973 | }, 974 | "language_info": { 975 | "codemirror_mode": { 976 | "name": "ipython", 977 | "version": 3 978 | }, 979 | "file_extension": ".py", 980 | "mimetype": "text/x-python", 981 | "name": "python", 982 | "nbconvert_exporter": "python", 983 | "pygments_lexer": "ipython3", 984 | "version": "3.7.3" 985 | } 986 | }, 987 | "nbformat": 4, 988 | "nbformat_minor": 4 989 | } 990 | 
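The stacking-classifier markdown above notes that Sklearn's `VotingClassifier` was skipped because one of the voters (the Keras `simple_nn`) is not an sklearn estimator. For reference, here is a minimal sketch (not part of the repository) of how the same weighted soft vote could be written with `VotingClassifier` when only the sklearn-compatible models are kept; the estimator settings and the rounded weights are taken from the tuned values in the notebook above, and `train_features`, `y_train`, `test_features`, `y_test` and `print_model_metrics` are assumed to be in scope as in the notebook.

# Sketch only: weighted soft voting via sklearn's VotingClassifier (simple_nn dropped).
# All data/helper names below are assumed from the notebook, not defined here.
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

estimators = [
    ('lr', SGDClassifier(loss='log', alpha=0.1, penalty='elasticnet')),
    ('svm', SVC(C=10, kernel='poly', degree=2, probability=True)),
    ('nb', MultinomialNB(alpha=10000, class_prior=[0.5, 0.5])),
    ('knn', KNeighborsClassifier(n_neighbors=7, weights='distance', n_jobs=-1)),
    ('rf', RandomForestClassifier(n_estimators=250, min_samples_split=5, max_depth=15, n_jobs=-1)),
    ('xgb', XGBClassifier(n_estimators=100, learning_rate=0.3, max_depth=1, n_jobs=-1)),
]

# Weights rounded from the hyperopt search in the notebook (order matches estimators).
weights = [0.80, 0.94, 0.91, 0.79, 0.16, 0.33]

voting_clf = VotingClassifier(estimators=estimators, voting='soft', weights=weights, n_jobs=-1)
voting_clf.fit(train_features, y_train)
y_pred_prob = voting_clf.predict_proba(test_features)[:, 1]
print_model_metrics(y_test, y_pred_prob)

With `voting='soft'`, `predict_proba` is the weight-normalised average of the base models' probabilities, which is what the hand-written loop in the notebook computes for the sklearn-only subset.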
-------------------------------------------------------------------------------- /notebooks/saved_models/saved_model.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anirudhshenoy/text-classification-small-datasets/0e1ceb90addd2c2ec8644de3ae8bb0b6e2ec04ab/notebooks/saved_models/saved_model.pb -------------------------------------------------------------------------------- /notebooks/saved_models/variables/variables.data-00000-of-00002: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anirudhshenoy/text-classification-small-datasets/0e1ceb90addd2c2ec8644de3ae8bb0b6e2ec04ab/notebooks/saved_models/variables/variables.data-00000-of-00002 -------------------------------------------------------------------------------- /notebooks/saved_models/variables/variables.data-00001-of-00002: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anirudhshenoy/text-classification-small-datasets/0e1ceb90addd2c2ec8644de3ae8bb0b6e2ec04ab/notebooks/saved_models/variables/variables.data-00001-of-00002 -------------------------------------------------------------------------------- /notebooks/saved_models/variables/variables.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anirudhshenoy/text-classification-small-datasets/0e1ceb90addd2c2ec8644de3ae8bb0b6e2ec04ab/notebooks/saved_models/variables/variables.index -------------------------------------------------------------------------------- /notebooks/utility.py: -------------------------------------------------------------------------------- 1 | # Adversarial Validation 2 | from scipy import sparse 3 | from sklearn.metrics import roc_auc_score 4 | from sklearn.ensemble import RandomForestClassifier 5 | from sklearn.linear_model import SGDClassifier 6 | from sklearn.model_selection import StratifiedKFold 7 | import matplotlib.pyplot as plt 8 | from sklearn.metrics import precision_recall_curve, f1_score, accuracy_score, roc_auc_score, confusion_matrix 9 | import seaborn as sns 10 | import numpy as np 11 | 12 | sns.set_palette("muted") 13 | 14 | 15 | def adversarial_validation(X, Y, n_splits = 10): 16 | 17 | # Combine both datasets 18 | sparse_merge = sparse.vstack((X, Y)) 19 | 20 | # Label the datasets 21 | y = np.array([0 for _ in range(X.shape[0])] + [1 for _ in range(Y.shape[0])]) 22 | 23 | # Do Stratified K-Fold CV (10 folds by default) 24 | kfold = StratifiedKFold(n_splits = n_splits, shuffle = True) 25 | 26 | lr_auc = np.array([]) 27 | rf_auc = np.array([]) 28 | for train_idx, test_idx in kfold.split(sparse_merge, y): 29 | 30 | # Run Log Reg 31 | x_train, y_train = sparse_merge[train_idx], y[train_idx] 32 | x_test, y_test = sparse_merge[test_idx], y[test_idx] 33 | 34 | log_reg = SGDClassifier(loss = 'log') 35 | log_reg.fit(x_train, y_train) 36 | y_test_prob = log_reg.predict_proba(x_test)[:,1] 37 | lr_auc = np.append(lr_auc, roc_auc_score(y_test, y_test_prob)) 38 | # Run RF 39 | rf = RandomForestClassifier(n_estimators = 100, n_jobs = -1) 40 | rf.fit(x_train, y_train) 41 | y_test_prob = rf.predict_proba(x_test)[:,1] 42 | rf_auc = np.append(rf_auc, roc_auc_score(y_test, y_test_prob)) 43 | 44 | 45 | # Display results 46 | print('Logistic Regression AUC : {:.3f}'.format(lr_auc.mean())) 47 | print('Random Forest AUC : {:.3f}'.format(rf_auc.mean())) 48 | 49 | 50 | 51 | 52 | 53 | def calc_f1(p_and_r): 54 | p, r = p_and_r 55 | 
return (2*p*r)/(p+r) 56 | 57 | 58 | # Print the F1, Precision, Recall, ROC-AUC, and Accuracy Metrics 59 | # Since we are optimizing for F1 score - we will first calculate precision and recall and 60 | # then find the probability threshold value that gives us the best F1 score 61 | 62 | def print_model_metrics(y_test, y_test_prob, confusion = False, verbose = True, return_metrics = False): 63 | 64 | precision, recall, threshold = precision_recall_curve(y_test, y_test_prob, pos_label = 1) 65 | 66 | # Find the threshold value that gives the best F1 Score 67 | best_f1_index = np.argmax([calc_f1(p_r) for p_r in zip(precision, recall)]) 68 | best_threshold, best_precision, best_recall = threshold[best_f1_index], precision[best_f1_index], recall[best_f1_index] 69 | 70 | # Calculate predictions based on the threshold value 71 | y_test_pred = np.where(y_test_prob > best_threshold, 1, 0) 72 | 73 | # Calculate all metrics 74 | f1 = f1_score(y_test, y_test_pred, pos_label = 1, average = 'binary') 75 | roc_auc = roc_auc_score(y_test, y_test_prob) 76 | acc = accuracy_score(y_test, y_test_pred) 77 | 78 | 79 | if confusion: 80 | # Calculate and Display the confusion Matrix 81 | cm = confusion_matrix(y_test, y_test_pred) 82 | 83 | plt.title('Confusion Matrix') 84 | sns.set(font_scale=1.0) # for label size 85 | sns.heatmap(cm, annot = True, fmt = 'd', xticklabels = ['No Clickbait', 'Clickbait'], yticklabels = ['No Clickbait', 'Clickbait'], annot_kws={"size": 14}, cmap = 'Blues') # annotation font size 86 | 87 | plt.xlabel('Prediction') 88 | plt.ylabel('Truth') 89 | 90 | if verbose: 91 | print('F1: {:.3f} | Pr: {:.3f} | Re: {:.3f} | AUC: {:.3f} | Accuracy: {:.3f} \n'.format(f1, best_precision, best_recall, roc_auc, acc)) 92 | 93 | if return_metrics: 94 | return np.array([f1, best_precision, best_recall, roc_auc, acc]) 95 | 96 | 97 | 98 | # Run log reg 10 times and average the results to reduce prediction variance 99 | def run_log_reg(train_features, test_features, y_train, y_test, alpha = 1e-4, confusion = False, return_f1 = False, verbose = True): 100 | metrics = np.zeros(5) 101 | for _ in range(10): 102 | log_reg = SGDClassifier(loss = 'log', alpha = alpha, n_jobs = -1, penalty = 'l2') 103 | log_reg.fit(train_features, y_train) 104 | y_test_prob = log_reg.predict_proba(test_features)[:,1] 105 | metrics += print_model_metrics(y_test, y_test_prob, confusion = confusion, verbose = False, return_metrics = True) 106 | metrics /= 10 107 | if verbose: 108 | print('F1: {:.3f} | Pr: {:.3f} | Re: {:.3f} | AUC: {:.3f} | Accuracy: {:.3f} \n'.format(*metrics)) 109 | if return_f1: 110 | return metrics[0] 111 | return log_reg --------------------------------------------------------------------------------
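utility.py itself carries no docstrings or usage notes, so here is a minimal usage sketch (not part of the repository) of its three helpers; the feature matrices, label arrays and probability vector are assumed to be the ones built in the notebooks.

# Hypothetical usage of notebooks/utility.py (train_features, test_features,
# y_train, y_test and y_test_prob are assumed to exist, as in the notebooks).
from utility import adversarial_validation, run_log_reg, print_model_metrics

# Adversarial validation: AUC close to 0.5 means train and test are indistinguishable.
adversarial_validation(train_features, test_features)

# Averaged logistic-regression baseline on a feature set.
run_log_reg(train_features, test_features, y_train, y_test, alpha=1e-4)

# Threshold-tuned metrics (and optional confusion matrix) for any probability vector.
print_model_metrics(y_test, y_test_prob, confusion=True)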