├── dictionary_final.txt
├── final_dictionary.txt
├── templates
│   ├── result.html
│   └── index.html
├── test.py
├── requirements.txt
├── add_dict.py
├── word_seg.py
├── left_words.txt
├── add_words_dictionary.py
├── extract_database.py
├── seperate_words.py
├── correction_count.py
├── Notes.txt
├── correction.py
├── processing_original_question.py
├── README.md
├── one_time.py
├── word2vec.py
├── matching.py
├── server.py
├── testing.py
├── common_words.txt
└── common_words2.txt

/dictionary_final.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sparsh-Bansal/QNA_project/HEAD/dictionary_final.txt
--------------------------------------------------------------------------------
/final_dictionary.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sparsh-Bansal/QNA_project/HEAD/final_dictionary.txt
--------------------------------------------------------------------------------
/templates/result.html:
--------------------------------------------------------------------------------
1 | 2 | 3 | 4 |

{{answer}}

5 | 6 | -------------------------------------------------------------------------------- /templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 |
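<!-- The original markup was stripped in this raw dump. A minimal form consistent with
     server.py (which reads the 'question' field from a POST to '/' and renders the answer
     via result.html) might look like the hypothetical sketch below; only the field name
     'question' is actually required by server.py. -->
<!--
<form action="/" method="POST">
    <input type="text" name="question" placeholder="Ask a question about colleges">
    <input type="submit" value="Ask">
</form>
-->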
8 | 9 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | data = pd.read_csv('D:/ML/QNA_project/CSV_files/final_words_keys2.csv') 4 | list2 = data['Final_filters'].to_list() 5 | 6 | file = open('D:/ML/QNA_project/left_words.txt','r') 7 | list1 = file.read().split('\n') 8 | file.close() 9 | 10 | w=[] 11 | i=0 12 | for item in list2: 13 | print(i) 14 | i=i+1 15 | if item in list1: 16 | continue 17 | else: 18 | w.append(item) 19 | 20 | print(len(w)) 21 | df = pd.DataFrame(w,columns=['Final_filters']) 22 | df.to_csv('D:/ML/QNA_project/CSV_files/final_words_keys3.csv') -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | boto==2.49.0 2 | boto3==1.9.243 3 | botocore==1.12.243 4 | certifi==2019.9.11 5 | chardet==3.0.4 6 | Click==7.0 7 | docutils==0.15.2 8 | Flask==1.1.1 9 | gensim==3.8.1 10 | idna==2.8 11 | itsdangerous==1.1.0 12 | Jinja2==2.10.3 13 | jmespath==0.9.4 14 | MarkupSafe==1.1.1 15 | nltk==3.4.5 16 | numpy==1.17.2 17 | pandas==0.25.1 18 | PyMySQL==0.9.3 19 | python-dateutil==2.8.0 20 | pytz==2019.2 21 | requests==2.22.0 22 | s3transfer==0.2.1 23 | scipy==1.3.1 24 | six==1.12.0 25 | smart-open==1.8.4 26 | symspellpy==6.5.0 27 | urllib3==1.25.6 28 | Werkzeug==0.16.0 29 | wincertstore==0.2 30 | -------------------------------------------------------------------------------- /add_dict.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from symspellpy.symspellpy import SymSpell # import the module 4 | 5 | 6 | def main(): 7 | # maximum edit distance per dictionary precalculation 8 | max_edit_distance_dictionary = 2 9 | prefix_length = 7 10 | # create object 11 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length) 12 | 13 | # create dictionary using corpus.txt 14 | if not sym_spell.create_dictionary('D:/ML/QNA_project/corpus.txt'): 15 | print("Corpus file not found") 16 | return 17 | 18 | for key, count in sym_spell.words.items(): 19 | print("{} {}".format(key, count)) 20 | 21 | 22 | if __name__ == "__main__": 23 | main() -------------------------------------------------------------------------------- /word_seg.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from symspellpy.symspellpy import SymSpell # import the module 4 | 5 | 6 | def main(): 7 | # maximum edit distance per dictionary precalculation 8 | max_edit_distance_dictionary = 0 9 | prefix_length = 7 10 | # create object 11 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length) 12 | # load dictionary 13 | dictionary_path = os.path.join(os.path.dirname(__file__),"dictionary_final.txt") 14 | term_index = 0 # column of the term in the dictionary text file 15 | count_index = 1 # column of the term frequency in the dictionary text file 16 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index): 17 | print("Dictionary file not found") 18 | return 19 | 20 | # a sentence without any spaces 21 | input_term = "bangalore" 22 | # input_term = "thequickbrownfoxjumpsoverthelazydog" 23 | # input_term = 'universitycollegesbangalore' 24 | result = sym_spell.word_segmentation(input_term) 25 | x = result.corrected_string.split(' ') 26 | # display suggestion term, term frequency, and edit distance 27 | print(x) 28 
| print("{}, {}, {}".format(result.corrected_string, result.distance_sum, 29 | result.log_prob_sum)) 30 | 31 | 32 | if __name__ == "__main__": 33 | main() -------------------------------------------------------------------------------- /left_words.txt: -------------------------------------------------------------------------------- 1 | gamble 2 | ambition 3 | personal 4 | beyond 5 | activity 6 | ultimate 7 | late 8 | emphasis 9 | chase 10 | transmitted 11 | brakes 12 | wits 13 | desire 14 | issue 15 | aah 16 | risks 17 | detail 18 | blogs 19 | capability 20 | body 21 | duty 22 | home 23 | demand 24 | flying 25 | change 26 | secure 27 | ear 28 | keeping 29 | look 30 | joy 31 | rate 32 | name 33 | posts 34 | idea 35 | young 36 | cash 37 | text 38 | safe 39 | bond 40 | god 41 | sleep 42 | including 43 | solid 44 | little 45 | sum 46 | specific 47 | glorious 48 | tips 49 | blank 50 | gold 51 | block 52 | technique 53 | smile 54 | support 55 | inspired 56 | duties 57 | abilities 58 | wings 59 | inwards 60 | learn 61 | manage 62 | task 63 | real 64 | word 65 | scope 66 | object 67 | supreme 68 | said 69 | mutual 70 | bad 71 | help 72 | differently 73 | key 74 | use 75 | making 76 | view 77 | bag 78 | time 79 | step 80 | sure 81 | total 82 | light 83 | wake 84 | improvement 85 | record 86 | special 87 | crack 88 | live 89 | great 90 | black 91 | share 92 | bed 93 | progress 94 | pass 95 | created 96 | means 97 | dream 98 | case 99 | thought 100 | aware 101 | ran 102 | aspire 103 | purpose 104 | perfect 105 | grant 106 | next 107 | dress 108 | check 109 | call 110 | carry 111 | edge 112 | soft 113 | pure 114 | smart 115 | built 116 | think 117 | ask 118 | genius 119 | door 120 | yes 121 | made 122 | self 123 | free 124 | usage 125 | lead 126 | hope 127 | complex 128 | path 129 | insist 130 | good 131 | go 132 | heavy 133 | stop 134 | gonna 135 | living 136 | peace 137 | important 138 | make 139 | action 140 | big 141 | full 142 | best 143 | get 144 | useful 145 | catch 146 | greater 147 | related 148 | nice 149 | classic 150 | happy 151 | work -------------------------------------------------------------------------------- /add_words_dictionary.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import math 3 | import collections 4 | 5 | data1 = pd.read_csv('D:/ML/QNA_project/CSV_files/final_words_keys.csv') # Keywords 6 | 7 | data2 = pd.read_csv('D:/ML/QNA_project/CSV_files/final_words_total2.csv') # Questions 8 | 9 | count = data2['Total_words'].value_counts() 10 | """file = open('D:/ML/QNA_project/dictionary_words.txt','w') """ 11 | 12 | for i in range(len(data1)): 13 | print(i) 14 | # if math.isnan(float('nan'))==math.isnan(float(data1['Final_filters'][i])): 15 | # if str(data1['Final_filters'][i])=='nan': 16 | if i==79: 17 | print('sparsh') 18 | # x = count(str(data1['Final_filters'][i])) 19 | # x=0 20 | continue 21 | else: 22 | try: 23 | x = count[data1['Final_filters'][i]] 24 | except: 25 | x=0 26 | 27 | x = x + 2022459848 28 | x = str(x) 29 | s = data1['Final_filters'][i] + " " + x 30 | file.write(s) 31 | file.write('\n') 32 | 33 | file.close() 34 | 35 | 36 | """file = open('D:/ML/QNA_project/dictionary_words.txt','r')""" 37 | data = file.read().split('\n') 38 | file.close() 39 | m = {} 40 | for i in range(len(data)-1): 41 | print("s {}".format(i)) 42 | 43 | w = data[i].split(' ') 44 | m[w[0]]=w[1] 45 | 46 | sorted_x = sorted(m.items(), key=lambda kv: int(kv[1])) 47 | print('hgfhg') 48 | sorted_dict = 
collections.OrderedDict(sorted_x) 49 | 50 | """file2 = open('D:/ML/QNA_project/dictionary_words.txt','w')""" 51 | 52 | for key , value in sorted_dict.items(): 53 | file2.write(key+" "+value) 54 | file2.write('\n') 55 | print('compelete') 56 | file2.close() 57 | 58 | """file1 = open('D:/ML/QNA_project/dictionary_words.txt','r')""" 59 | """file2 = open('D:/ML/QNA_project/final_dictionary.txt','w')""" 60 | data = file1.read().split('\n') 61 | file1.close() 62 | for i in range(len(data)-1,-1,-1): 63 | file2.write(data[i]) 64 | file2.write('\n') 65 | 66 | file2.close() -------------------------------------------------------------------------------- /extract_database.py: -------------------------------------------------------------------------------- 1 | import pymysql 2 | import pandas as pd 3 | 4 | db = pymysql.connect("localhost","root","12343249","sparsh" ) 5 | cursor = db.cursor() 6 | 7 | def ex_questions_answers(query,file_path): 8 | 9 | sql = query 10 | 11 | try: 12 | cursor.execute(sql) 13 | data = cursor.fetchall() 14 | db.commit() 15 | except: 16 | db.rollback() 17 | # 18 | # file_ques = open('D:/ML/QNA_project/text_files/questions.txt','w') 19 | # 20 | # for i in range(len(data)): 21 | # d = ('{}. '.format(i + 1) + data[i][0]).encode('utf-8') 22 | # file_ques.write(str(d)) 23 | # file_ques.write('\n') 24 | # 25 | # file_ques.close() 26 | # 27 | o_d = [data[i][0] for i in range(len(data))] 28 | df = pd.DataFrame(o_d,columns=['Answers']) 29 | df.to_csv(file_path) 30 | 31 | 32 | def extract_keywords_filter(query , file_path): 33 | sql = query 34 | try: 35 | 36 | cursor.execute(sql) 37 | data = cursor.fetchall() 38 | db.commit() 39 | except: 40 | db.rollback() 41 | 42 | o_d = [data[i] for i in range(len(data))] 43 | df = pd.DataFrame(o_d,columns=['Entity','Keywords']) 44 | df.to_csv(file_path) 45 | 46 | def ex_view_count(query,file_path): 47 | 48 | sql = query 49 | 50 | try: 51 | cursor.execute(sql) 52 | data = cursor.fetchall() 53 | db.commit() 54 | except: 55 | db.rollback() 56 | 57 | df = pd.DataFrame(data, columns=['ID', 'Answers', 'question_id', 'modified_on', 'upvote_count', 'comment_count']) 58 | df.to_csv(file_path) 59 | 60 | 61 | # ex_questions_answers('select text from sparsh.question_answers','D:/ML/QNA_project/CSV_files/answers.csv') 62 | # ex_questions_answers('select title from sparsh.questions','D:/ML/QNA_project/CSV_files/answers.csv') 63 | # extract_keywords_filter('select entity_type , keyword from sparsh.keywords ','D:/ML/QNA_project/CSV_files/keywords.csv') 64 | ex_view_count('select id ,text,question_id,modified_on , upvote_count , comment_count from sparsh.question_answers ' , 'D:/ML/QNA_project/CSV_files/answers.csv') 65 | 66 | db.close() -------------------------------------------------------------------------------- /seperate_words.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import re 3 | import time 4 | 5 | 6 | def original(): 7 | 8 | data = pd.read_csv("D:/ML/QNA_project/CSV_files/words_total.csv") 9 | 10 | # t1 = time.time() 11 | s = "" 12 | # s = data.head()['tokens'].sum() 13 | print(data.head()) 14 | for i in range(len(data.head())): 15 | print(i) 16 | s = s + data['tokens'][i]+" " 17 | s = s[:-1] 18 | 19 | print(s) 20 | # t2 = time.time() 21 | 22 | # print(t2-t1) 23 | 24 | s = re.sub('\[|\]|\,|\'', '', s) 25 | print(s) 26 | words = s.split(' ') 27 | print(words) 28 | df = pd.DataFrame(words,columns=['Total_words']) 29 | df.to_csv('D:/ML/QNA_project/CSV_files/final_words_total.csv') 30 | 31 
| 32 | words = list(set(words)) 33 | 34 | df2 = pd.DataFrame(words,columns=['Final_words']) 35 | df2.to_csv('D:/ML/QNA_project/CSV_files/final_words_total_rd.csv') 36 | 37 | 38 | 39 | def keywords_filters(path_to_data): 40 | # data = pd.read_csv("D:/ML/QNA_project/CSV_files/words_filters.csv") 41 | data = pd.read_csv(path_to_data) 42 | # s = data.head()['tokens'].sum() 43 | s = "" 44 | print(data.head()) 45 | for i in range(len(data)): 46 | print(i) 47 | s = s + data['tokens'][i]+" " 48 | s = s[:-1] 49 | # print(t2-t1) 50 | 51 | s = re.sub('\[|\]|\,|\'', '', s) 52 | words = s.split(' ') 53 | words = list(set(words)) 54 | 55 | s2 = "" 56 | for i in range(len(data)): 57 | print(i) 58 | s2 = s2+data['Entity'][i]+" " 59 | 60 | s2=s2[:-1] 61 | 62 | 63 | words2 = s2.split(' ') 64 | words2 = list(set(words2)) 65 | final = words +words2 66 | final = list(set(final)) 67 | return final 68 | 69 | def combine(): 70 | word_k = keywords_filters('D:/ML/QNA_project/CSV_files/words_keywords.csv') 71 | word_f = keywords_filters('D:/ML/QNA_project/CSV_files/words_filters.csv') 72 | print(word_k) 73 | print(word_f) 74 | total = word_f + word_k 75 | total = list(set(total)) 76 | df = pd.DataFrame(total,columns=['Final_filters']) 77 | df.to_csv('D:/ML/QNA_project/CSV_files/final_words_keys2.csv') 78 | 79 | 80 | # original() 81 | combine() -------------------------------------------------------------------------------- /correction_count.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from symspellpy.symspellpy import SymSpell, Verbosity # import the module 4 | 5 | def main(): 6 | # maximum edit distance per dictionary precalculation 7 | max_edit_distance_dictionary = 2 8 | prefix_length = 9 9 | # data = pd.read_csv('D:/ML/QNA_project/CSV_files/final_words_total_rd2.csv') 10 | 11 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length) 12 | 13 | dictionary_path = os.path.join(os.path.dirname(__file__),"dictionary_final.txt") 14 | term_index = 0 # column of the term in the dictionary text file 15 | count_index = 1 # 16 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index): 17 | print("Dictionary file not found") 18 | return 19 | # lookup suggestions for single-word input strings 20 | 21 | # input_term = "agricultr" # misspelling of "members" 22 | # max edit distance per lookup 23 | # (max_edit_distance_lookup <= max_edit_distance_dictionary) 24 | max_edit_distance_lookup = 2 25 | 26 | suggestion_verbosity = Verbosity.CLOSEST # TOP, CLOSEST, ALL 27 | s = "" 28 | # print('original') 29 | # print(len(words)) 30 | # for i in range(len(data)): 31 | # # print(i) 32 | # if i==0 or i==51124 or i==65070: 33 | # continue 34 | # input_term = data['Final_words'][i] 35 | # suggestions = sym_spell.lookup(input_term, suggestion_verbosity, 36 | # max_edit_distance_lookup) 37 | # print(i) 38 | # try: 39 | # s = s + str(suggestions[0].term)+" " 40 | # except: 41 | # s = s+ input_term 42 | # 43 | # s = s[:-1] 44 | # words = s.split(' ') 45 | # # print(len(words)) 46 | # print('After') 47 | # print(len(words)) 48 | # for suggestion in suggestions: 49 | # print("{}, {}, {}".format(suggestion.term, suggestion.distance, 50 | # suggestion.count)) 51 | 52 | # input_term = ("whereis th elove hehad dated forImuch of thepast who " 53 | # "couqdn'tread in sixtgrade and ins pired him") 54 | input_term = 'live' 55 | # max_edit_distance_lookup = 2 56 | suggestions = sym_spell.lookup_compound(input_term,max_edit_distance_lookup) 57 | 
for suggestion in suggestions: 58 | print("{}, {}, {}".format(suggestion.term, suggestion.distance, 59 | suggestion.count)) 60 | if __name__ == "__main__": 61 | main()
--------------------------------------------------------------------------------
/Notes.txt:
--------------------------------------------------------------------------------
1 | Embeddings
2 | word2vec and GloVe
3 | Tokenizer
4 | Lemmatizer
5 | NLTK
6 | Stopwords
7 | Spell Correction
8 | RASA
9 | TAG Extraction
10 | Finding Similar Sentences
11 | Intent and Entity Recognition
12 | Levenshtein distance
13 |
14 |
15 |
16 | Norvig's algorithm ........from collections import Counter https://towardsdatascience.com/correcting-your-spelling-error-with-4-operations-50bcfd519bb8
17 | Symspell https://towardsdatascience.com/essential-text-correction-process-for-nlp-tasks-f731a025fcc3
18 |
19 | N-gram Analysis
20 | Dictionary Lookup https://pdfs.semanticscholar.org/c64f/1bd3a1bd7f7fe4cadc469b4b94c45ad12b5d.pdf
21 |
22 | A simple spell checker ................https://blog.usejournal.com/a-simple-spell-checker-built-from-word-vectors-9f28452b6f26
23 |
24 | python pix2pix.py --mode train --output_dir facades_train --max_epochs 200 --input_dir facades/train --which_direction BtoA
25 |
26 | python pix2pix.py --mode test --output_dir facades_test --input_dir facades/val --checkpoint facades_train
27 |
28 |
29 | view_count
30 | modified_on
31 | answer_count
32 | comment_count
33 |
34 |
35 | keywords - only on similar cosine ranking
36 |
37 |
38 | final_ranking = w1*cosine_similarity + w2*(view_count_diff) + w3() .....
39 |
40 |
41 | modified_on
42 | upvote_count
43 | comment_count
44 |
45 |
46 | ..extract_database.py
47 | matching.py
48 | word2vec.py
49 | ..seperate_words.py
50 | ..add_words_dictionary.py
51 | ..processing_original_question.py
52 |
53 | Parameters
54 | margin = 0.02
55 | keyword_weight = 50
56 | sum_weight = 50
57 | date
58 |
59 |
60 | id,question,answers,modified_on,similarity,common_keyword,sum3,final_score
61 |
62 | results_file = open("Results.txt","w")
63 |
64 | for i in range(len(results_final)):
65 |     print("{} Question".format(i+1))
66 |     results_file.write("{} Question\n".format(i+1))
67 |     print(results_final.iloc[i]["Question"])
68 |     results_file.write(results_final.iloc[i]["Question"])
69 |     print("Similarity Score : {}".format(results_final.iloc[i]['final_score']))
70 |     results_file.write("Similarity Score : {}".format(results_final.iloc[i]['final_score']))
71 |     print('Answer')
72 |     results_file.write('Answer')
73 |     print(results_final.iloc[i]['Answers'])
74 |     results_file.write(results_final.iloc[i]['Answers'])
75 |
76 |
77 | I am weak in mathmatcs but i-want to....do civilengineering!!!!....then how can i manage....????
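
A worked sketch of the final_ranking line above, the way server.py / matching.py actually wire it up
(margin, keyword_weight and sum_weight are the parameters listed earlier; common_keyword and sum3 are
min-max normalised over the top-30 cosine matches first):

    w1 = 1
    w2 = margin * keyword_weight / 100     # 0.02 * 50 / 100 = 0.01
    w3 = margin * sum_weight / 100         # 0.02 * 50 / 100 = 0.01
    final_score = w1*similarity + w2*common_keyword + w3*sum3

So cosine similarity dominates, and the keyword-overlap / popularity terms only nudge the ordering
within the configured margin.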
78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /correction.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from symspellpy.symspellpy import SymSpell, Verbosity # import the module 4 | 5 | def main(): 6 | # maximum edit distance per dictionary precalculation 7 | max_edit_distance_dictionary = 2 8 | prefix_length = 9 9 | data = pd.read_csv('D:/ML/QNA_project/CSV_files/final_words_total_rd2.csv') 10 | 11 | # create object 12 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length) 13 | # load dictionary 14 | dictionary_path = os.path.join(os.path.dirname(__file__),"frequency_dictionary_en_82_765.txt") 15 | term_index = 0 # column of the term in the dictionary text file 16 | count_index = 1 # column of the term frequency in the dictionary text file 17 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index): 18 | print("Dictionary file not found") 19 | return 20 | # lookup suggestions for single-word input strings 21 | 22 | # input_term = "agricultr" # misspelling of "members" 23 | # max edit distance per lookup 24 | # (max_edit_distance_lookup <= max_edit_distance_dictionary) 25 | max_edit_distance_lookup = 2 26 | 27 | suggestion_verbosity = Verbosity.CLOSEST # TOP, CLOSEST, ALL 28 | s = "" 29 | print('original') 30 | # print(len(words)) 31 | for i in range(len(data)): 32 | # print(i) 33 | if i==0 or i==51124 or i==65070: 34 | continue 35 | input_term = data['Final_words'][i] 36 | suggestions = sym_spell.lookup(input_term, suggestion_verbosity, 37 | max_edit_distance_lookup) 38 | print(i) 39 | try: 40 | s = s + str(suggestions[0].term)+" " 41 | except: 42 | s = s+ input_term 43 | 44 | s = s[:-1] 45 | words = s.split(' ') 46 | # print(len(words)) 47 | print('After') 48 | print(len(words)) 49 | # for suggestion in suggestions: 50 | # print("{}, {}, {}".format(suggestion.term, suggestion.distance, 51 | # suggestion.count)) 52 | 53 | # lookup suggestions for multi-word input strings (supports compound 54 | # splitting & merging) 55 | # input_term = ("whereis th elove hehad dated forImuch of thepast who " 56 | # "couqdn'tread in sixtgrade and ins pired him") 57 | # # input_term = 'he lives in bngalre' 58 | # max_edit_distance_lookup = 2 59 | # suggestions = sym_spell.lookup_compound(input_term, 60 | # max_edit_distance_lookup) 61 | # for suggestion in suggestions: 62 | # print("{}, {}, {}".format(suggestion.term, suggestion.distance, 63 | # suggestion.count)) 64 | if __name__ == "__main__": 65 | main() -------------------------------------------------------------------------------- /processing_original_question.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from nltk.tokenize import RegexpTokenizer 3 | from nltk.corpus import stopwords 4 | import re 5 | 6 | data = pd.read_csv('D:/ML/QNA_project/CSV_files/questions.csv') 7 | 8 | stop_words = set(stopwords.words('english')) 9 | 10 | def remove_stopwords(words): 11 | wx = [w for w in words if not w in stop_words] ## Removing Stopwords 12 | return wx 13 | 14 | 15 | def remove_duplicates(my_list): 16 | return list(set(my_list)) 17 | 18 | 19 | def standardize_text(df, text_field): 20 | df[text_field] = df[text_field].str.lower() 21 | df[text_field] = df[text_field].apply(lambda elem: re.sub(r"http\S+", "", str(elem))) # get rid of URLs 22 | df[text_field] = df[text_field].apply(lambda elem: re.sub('[0-9]', "", str(elem))) 23 | 
df[text_field] = df[text_field].apply(lambda elem: re.sub(r'[{}@_*>()\\#%+=\[\]\-]',' ', str(elem))) 24 | df[text_field] = df[text_field].apply(lambda elem: re.sub('\(|\)|\[|\]',' ', str(elem))) 25 | df[text_field] = df[text_field].apply(lambda elem: re.sub('a0','', str(elem))) 26 | df[text_field] = df[text_field].apply(lambda elem: re.sub('\.','. ', str(elem))) 27 | df[text_field] = df[text_field].apply(lambda elem: re.sub('\!','! ', str(elem))) 28 | df[text_field] = df[text_field].apply(lambda elem: re.sub('\?','? ', str(elem))) 29 | df[text_field] = df[text_field].apply(lambda elem: re.sub(' +',' ', str(elem))) 30 | return df 31 | 32 | 33 | def from_questions_csv(): 34 | 35 | data = pd.read_csv('D:/ML/QNA_project/CSV_files/questions.csv') 36 | clean_questions = standardize_text(data, "Question") 37 | # print(clean_questions.head()) 38 | tokenizer = RegexpTokenizer(r'\w+') 39 | clean_questions["tokens"] = clean_questions["Question"].apply(tokenizer.tokenize) #Tokenization 40 | clean_questions.to_csv('D:/ML/QNA_project/CSV_files/words_total.csv') 41 | 42 | clean_questions['tokens'] = clean_questions['tokens'].apply(remove_duplicates) # Removing Duplicates 43 | clean_questions.to_csv('D:/ML/QNA_project/CSV_files/words_after_removing_duplicates.csv') 44 | 45 | clean_questions['tokens'] = clean_questions['tokens'].apply(remove_stopwords) # Removing Stopwords 46 | clean_questions.to_csv('D:/ML/QNA_project/CSV_files/words_after_removing_stopwords.csv') 47 | 48 | print(clean_questions.head()) 49 | 50 | 51 | def from_keywords_filters_csv(): 52 | 53 | clean_questions = standardize_text(data, "Entity") 54 | clean_questions = standardize_text(data, "Filters") 55 | 56 | # print(clean_questions.head()) 57 | tokenizer = RegexpTokenizer(r'\w+') 58 | 59 | clean_questions["tokens"] = clean_questions["Filters"].apply(tokenizer.tokenize) # Tokenization 60 | 61 | clean_questions['tokens'] = clean_questions['tokens'].apply(remove_duplicates) # Removing Duplicates 62 | 63 | clean_questions['tokens'] = clean_questions['tokens'].apply(remove_stopwords) # Removing Stopwords 64 | 65 | clean_questions.to_csv('D:/ML/QNA_project/CSV_files/words_filters.csv') 66 | 67 | print(clean_questions.head())
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # QNA Project (A major NLP Project):
2 |
3 | As we all know, after passing 12th grade one has to appear for and clear the entrance examination for their chosen field of higher studies (JEE for engineering, NEET for medical, BBA, M.Tech, and so on).
4 |
5 | A lot of questions come to mind, and we do extensive Google searching to get the answers: previous year ranks, placements, fees, which college is best for our rank, and thousands more.
6 |
7 | So here is an NLP-based bot, trained on a large question/answer dataset, that gives the most appropriate answer to questions about colleges in every field.
8 |
9 |
10 | # Process:
11 | 1. Data preprocessing (the most important step).
12 | 2. Storing vectors of the questions and words into a dictionary.
13 | 3. Spell correction and word segmentation on the input question.
14 | 4. Finding the cosine similarity between vectors (to shortlist the top 30 answers).
15 | 5. The best answer is calculated by:
16 | w1*(Similarity) + w2*(Upvote count on answers) + w3*(number of common words)
17 |
18 |
19 | # Files :
20 |
21 | add_words_dictionary.py : Adds the important dataset-specific words to the dictionary used for spell correction and word segmentation. dictionary_final.txt contains the original dictionary words plus the words taken from the dataset.
22 |
23 | correction_count.py : Checks the accuracy of SymSpell correction /
24 | word segmentation after training on the dataset (accuracy: 85%).
25 |
26 | extract_database.py : Extracts CSV files from the SQL database.
27 |
28 | matching.py : Standalone single-question-to-answer pipeline.
29 |
30 | testing.py : Main file which calls all the other files (contains the main code).
31 | Run command : python testing.py
32 |
33 | one_time.py : Running this file re-trains everything automatically on an updated dataset.
34 |
35 | processing_original_question.py : Question-answer data preprocessing.
36 |
37 | server.py : Flask application (deploys the model on a web server).
38 |
39 | seperate_words.py : Data processing.
40 |
41 | word2vec.py : Converts each question into a single vector.
42 | Download the Google word2vec model (GoogleNews-vectors-negative300.bin) trained on the Google News dataset (4.92 GB).
43 |
44 | # Results:
45 |
46 | Some results are shown below. The same question is asked in several different ways, with a lot of deliberate spelling mistakes.
47 |
48 | a) Question :
49 |
50 | 1. What is the total fee of VIT VELLORE of 4 year
51 | 2. annual fees of vit velor..??????
52 | 3. can you tellme totalfees of vitvellore in 2019.
53 | 4. VIT VELLLOre1 totalannual fees..of full year....?????
54 |
55 | Answer :
56 | Total fees will be around 12-13 lacs while u r in category 1 it's including hostel and mess fees. And for category 2 it will be around 16-17 lacs. 1-2 lac increases as category increases...
57 |
58 |
59 |
60 | b) Question :
61 |
62 | 1. What is the registration date of bba entrance
63 | 2. entrance registration date(BBA)..??
64 | 3. can you tell me the regstrtion examdate of bba.....
65 |
66 | Answer :
67 | The application form will be tenatively available in third week of February 2019.Application closes on first week of May 2019.The application mode will be available online . Thanks
68 |
69 |
70 |
71 |
72 | c) Question
73 | 1. what is the fees structure for B tech , computer science including hostel fee in lmnit jaipur
74 | 2. Feesstructure (B.tech) in computerscience in LMNIT jaipurrr..???
75 | 3. btech fees struture in LMNIT jaipur coputer scince...???
76 |
77 | Answer :
78 | It is genereally same fee for all branches in a college.And it is 1,78,000 for first semester.Please go through the below link for the fee structure of LNMIIT, Jaipur:- https://www.lnmiit.ac.in/Admissions/ugadmissions/Fee_Structure.html . Hope you found this helpful...!!!. All the best...!!!
79 |
80 |
81 |
82 | d) Question :
83 |
84 | 1. where can i get the refund list of neet 2019
85 | 2. if i dont want to take admissssssion , How can i get the reefund after neet counceling.... ..???
86 |
87 | Answer:
88 |
89 | Refund Procedure : If candidate do not wish to pursue the study in college after 2nd round of counselling in this case only refund is initiated.Aspirants get their amount refunded on the same account through which they had submitted their fee.
90 | And once all the rounds of Counselling are completed, MCI will release the list of the Xandidated eligible for the refund of the Security deposit at their official website at mcc.nic.in.
91 | However candidate has to contact on MCC tollfree no and drop a mail to their finance department.This is the only way a candidate may get a refund. Hope this help you aspirants.
92 |
93 |
94 | e) Question :
95 |
96 | 1. Previous year rank od computer science branch in different NIT's
97 | 2. tell me the previos year ranks of computerscience brnch in all (NIT))))>>>...??????????
98 |
99 | Answer:
100 |
101 | Following is the 2018 JEE MAIN cutoff for Computer Science for some NITs:-
102 | NIT Warangal : 1745
103 | NIT Suratkhal : 1767
104 | NIT Trichy : 1140
105 | MNNIT Allahbad : 3504
106 | NIT Rourkela : 3576
107 | NIT Calicut : 4822
108 | NIT Durgapur : 8516
109 | NIT Hamirpur : 15821
110 | MANIT Bhopal : 6827
111 | MANIT Jaipur : 4875
112 | Hope this helps :)
113 | Best of luck!
114 |
115 | f) Question :
116 |
117 | 1. my mat score September 735.50 get my chance of good MBA college
118 | 2. what are the chancesof geting gud mba colllegee ,,,, my score is around 700.
119 |
120 | Answer :
121 |
122 | Yes you have a good MAT score and you stand a good chance in getting colleges in Bengaluru (Your profile looks great, wait for the moment )
123 |
124 | https://bschool.careers360.com/articles/mat-cutoff
125 | Cutoff reports and college information shared above.
126 |
127 |
128 | # Conclusion :
129 |
130 | The results are pretty good.
131 |
132 | Average time for a question with enough information (13-18 sec)
133 |
134 | Average time for a question with little information (25-30 sec)
135 |
--------------------------------------------------------------------------------
/one_time.py:
--------------------------------------------------------------------------------
1 | import pymysql 2 | import pandas as pd 3 | from nltk.tokenize import RegexpTokenizer 4 | from nltk.corpus import stopwords 5 | import re 6 | import math 7 | import collections 8 | 9 | db = pymysql.connect("localhost","root","12343249","sparsh" ) 10 | cursor = db.cursor() 11 | 12 | def extract_from_database(query,column_list): 13 | sql = query 14 | try: 15 | cursor.execute(sql) 16 | data = cursor.fetchall() 17 | db.commit() 18 | except: 19 | db.rollback() 20 | 21 | df = pd.DataFrame(data, columns=column_list) 22 | return df 23 | 24 | org_ques_data = extract_from_database('select id ,title,answer_count , comment_count,view_count,modified_on from sparsh.questions', 25 | ['ID','Question','answer_count','comment_count','view_count','modified_on']) 26 | org_ans_data = extract_from_database('select id ,text,question_id,modified_on , upvote_count , comment_count from sparsh.question_answers ', 27 | ['ID', 'Answers', 'question_id', 'modified_on', 'upvote_count', 'comment_count']) 28 | org_keyword_data = extract_from_database('select entity_type , keyword from sparsh.keywords' , ['Entity','Keywords']) 29 | org_filter_data = extract_from_database('select entity_type , filters from sparsh.filters' ,['Entity','Filters']) 30 | 31 | 32 | stop_words = set(stopwords.words('english')) 33 | def remove_stopwords(words): 34 | wx = [w for w in words if not w in stop_words] ## Removing Stopwords 35 | return wx 36 | 37 | 38 | def remove_duplicates(my_list): 39 | return list(set(my_list)) 40 | 41 | 42 | def standardize_text(df, text_field): 43 | df[text_field] = df[text_field].str.lower() 44 | df[text_field] = df[text_field].apply(lambda elem: re.sub(r"http\S+", "", str(elem))) # get rid of URLs 45 | df[text_field] = df[text_field].apply(lambda elem: re.sub('[0-9]', "", str(elem))) 46 | df[text_field] = df[text_field].apply(lambda elem:
re.sub(r'[{}@_*>()\\#%+=\[\]\-]',' ', str(elem))) 47 | df[text_field] = df[text_field].apply(lambda elem: re.sub('\(|\)|\[|\]',' ', str(elem))) 48 | df[text_field] = df[text_field].apply(lambda elem: re.sub('a0','', str(elem))) 49 | df[text_field] = df[text_field].apply(lambda elem: re.sub('\.','. ', str(elem))) 50 | df[text_field] = df[text_field].apply(lambda elem: re.sub('\!','! ', str(elem))) 51 | df[text_field] = df[text_field].apply(lambda elem: re.sub('\?','? ', str(elem))) 52 | df[text_field] = df[text_field].apply(lambda elem: re.sub(' +',' ', str(elem))) 53 | return df 54 | 55 | 56 | def words_from_keywords_filters(data,column_name): 57 | 58 | clean_questions = standardize_text(data, column_name) 59 | clean_questions = standardize_text(clean_questions,'Entity') 60 | tokenizer = RegexpTokenizer(r'\w+') 61 | clean_questions["tokens"] = clean_questions[column_name].apply(tokenizer.tokenize) #Tokenization 62 | clean_questions['Entity'] = clean_questions['Entity'].apply(tokenizer.tokenize) 63 | clean_questions['tokens'] = clean_questions['tokens'] + clean_questions['Entity'] 64 | clean_questions['tokens'] = clean_questions['tokens'].apply(remove_duplicates) # Removing Duplicates 65 | 66 | clean_questions['tokens'] = clean_questions['tokens'].apply(remove_stopwords) # Removing Stopwords 67 | 68 | return clean_questions 69 | 70 | 71 | def words_from_question(data,column_name): 72 | 73 | clean_questions = standardize_text(data, column_name) 74 | 75 | tokenizer = RegexpTokenizer(r'\w+') 76 | 77 | clean_questions["tokens"] = clean_questions[column_name].apply(tokenizer.tokenize) #Tokenization 78 | 79 | clean_questions['tokens'] = clean_questions['tokens'].apply(remove_duplicates) # Removing Duplicates 80 | 81 | clean_questions['tokens'] = clean_questions['tokens'].apply(remove_stopwords) # Removing Stopwords 82 | 83 | return clean_questions 84 | 85 | 86 | words_keywords = words_from_keywords_filters(org_keyword_data , 'Keywords') 87 | words_filters = words_from_keywords_filters(org_filter_data , 'Filters') 88 | words_ques = words_from_question(org_ques_data,'Question') 89 | 90 | def original(data): 91 | w = [] 92 | for i in range(len(data.head())): 93 | w = w + data['tokens'][i] 94 | 95 | words = list(set(w)) 96 | 97 | df = pd.DataFrame(words,columns=['Final_words']) 98 | return df 99 | 100 | 101 | def keywords_filters(data): 102 | w = [] 103 | for i in range(len(data)): 104 | print(i) 105 | w = w + data['tokens'][i] 106 | words = list(set(w)) 107 | return words 108 | 109 | 110 | def combine(): 111 | word_k = keywords_filters(words_keywords) 112 | word_f = keywords_filters(words_filters) 113 | total = word_f + word_k 114 | total = list(set(total)) 115 | df = pd.DataFrame(total,columns=['Final_filters']) 116 | df.to_csv('D:/ML/QNA_project/CSV_files/final_words_keys2.csv') 117 | return df 118 | 119 | total_words_data = original(words_ques) # TODO : data2 120 | keys_data = combine() # TODO : data1 121 | 122 | 123 | def add_to_dictionary(): 124 | 125 | count = total_words_data['Total_words'].value_counts() 126 | 127 | w_keys = [] 128 | for i in range(len(keys_data)): 129 | print(i) 130 | if i==79: 131 | print('sparsh') 132 | continue 133 | else: 134 | try: 135 | x = count[keys_data['Final_filters'][i]] 136 | except: 137 | x=0 138 | 139 | x = x + 2022459848 140 | x = str(x) 141 | s = keys_data['Final_filters'][i] + " " + x 142 | w_keys.append(s) 143 | 144 | m = {} 145 | for i in range(len(w_keys)-1): 146 | print("s {}".format(i)) 147 | 148 | w = w_keys[i].split(' ') 149 | m[w[0]]=w[1] 150 | 151 
| sorted_x = sorted(m.items(), key=lambda kv: int(kv[1])) 152 | sorted_dict = collections.OrderedDict(sorted_x) 153 | 154 | file = open('D:/ML/QNA_project/dictionary_words.txt','w') 155 | 156 | for key , value in sorted_dict.items(): 157 | file.write(key+" "+value) 158 | file.write('\n') 159 | print('compelete') 160 | file.close() 161 | 162 | file1 = open('D:/ML/QNA_project/dictionary_words.txt' , 'r') 163 | data = file1.read().split('\n') 164 | file1.close() 165 | 166 | file1 = open('D:/ML/QNA_project/frequency_dictionary.txt' , 'r') 167 | data2 = file1.split('\n') 168 | file1.close() 169 | 170 | file2 = open('D:/ML/QNA_project/dictionary_final.txt' , 'w') 171 | for i in range(len(data)-1,-1,-1): 172 | file2.write(data[i]) 173 | file2.write('\n') 174 | 175 | for i in range(len(data2)): 176 | file2.write(data2[i]) 177 | file2.write('\n') 178 | file2.close() 179 | 180 | db.close() -------------------------------------------------------------------------------- /word2vec.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from nltk.tokenize import RegexpTokenizer 4 | from nltk.corpus import stopwords 5 | import re 6 | import os 7 | from symspellpy.symspellpy import SymSpell, Verbosity 8 | from gensim.models import KeyedVectors 9 | import time 10 | 11 | data = pd.read_csv('D:/ML/QNA_project/CSV_files/questions.csv') 12 | 13 | def standardize_text(df, text_field): 14 | df[text_field] = df[text_field].str.lower() 15 | df[text_field] = df[text_field].apply(lambda elem: re.sub(r"http\S+", "", str(elem))) # get rid of URLs 16 | df[text_field] = df[text_field].apply(lambda elem: re.sub('[0-9]', "", str(elem))) 17 | df[text_field] = df[text_field].apply(lambda elem: re.sub(r'[{}@_*>()\\#%+=\[\]\-]',' ', str(elem))) 18 | df[text_field] = df[text_field].apply(lambda elem: re.sub('\(|\)|\[|\]',' ', str(elem))) 19 | df[text_field] = df[text_field].apply(lambda elem: re.sub('a0','', str(elem))) 20 | df[text_field] = df[text_field].apply(lambda elem: re.sub('\.','. ', str(elem))) 21 | df[text_field] = df[text_field].apply(lambda elem: re.sub('\!','! ', str(elem))) 22 | df[text_field] = df[text_field].apply(lambda elem: re.sub('\?','? 
', str(elem))) 23 | df[text_field] = df[text_field].apply(lambda elem: re.sub(' +',' ', str(elem))) 24 | return df 25 | 26 | 27 | def remove_duplicates(my_list): 28 | return list(set(my_list)) 29 | 30 | 31 | def remove_stopwords(words): 32 | stop_words = set(stopwords.words('english')) 33 | wx = [w for w in words if not w in stop_words] ## Removing Stopwords 34 | return wx 35 | 36 | 37 | def spell_correction(words): 38 | s = "" 39 | print(words) 40 | for i in words: 41 | suggestions = sym_spell.lookup(i, suggestion_verbosity,max_edit_distance_lookup) 42 | try: 43 | # print('hello') 44 | # print(suggestions[0].term) 45 | s = s + suggestions[0].term + " " 46 | except: 47 | # print('vhjyfhfy') 48 | s = s + i + " " 49 | s = s[:-1] 50 | print(s) 51 | w = s.split(' ') 52 | w = list(set(w)) 53 | return w 54 | 55 | 56 | def word_segmentation(words): 57 | print('started') 58 | final = words 59 | for i in words: 60 | input_term = i 61 | try: 62 | result = sym_spell.word_segmentation(input_term) 63 | w = (result.corrected_string).split(' ') 64 | 65 | print(w) 66 | w = w + final 67 | 68 | except: 69 | print('fail') 70 | pass 71 | try: 72 | w = list(set(w)) 73 | except: 74 | print('YOYO') 75 | w = words 76 | print(w) 77 | return w 78 | 79 | 80 | def vectors(words): 81 | 82 | w = [] 83 | for i in words: 84 | try: 85 | vector = model[i] 86 | except: 87 | vector = np.zeros(300) 88 | vector = vector.tolist() 89 | # print(vector.shape) 90 | w.append(vector) 91 | 92 | return w 93 | 94 | 95 | def average_vector(vectors): 96 | v = np.zeros(300) 97 | x = np.zeros(300) 98 | n = len(vectors) 99 | for i in vectors: 100 | i = np.array(i) 101 | if (i==x).all(): 102 | n = n - 1 103 | else: 104 | v = v + i 105 | v = v/n 106 | v = v.tolist() 107 | 108 | return v 109 | 110 | if __name__ == '__main__': 111 | clean_questions = standardize_text(data.head(), "Question") 112 | 113 | 114 | tokenizer = RegexpTokenizer(r'\w+') 115 | clean_questions["tokens"] = clean_questions["Question"].apply(tokenizer.tokenize) 116 | 117 | 118 | clean_questions['tokens'] = clean_questions['tokens'].apply(remove_duplicates) # Removing Duplicates 119 | 120 | 121 | stop_words = set(stopwords.words('english')) 122 | clean_questions['tokens'] = clean_questions['tokens'].apply(remove_stopwords) 123 | 124 | 125 | max_edit_distance_dictionary = 2 126 | prefix_length = 9 127 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length) 128 | dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary_final.txt") 129 | term_index = 0 # column of the term in the dictionary text file 130 | count_index = 1 # column of the term frequency in the dictionary text file 131 | 132 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index): 133 | print("Dictionary file not found") 134 | 135 | max_edit_distance_lookup = 2 136 | suggestion_verbosity = Verbosity.CLOSEST 137 | clean_questions['tokens'] = clean_questions['tokens'].apply(spell_correction) 138 | 139 | # clean_questions.to_csv('D:/ML/QNA_project/CSV_files/main_spell1.csv') 140 | print('spell1 done') 141 | # clean_questions = pd.read_csv('D:/ML/QNA_project/CSV_files/main_spell1.csv') 142 | max_edit_distance_dictionary = 0 143 | prefix_length = 7 144 | # create object 145 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length) 146 | # load dictionary 147 | dictionary_path = os.path.join(os.path.dirname(__file__),"dictionary_final.txt") 148 | term_index = 0 # column of the term in the dictionary text file 149 | count_index = 1 # column of the term frequency in 
the dictionary text file 150 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index): 151 | print("Dictionary file not found") 152 | clean_questions['tokens'] = clean_questions['tokens'].apply(word_segmentation) 153 | # clean_questions.to_csv('D:/ML/QNA_project/CSV_files/main_word_seg.csv') 154 | print('wordseg done') 155 | 156 | max_edit_distance_dictionary = 2 157 | prefix_length = 9 158 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length) 159 | dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary_final.txt") 160 | term_index = 0 # column of the term in the dictionary text file 161 | count_index = 1 # column of the term frequency in the dictionary text file 162 | 163 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index): 164 | print("Dictionary file not found") 165 | 166 | max_edit_distance_lookup = 2 167 | suggestion_verbosity = Verbosity.CLOSEST 168 | clean_questions['tokens'] = clean_questions['tokens'].apply(spell_correction) 169 | 170 | 171 | clean_questions['tokens'] = clean_questions['tokens'].apply(remove_stopwords) 172 | clean_questions['processed_words'] = clean_questions['tokens'] 173 | # clean_questions.to_csv('D:/ML/QNA_project/CSV_files/main_spell2.csv') 174 | 175 | t1 =time.time() 176 | model = KeyedVectors.load_word2vec_format('D:/ML/QNA_project/model/GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin', binary=True) 177 | t2 = time.time() 178 | print('model loaded in {} seconds'.format(t2-t1)) 179 | clean_questions['vectors'] = clean_questions['tokens'].apply(vectors) 180 | # clean_questions.to_csv('D:/ML/QNA_project/CSV_files/main_vectors.csv') 181 | 182 | 183 | clean_questions['Average_vector'] = clean_questions['vectors'].apply(average_vector) 184 | 185 | clean_questions = clean_questions.drop(['vectors','tokens'],axis=1) 186 | clean_questions.to_csv('D:/ML/QNA_project/CSV_files/main_average.csv') 187 | 188 | print(clean_questions['tokens']) 189 | print(type(clean_questions['tokens'][0])) 190 | 191 | print(clean_questions.head()) 192 | 193 | 194 | -------------------------------------------------------------------------------- /matching.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from nltk.tokenize import RegexpTokenizer 4 | from nltk.corpus import stopwords 5 | import re 6 | import os 7 | from symspellpy.symspellpy import SymSpell, Verbosity 8 | from gensim.models import KeyedVectors 9 | import time 10 | from scipy import spatial 11 | from datetime import datetime 12 | 13 | text = "Why upsee is making compulsory for the students to get admission in allotted college if they want to take part in fifth round counselling.." 
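# NOTE: 'text' above is a hard-coded sample input question; it is preprocessed, spell-corrected
# and word-segmented further down. 'text_cw' below keeps the unmodified copy, which
# common_keywords() later compares against the keyword/filter CSVs.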
14 | text_cw = text 15 | 16 | t1 = time.time() 17 | def remove_duplicates(my_list): 18 | return list(set(my_list)) 19 | 20 | def remove_stopwords(words): 21 | stop_words = set(stopwords.words('english')) 22 | wx = [w for w in words if not w in stop_words] ## Removing Stopwords 23 | return wx 24 | 25 | def spell_correction(words): 26 | s = "" 27 | print(words) 28 | for i in words: 29 | suggestions = sym_spell.lookup(i, suggestion_verbosity,max_edit_distance_lookup) 30 | try: 31 | # print('hello') 32 | # print(suggestions[0].term) 33 | s = s + suggestions[0].term + " " 34 | except: 35 | # print('vhjyfhfy') 36 | s = s + i + " " 37 | s = s[:-1] 38 | print(s) 39 | w = s.split(' ') 40 | w = list(set(w)) 41 | return w 42 | 43 | 44 | def word_segmentation(words): 45 | print('started') 46 | final = [] 47 | for i in words: 48 | input_term = i 49 | try: 50 | result = sym_spell.word_segmentation(input_term) 51 | w = (result.corrected_string).split(' ') 52 | 53 | print(w) 54 | final = final +w 55 | 56 | except: 57 | print('fail') 58 | pass 59 | final = list(set(final)) 60 | return final 61 | 62 | def preprocessing(text): 63 | text = text.lower() 64 | text = re.sub(r"http\S+", "", text) # get rid of URLs 65 | text = re.sub('[0-9]', "", text) 66 | text = re.sub(r'[{}@_*>()\\#%+=\[\]\-]',' ', text) 67 | text = re.sub('\(|\)|\[|\]',' ', text) 68 | text = re.sub('a0','', text) 69 | text = re.sub('\.','. ', text) 70 | text = re.sub('\!','! ', text) 71 | text = re.sub('\?','? ', text) 72 | text = re.sub(' +',' ', text) 73 | return text 74 | 75 | 76 | def vectors(words): 77 | 78 | w = [] 79 | for i in words: 80 | try: 81 | vector = model[i] 82 | except: 83 | vector = np.zeros(300) 84 | vector = vector.tolist() 85 | # print(vector.shape) 86 | w.append(vector) 87 | 88 | return w 89 | 90 | 91 | def average_vector(vectors): 92 | v = np.zeros(300) 93 | x = np.zeros(300) 94 | n = len(vectors) 95 | for i in vectors: 96 | i = np.array(i) 97 | if (i==x).all(): 98 | n = n - 1 99 | else: 100 | v = v + i 101 | v = v/n 102 | v = v.tolist() 103 | 104 | return v 105 | 106 | def match(s): 107 | s = re.sub('\[|\]|\,|\'', '', s) 108 | words = s.split(' ') 109 | vec1 = [] 110 | for i in words: 111 | vec1.append(float(i)) 112 | 113 | result = 1 - spatial.distance.cosine(vec1,avg_v) 114 | # print(result) 115 | return result 116 | 117 | 118 | keyword_data = pd.read_csv('D:/ML/QNA_project/CSV_files/keywords.csv') 119 | filter_data = pd.read_csv('D:/ML/QNA_project/CSV_files/filters.csv') 120 | def common_keywords(text_q): 121 | 122 | text_q = text_q.lower() 123 | w = text_q.split(' ') 124 | 125 | max_edit_distance_dictionary = 2 126 | prefix_length = 9 127 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length) 128 | dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary_final.txt") 129 | term_index = 0 # column of the term in the dictionary text file 130 | count_index = 1 # column of the term frequency in the dictionary text file 131 | 132 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index): 133 | print("Dictionary file not found") 134 | 135 | max_edit_distance_lookup = 2 136 | suggestion_verbosity = Verbosity.CLOSEST 137 | 138 | ques = "" 139 | for input in w: 140 | suggestions = sym_spell.lookup(input, max_edit_distance_lookup) 141 | try: 142 | ques = ques + suggestions[0].term + " " 143 | except: 144 | ques = ques + input + " " 145 | ques = ques + text_q 146 | print(ques) 147 | 148 | input_ques = text_cw 149 | input_ques = input_ques.lower() 150 | words_input = 
input_ques.split(' ') 151 | 152 | in_ques = '' 153 | for input in words_input: 154 | suggestions = sym_spell.lookup(input, max_edit_distance_lookup) 155 | try: 156 | in_ques = in_ques + suggestions[0].term + " " 157 | except: 158 | in_ques = in_ques + input + " " 159 | in_ques = in_ques + input_ques 160 | print(in_ques) 161 | 162 | w1 = [] 163 | w2 = [] 164 | for i in range(len(keyword_data)): 165 | str = keyword_data['Keywords'][i] 166 | str = str.lower() 167 | if (ques.find(str, 0, len(str)) != -1): 168 | w1.append(str) 169 | if(in_ques.find(str, 0, len(str)) != -1): 170 | w2.append(str) 171 | 172 | for i in range(len(filter_data)): 173 | str = filter_data['Filters'][i] 174 | str = str.lower() 175 | if (ques.find(str, 0, len(str)) != -1): 176 | w1.append(str) 177 | if (in_ques.find(str, 0, len(str)) != -1): 178 | w2.append(str) 179 | # print(len(w1)) 180 | # print(len(w2)) 181 | common = w2 + w1 182 | common_d = list(set(common)) 183 | # print(len(common)) 184 | # print(len(common_d)) 185 | x = len(common) - len(common_d) 186 | return x 187 | 188 | 189 | def getting_answer(final): 190 | answer_data = pd.read_csv('D:/ML/QNA_project/CSV_files/answers.csv') 191 | answers_list = [] 192 | for j in range(len(final)): 193 | id = final.iloc[j]['ID'] 194 | id = int(id) 195 | req = answer_data.loc[answer_data['question_id'] == id] 196 | 197 | max = -1 198 | id = req.iloc[0]['ID'] 199 | 200 | for i in range(len(req)): 201 | up_c = int(req.iloc[i]['upvote_count']) 202 | cm_c = int(req.iloc[i]['comment_count']) 203 | if up_c + cm_c > max: 204 | max = up_c + cm_c 205 | id = req.iloc[i]['ID'] 206 | 207 | ans = req.loc[req['ID'] == id]['Answers'] 208 | answers_list.append(ans) 209 | return answers_list 210 | 211 | 212 | 213 | text = preprocessing(text) 214 | 215 | tokenizer = RegexpTokenizer(r'\w+') 216 | words = tokenizer.tokenize(text) 217 | 218 | words = remove_duplicates(words) 219 | 220 | words = remove_stopwords(words) 221 | 222 | max_edit_distance_dictionary = 2 223 | prefix_length = 9 224 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length) 225 | dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary_final.txt") 226 | term_index = 0 # column of the term in the dictionary text file 227 | count_index = 1 # column of the term frequency in the dictionary text file 228 | 229 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index): 230 | print("Dictionary file not found") 231 | 232 | max_edit_distance_lookup = 2 233 | suggestion_verbosity = Verbosity.CLOSEST 234 | 235 | words = spell_correction(words) 236 | 237 | 238 | 239 | max_edit_distance_dictionary = 0 240 | prefix_length = 7 241 | # create object 242 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length) 243 | # load dictionary 244 | dictionary_path = os.path.join(os.path.dirname(__file__),"dictionary_final.txt") 245 | term_index = 0 # column of the term in the dictionary text file 246 | count_index = 1 # column of the term frequency in the dictionary text file 247 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index): 248 | print("Dictionary file not found") 249 | 250 | words = word_segmentation(words) 251 | 252 | 253 | 254 | max_edit_distance_dictionary = 2 255 | prefix_length = 9 256 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length) 257 | dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary_final.txt") 258 | term_index = 0 # column of the term in the dictionary text file 259 | count_index = 1 # column of the term frequency 
in the dictionary text file 260 | 261 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index): 262 | print("Dictionary file not found") 263 | 264 | max_edit_distance_lookup = 2 265 | suggestion_verbosity = Verbosity.CLOSEST 266 | 267 | words = spell_correction(words) 268 | 269 | words = remove_stopwords(words) 270 | 271 | 272 | # t1 =time.time() 273 | model = KeyedVectors.load_word2vec_format('D:/ML/QNA_project/model/GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin', binary=True) 274 | # t2 = time.time() 275 | print('model loaded ') 276 | vs = vectors(words) 277 | 278 | 279 | avg_v = average_vector(vs) 280 | 281 | vector_data = pd.read_csv('D:/ML/QNA_project/CSV_files/final_question_vector.csv') 282 | print('sparsh') 283 | 284 | #TODO : 1. correction. 2, Make Generic thru user input 285 | now_date = datetime.now() 286 | now_date = now_date.strftime("%d/%m/%Y %H:%M:%S") 287 | filter_date_q = now_date 288 | 289 | vector_data['Similarity'] = vector_data['Average_vector'].apply(match) 290 | 291 | # vector_data = vector_data[vector_data['modified_on']()\\#%+=\[\]\-]', ' ', text) 82 | text = re.sub('\(|\)|\[|\]', ' ', text) 83 | text = re.sub('a0', '', text) 84 | text = re.sub('\.', '. ', text) 85 | text = re.sub('\!', '! ', text) 86 | text = re.sub('\?', '? ', text) 87 | text = re.sub(' +', ' ', text) 88 | return text 89 | 90 | def vectors(words): 91 | 92 | w = [] 93 | for i in words: 94 | try: 95 | vector = model[i] 96 | except: 97 | vector = np.zeros(300) 98 | vector = vector.tolist() 99 | # print(vector.shape) 100 | w.append(vector) 101 | 102 | return w 103 | 104 | def average_vector(vectors): 105 | v = np.zeros(300) 106 | x = np.zeros(300) 107 | n = len(vectors) 108 | for i in vectors: 109 | i = np.array(i) 110 | if (i == x).all(): 111 | n = n - 1 112 | else: 113 | v = v + i 114 | v = v / n 115 | v = v.tolist() 116 | 117 | return v 118 | 119 | def match(s): 120 | s = re.sub('\[|\]|\,|\'', '', s) 121 | words = s.split(' ') 122 | vec1 = [] 123 | for i in words: 124 | vec1.append(float(i)) 125 | 126 | result = 1 - spatial.distance.cosine(vec1, avg_v) 127 | # print(result) 128 | return result 129 | 130 | def common_keywords(text): 131 | keyword_data = pd.read_csv('D:/ML/QNA_project/CSV_files/keywords.csv') 132 | filter_data = pd.read_csv('D:/ML/QNA_project/CSV_files/filters.csv') 133 | 134 | # text = "he lives in bangalor1" 135 | text = text.lower() 136 | w = text.split(' ') 137 | print(w) 138 | 139 | max_edit_distance_dictionary = 2 140 | prefix_length = 9 141 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length) 142 | dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary_final.txt") 143 | term_index = 0 # column of the term in the dictionary text file 144 | count_index = 1 # column of the term frequency in the dictionary text file 145 | 146 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index): 147 | print("Dictionary file not found") 148 | 149 | max_edit_distance_lookup = 2 150 | suggestion_verbosity = Verbosity.CLOSEST 151 | 152 | ques = "" 153 | for input in w: 154 | suggestions = sym_spell.lookup(input, max_edit_distance_lookup) 155 | try: 156 | ques = ques + suggestions[0].term + " " 157 | except: 158 | ques = ques + input + " " 159 | ques = ques + text 160 | # print(ques) 161 | words = [] 162 | for i in range(len(keyword_data)): 163 | str = keyword_data['Keywords'][i] 164 | str = str.lower() 165 | if (ques.find(str, 0, len(str)) != -1): 166 | words.append(str) 167 | 168 | for i in 
range(len(filter_data)): 169 | str = filter_data['Filters'][i] 170 | str = str.lower() 171 | if (ques.find(str, 0, len(str)) != -1): 172 | words.append(str) 173 | 174 | return len(words) 175 | 176 | def getting_answer(final): 177 | answer_data = pd.read_csv('D:/ML/QNA_project/CSV_files/answers.csv') 178 | 179 | now_date_a = datetime.now() 180 | now_date_a = now_date_a.strftime("%d/%m/%Y %H:%M:%S") 181 | filter_date_a = now_date_a 182 | 183 | answer_data = answer_data[answer_data['modified_on'] < filter_date_a] 184 | 185 | answers_list = [] 186 | for j in range(len(final)): 187 | id = final.iloc[j]['ID'] 188 | id = int(id) 189 | req = answer_data.loc[answer_data['question_id'] == id] 190 | 191 | max = -1 192 | id = req.iloc[0]['ID'] 193 | date = req.iloc[0]['modified_on'] 194 | 195 | for i in range(len(req)): 196 | up_c = int(req.iloc[i]['upvote_count']) 197 | cm_c = int(req.iloc[i]['comment_count']) 198 | d1 = req.iloc[i]['modified_on'] 199 | if up_c + cm_c > max: 200 | max = up_c + cm_c 201 | id = req.iloc[i]['ID'] 202 | 203 | ans = req.loc[req['ID'] == id]['Answers'] 204 | answers_list.append(ans) 205 | return answers_list 206 | 207 | 208 | text = request.form.to_dict() 209 | text = text['question'] 210 | print(text) 211 | 212 | text = preprocessing(text) 213 | 214 | tokenizer = RegexpTokenizer(r'\w+') 215 | words = tokenizer.tokenize(text) 216 | 217 | words = remove_duplicates(words) 218 | 219 | words = remove_stopwords(words) 220 | 221 | max_edit_distance_dictionary = 2 222 | prefix_length = 9 223 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length) 224 | dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary_final.txt") 225 | term_index = 0 # column of the term in the dictionary text file 226 | count_index = 1 # column of the term frequency in the dictionary text file 227 | 228 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index): 229 | print("Dictionary file not found") 230 | 231 | max_edit_distance_lookup = 2 232 | suggestion_verbosity = Verbosity.CLOSEST 233 | 234 | words = spell_correction(words) 235 | 236 | max_edit_distance_dictionary = 0 237 | prefix_length = 7 238 | # create object 239 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length) 240 | # load dictionary 241 | dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary_final.txt") 242 | term_index = 0 # column of the term in the dictionary text file 243 | count_index = 1 # column of the term frequency in the dictionary text file 244 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index): 245 | print("Dictionary file not found") 246 | 247 | words = word_segmentation(words) 248 | 249 | max_edit_distance_dictionary = 2 250 | prefix_length = 9 251 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length) 252 | dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary_final.txt") 253 | term_index = 0 # column of the term in the dictionary text file 254 | count_index = 1 # column of the term frequency in the dictionary text file 255 | 256 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index): 257 | print("Dictionary file not found") 258 | 259 | max_edit_distance_lookup = 2 260 | suggestion_verbosity = Verbosity.CLOSEST 261 | 262 | words = spell_correction(words) 263 | 264 | words = remove_stopwords(words) 265 | 266 | # t1 =time.time() 267 | model = KeyedVectors.load_word2vec_format( 268 | 
'D:/ML/QNA_project/model/GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin', 269 | binary=True) 270 | # t2 = time.time() 271 | print('model loaded ') 272 | vs = vectors(words) 273 | 274 | avg_v = average_vector(vs) 275 | 276 | vector_data = pd.read_csv('D:/ML/QNA_project/CSV_files/final_question_vector.csv') 277 | print('sparsh') 278 | 279 | now_date = datetime.now() 280 | now_date = now_date.strftime("%d/%m/%Y %H:%M:%S") 281 | filter_date_q = now_date 282 | 283 | vector_data['Similarity'] = vector_data['Average_vector'].apply(match) 284 | # print(vector_data.nlargest(5,['Similarity'])) 285 | # 286 | dummy = vector_data.nlargest(30, ['Similarity']) 287 | # print(dummy['Question'].head(5)) 288 | # print(time.time()-t1) 289 | 290 | dummy = dummy[dummy['modified_on'] < filter_date_q] 291 | 292 | dummy['common_keyword'] = dummy['Question'].apply(common_keywords) 293 | min_kw = min(dummy['common_keyword']) 294 | max_kw = max(dummy['common_keyword']) 295 | dummy['common_keyword'] = (dummy['common_keyword'] - min_kw) / (max_kw - min_kw) 296 | 297 | dummy['sum3'] = dummy['view_count'] + dummy['answer_count'] + dummy['comment_count'] 298 | min_s = min(dummy['sum3']) 299 | max_s = max(dummy['sum3']) 300 | dummy['sum3'] = (dummy['sum3'] - min_s) / (max_s - min_s) 301 | 302 | margin = 0.02 303 | keyword_wt = 50 304 | sum_wt = 50 305 | w1 = 1 306 | w2 = margin * keyword_wt / 100 307 | w3 = margin * sum_wt / 100 308 | 309 | dummy['final_score'] = (w1 * dummy['Similarity']) + (w2 * dummy['common_keyword']) + (w3 * dummy['sum3']) 310 | 311 | final = dummy.nlargest(10, ['final_score']) 312 | print(final.head()) 313 | print(final['Question'].head()) 314 | 315 | ans_list = getting_answer(final) 316 | final['Answers'] = ans_list 317 | print(final['Answers'].head()) 318 | 319 | final = final.drop(['Unnamed: 0', 'Unnamed: 0.1', 'Average_vector', 'answer_count', 'comment_count', 'view_count'], axis=1) 320 | 321 | dic = final.head().to_dict() 322 | print(time.time()-t1) 323 | 324 | return render_template('result.html' , answer = dic) 325 | 326 | if __name__ == '__main__': 327 | app.run(debug=True) 328 | 329 | -------------------------------------------------------------------------------- /testing.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from nltk.tokenize import RegexpTokenizer 4 | from nltk.corpus import stopwords 5 | import re 6 | import os 7 | from symspellpy.symspellpy import SymSpell, Verbosity 8 | from gensim.models import KeyedVectors 9 | import time 10 | from scipy import spatial 11 | from datetime import datetime 12 | 13 | import sys 14 | 15 | print(sys.stdout.encoding) 16 | print(u"Stöcker".encode(sys.stdout.encoding, errors='replace')) 17 | print(u"Стоескер".encode(sys.stdout.encoding, errors='replace')) 18 | 19 | model = KeyedVectors.load_word2vec_format('D:/ML/QNA_project/model/GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin', binary=True) 20 | print('Model Loaded') 21 | keyword_data = pd.read_csv('D:/ML/QNA_project/CSV_files/keywords.csv') 22 | filter_data = pd.read_csv('D:/ML/QNA_project/CSV_files/filters.csv') 23 | keys_data = pd.read_csv('D:/ML/QNA_project/CSV_files/final_words_keys3.csv') 24 | answer_data = pd.read_csv('D:/ML/QNA_project/CSV_files/answers.csv') 25 | vector_data = pd.read_csv('D:/ML/QNA_project/CSV_files/final_question_vector.csv') 26 | print('All CSV Files readed') 27 | 28 | 29 | def remove_duplicates(my_list): 30 | return list(set(my_list)) 
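# ---------------------------------------------------------------------------
# Editor's sketch (not part of the original repository). The SymSpell setup
# block -- pick max_edit_distance_dictionary / prefix_length, construct a
# SymSpell object, load dictionary_final.txt, and check the return value --
# is repeated several times in server.py and again further down in this file
# with different parameters. Assuming only the symspellpy calls already used
# in this repo (SymSpell(...) and load_dictionary(...)), a small helper could
# replace each repetition; the name make_sym_spell is hypothetical, and it
# relies on the `os` and `SymSpell` imports at the top of this file.
def make_sym_spell(max_edit_distance, prefix_length):
    """Build a SymSpell instance with dictionary_final.txt already loaded."""
    sym = SymSpell(max_edit_distance, prefix_length)
    dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary_final.txt")
    # term is in column 0, term frequency in column 1 of the dictionary file
    if not sym.load_dictionary(dictionary_path, 0, 1):
        print("Dictionary file not found")
    return sym
# Hypothetical usage, mirroring the parameter pairs used in the blocks below:
#   sym_spell = make_sym_spell(2, 9)   # before spell_correction / lookup
#   sym_spell = make_sym_spell(0, 7)   # before word_segmentation
# ---------------------------------------------------------------------------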
31 | 32 | 33 | def remove_stopwords(words): 34 | stop_words = set(stopwords.words('english')) 35 | wx = [w for w in words if not w in stop_words] ## Removing Stopwords 36 | return wx 37 | 38 | 39 | def spell_correction(words): 40 | s = "" 41 | print(words) 42 | for i in words: 43 | suggestions = sym_spell.lookup(i, suggestion_verbosity, max_edit_distance_lookup) 44 | try: 45 | s = s + suggestions[0].term + " " 46 | except: 47 | s = s + i + " " 48 | s = s[:-1] 49 | print(s) 50 | w = s.split(' ') 51 | w = list(set(w)) 52 | return w 53 | 54 | 55 | def word_segmentation(words): 56 | final = [] 57 | for i in words: 58 | input_term = i 59 | try: 60 | result = sym_spell.word_segmentation(input_term) 61 | w = (result.corrected_string).split(' ') 62 | final = final + w 63 | except: 64 | pass 65 | final = list(set(final)) 66 | print('Segmented') 67 | print(final) 68 | return final 69 | 70 | 71 | def preprocessing(text): 72 | text = text.lower() 73 | text = re.sub(r"http\S+", "", text) # get rid of URLs 74 | text = re.sub('[0-9]', "", text) 75 | text = re.sub(r'[{}@_*>()\\#%+=\[\]\-]', ' ', text) 76 | text = re.sub('\(|\)|\[|\]', ' ', text) 77 | text = re.sub('a0', '', text) 78 | text = re.sub('\.', '. ', text) 79 | text = re.sub('\!', '! ', text) 80 | text = re.sub('\?', '? ', text) 81 | text = re.sub(' +', ' ', text) 82 | return text 83 | 84 | 85 | def vectors(words): 86 | w = [] 87 | for i in words: 88 | try: 89 | vector = model[i] 90 | except: 91 | vector = np.zeros(300) 92 | vector = vector.tolist() 93 | w.append(vector) 94 | return w 95 | 96 | 97 | def average_vector(vectors): 98 | v = np.zeros(300) 99 | x = np.zeros(300) 100 | n = len(vectors) 101 | for i in vectors: 102 | i = np.array(i) 103 | if (i == x).all(): 104 | n = n - 1 105 | else: 106 | v = v + i 107 | v = v / n 108 | v = v.tolist() 109 | return v 110 | 111 | 112 | def match(s): 113 | s = re.sub('\[|\]|\,|\'', '', s) 114 | words = s.split(' ') 115 | vec1 = [] 116 | for i in words: 117 | vec1.append(float(i)) 118 | 119 | result = 1 - spatial.distance.cosine(vec1, avg_v) 120 | return result 121 | 122 | def common_keywords(text_q): 123 | print(w_in_ques) 124 | # text_q = text_q.lower() 125 | # w = text_q.split(' ') 126 | # 127 | # max_edit_distance_dictionary = 2 128 | # prefix_length = 9 129 | # sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length) 130 | # dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary_final.txt") 131 | # term_index = 0 # column of the term in the dictionary text file 132 | # count_index = 1 # column of the term frequency in the dictionary text file 133 | # 134 | # if not sym_spell.load_dictionary(dictionary_path, term_index, count_index): 135 | # print("Dictionary file not found") 136 | # 137 | # max_edit_distance_lookup = 2 138 | # suggestion_verbosity = Verbosity.CLOSEST 139 | # 140 | # ques = "" 141 | # for input in w: 142 | # suggestions = sym_spell.lookup(input, max_edit_distance_lookup) 143 | # try: 144 | # ques = ques + suggestions[0].term + " " 145 | # except: 146 | # ques = ques + input + " " 147 | # ques = ques + text_q 148 | # print(ques) 149 | 150 | # input_ques = text_cw 151 | # input_ques = input_ques.lower() 152 | # words_input = input_ques.split(' ') 153 | # 154 | # in_ques = '' 155 | # for input in words_input: 156 | # suggestions = sym_spell.lookup(input, max_edit_distance_lookup) 157 | # try: 158 | # in_ques = in_ques + suggestions[0].term + " " 159 | # except: 160 | # in_ques = in_ques + input + " " 161 | # in_ques = in_ques + input_ques 162 | # print(in_ques) 163 
| 164 | text_q = preprocessing(text_q) 165 | 166 | tokenizer = RegexpTokenizer(r'\w+') 167 | words_cw = tokenizer.tokenize(text_q) 168 | 169 | words_cw = remove_duplicates(words_cw) 170 | 171 | words_cw = remove_stopwords(words_cw) 172 | 173 | max_edit_distance_dictionary = 2 174 | prefix_length = 9 175 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length) 176 | dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary_final.txt") 177 | term_index = 0 # column of the term in the dictionary text file 178 | count_index = 1 # column of the term frequency in the dictionary text file 179 | 180 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index): 181 | print("Dictionary file not found") 182 | 183 | max_edit_distance_lookup = 2 184 | suggestion_verbosity = Verbosity.CLOSEST 185 | 186 | words_cw = spell_correction(words_cw) 187 | 188 | max_edit_distance_dictionary = 0 189 | prefix_length = 7 190 | # create object 191 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length) 192 | # load dictionary 193 | dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary_final.txt") 194 | term_index = 0 # column of the term in the dictionary text file 195 | count_index = 1 # column of the term frequency in the dictionary text file 196 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index): 197 | print("Dictionary file not found") 198 | 199 | words_cw = word_segmentation(words_cw) 200 | 201 | max_edit_distance_dictionary = 2 202 | prefix_length = 9 203 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length) 204 | dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary_final.txt") 205 | term_index = 0 # column of the term in the dictionary text file 206 | count_index = 1 # column of the term frequency in the dictionary text file 207 | 208 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index): 209 | print("Dictionary file not found") 210 | 211 | max_edit_distance_lookup = 2 212 | suggestion_verbosity = Verbosity.CLOSEST 213 | 214 | words_cw = spell_correction(words_cw) 215 | 216 | words_cw = remove_stopwords(words_cw) 217 | 218 | w_ques = words_cw 219 | print(w_ques) 220 | w1 = [] 221 | w2 = [] 222 | # for i in range(len(keyword_data)): 223 | # str = keyword_data['Keywords'][i] 224 | # str = str.lower() 225 | # if (ques.find(str, 0, len(str)) != -1): 226 | # w1.append(str) 227 | # if (in_ques.find(str, 0, len(str)) != -1): 228 | # w2.append(str) 229 | # 230 | # for i in range(len(filter_data)): 231 | # str = filter_data['Filters'][i] 232 | # str = str.lower() 233 | # if (ques.find(str, 0, len(str)) != -1): 234 | # w1.append(str) 235 | # if (in_ques.find(str, 0, len(str)) != -1): 236 | # w2.append(str) 237 | 238 | for i in range(len(keys_data)): 239 | ss = keys_data['Final_filters'][i] 240 | # print(i) 241 | ss = str(ss) 242 | ss = ss.lower() 243 | # w_ques = ques.split() 244 | # w_ques = list(set(w_ques)) 245 | # w_in_ques = in_ques.split() 246 | # w_in_ques = list(set(w_in_ques)) 247 | if ss in w_ques: 248 | w1.append(ss) 249 | if ss in w_in_ques: 250 | w2.append(ss) 251 | 252 | 253 | common = w2 + w1 254 | common_d = list(set(common)) 255 | x = len(common) - len(common_d) 256 | print(common) 257 | print(common_d) 258 | print(x) 259 | return x 260 | 261 | 262 | def common_keyword_new(s): 263 | s = re.sub('\[|\]|\,|\'', '', s) 264 | w_ques = s.split(' ') 265 | print(w_ques) 266 | print(w_in_ques) 267 | w1=[] 268 | w2=[] 269 | for i in range(len(keys_data)): 270 | ss = 
keys_data['Final_filters'][i] 271 | # print(i) 272 | ss = str(ss) 273 | ss = ss.lower() 274 | if ss in w_ques: 275 | w1.append(ss) 276 | if ss in w_in_ques: 277 | w2.append(ss) 278 | 279 | 280 | common = w2 + w1 281 | common_d = list(set(common)) 282 | x = len(common) - len(common_d) 283 | print(common) 284 | print(common_d) 285 | print(x) 286 | return x 287 | 288 | def getting_answer(final): 289 | answers_list = [] 290 | for j in range(len(final)): 291 | id = final.iloc[j]['ID'] 292 | id = int(id) 293 | try: 294 | req = answer_data.loc[answer_data['question_id'] == id] 295 | max = -1 296 | id = req.iloc[0]['ID'] 297 | 298 | for i in range(len(req)): 299 | up_c = int(req.iloc[i]['upvote_count']) 300 | cm_c = int(req.iloc[i]['comment_count']) 301 | if up_c + cm_c > max: 302 | max = up_c + cm_c 303 | id = req.iloc[i]['ID'] 304 | 305 | ans = req.loc[id-1]['Answers'] 306 | answers_list.append(ans) 307 | except: 308 | answers_list.append("Answer not available") 309 | return answers_list 310 | 311 | 312 | def str_to_list(s): 313 | s = re.sub('\[|\]|\,|\'', '', s) 314 | words = s.split(' ') 315 | return words 316 | 317 | 318 | def length_of_c_KW(l_words): 319 | rd = list(set(l_words)) 320 | return len(l_words)-len(rd) 321 | 322 | def add(words): 323 | words = words + w_in_ques 324 | return words 325 | 326 | # results_file = open("Results4.txt", "w") 327 | 328 | j=36 329 | while True: 330 | # text = "Why upsee is making compulsory for the students to get admission in allotted college if they want to take part in fifth round counselling.." 331 | text = input("Enter Your Question: ") 332 | 333 | # results_file.write('{} Test Question'.format(j)) 334 | # results_file.write('\n') 335 | # results_file.write(text + '\n') 336 | # results_file.write('\n') 337 | 338 | t1 = time.time() 339 | text_cw = text 340 | 341 | 342 | text = preprocessing(text) 343 | 344 | tokenizer = RegexpTokenizer(r'\w+') 345 | words = tokenizer.tokenize(text) 346 | 347 | words = remove_duplicates(words) 348 | 349 | words = remove_stopwords(words) 350 | 351 | max_edit_distance_dictionary = 2 352 | prefix_length = 9 353 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length) 354 | dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary_final.txt") 355 | term_index = 0 # column of the term in the dictionary text file 356 | count_index = 1 # column of the term frequency in the dictionary text file 357 | 358 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index): 359 | print("Dictionary file not found") 360 | 361 | max_edit_distance_lookup = 2 362 | suggestion_verbosity = Verbosity.CLOSEST 363 | 364 | words = spell_correction(words) 365 | 366 | max_edit_distance_dictionary = 0 367 | prefix_length = 7 368 | # create object 369 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length) 370 | # load dictionary 371 | dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary_final.txt") 372 | term_index = 0 # column of the term in the dictionary text file 373 | count_index = 1 # column of the term frequency in the dictionary text file 374 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index): 375 | print("Dictionary file not found") 376 | 377 | words = word_segmentation(words) 378 | 379 | max_edit_distance_dictionary = 2 380 | prefix_length = 9 381 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length) 382 | dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary_final.txt") 383 | term_index = 0 # column of the term in the 
dictionary text file 384 | count_index = 1 # column of the term frequency in the dictionary text file 385 | 386 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index): 387 | print("Dictionary file not found") 388 | 389 | max_edit_distance_lookup = 2 390 | suggestion_verbosity = Verbosity.CLOSEST 391 | 392 | words = spell_correction(words) 393 | 394 | words = remove_stopwords(words) 395 | w_in_ques = words 396 | vs = vectors(words) 397 | 398 | avg_v = average_vector(vs) 399 | 400 | # TODO : 1. correction. 2, Make Generic thru user input 401 | now_date = datetime.now() 402 | now_date = now_date.strftime("%d/%m/%Y %H:%M:%S") 403 | filter_date_q = now_date 404 | 405 | 406 | 407 | 408 | 409 | 410 | vector_data['processed_words_list'] = vector_data['processed_words'].apply(str_to_list) 411 | 412 | vector_data['processed_words_list'] = vector_data['processed_words_list'].apply(add) 413 | 414 | print('adding done') 415 | 416 | vector_data['length'] = vector_data['processed_words_list'].apply(length_of_c_KW) 417 | 418 | new_vector_data = vector_data[vector_data['length']!=0] 419 | 420 | print('NEW LENGTH {}'.format(len(new_vector_data))) 421 | 422 | 423 | new_vector_data['Similarity'] = new_vector_data['Average_vector'].apply(match) 424 | 425 | # vector_data = vector_data[vector_data['modified_on']