├── dictionary_final.txt
├── final_dictionary.txt
├── templates
│   ├── result.html
│   └── index.html
├── test.py
├── requirements.txt
├── add_dict.py
├── word_seg.py
├── left_words.txt
├── add_words_dictionary.py
├── extract_database.py
├── seperate_words.py
├── correction_count.py
├── Notes.txt
├── correction.py
├── processing_original_question.py
├── README.md
├── one_time.py
├── word2vec.py
├── matching.py
├── server.py
├── testing.py
├── common_words.txt
└── common_words2.txt
/dictionary_final.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sparsh-Bansal/QNA_project/HEAD/dictionary_final.txt
--------------------------------------------------------------------------------
/final_dictionary.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sparsh-Bansal/QNA_project/HEAD/final_dictionary.txt
--------------------------------------------------------------------------------
/templates/result.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | {{answer}}
5 |
6 |
--------------------------------------------------------------------------------
/templates/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
8 |
9 |
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | data = pd.read_csv('D:/ML/QNA_project/CSV_files/final_words_keys2.csv')
4 | list2 = data['Final_filters'].to_list()
5 |
6 | file = open('D:/ML/QNA_project/left_words.txt','r')
7 | list1 = file.read().split('\n')
8 | file.close()
9 |
10 | w=[]
11 | i=0
12 | for item in list2:
13 | print(i)
14 | i=i+1
15 | if item in list1:
16 | continue
17 | else:
18 | w.append(item)
19 |
20 | print(len(w))
21 | df = pd.DataFrame(w,columns=['Final_filters'])
22 | df.to_csv('D:/ML/QNA_project/CSV_files/final_words_keys3.csv')
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | boto==2.49.0
2 | boto3==1.9.243
3 | botocore==1.12.243
4 | certifi==2019.9.11
5 | chardet==3.0.4
6 | Click==7.0
7 | docutils==0.15.2
8 | Flask==1.1.1
9 | gensim==3.8.1
10 | idna==2.8
11 | itsdangerous==1.1.0
12 | Jinja2==2.10.3
13 | jmespath==0.9.4
14 | MarkupSafe==1.1.1
15 | nltk==3.4.5
16 | numpy==1.17.2
17 | pandas==0.25.1
18 | PyMySQL==0.9.3
19 | python-dateutil==2.8.0
20 | pytz==2019.2
21 | requests==2.22.0
22 | s3transfer==0.2.1
23 | scipy==1.3.1
24 | six==1.12.0
25 | smart-open==1.8.4
26 | symspellpy==6.5.0
27 | urllib3==1.25.6
28 | Werkzeug==0.16.0
29 | wincertstore==0.2
30 |
--------------------------------------------------------------------------------
/add_dict.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from symspellpy.symspellpy import SymSpell # import the module
4 |
5 |
6 | def main():
7 | # maximum edit distance per dictionary precalculation
8 | max_edit_distance_dictionary = 2
9 | prefix_length = 7
10 | # create object
11 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
12 |
13 | # create dictionary using corpus.txt
14 | if not sym_spell.create_dictionary('D:/ML/QNA_project/corpus.txt'):
15 | print("Corpus file not found")
16 | return
17 |
18 | for key, count in sym_spell.words.items():
19 | print("{} {}".format(key, count))
20 |
21 |
22 | if __name__ == "__main__":
23 | main()
--------------------------------------------------------------------------------
/word_seg.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from symspellpy.symspellpy import SymSpell # import the module
4 |
5 |
6 | def main():
7 | # maximum edit distance per dictionary precalculation
8 | max_edit_distance_dictionary = 0
9 | prefix_length = 7
10 | # create object
11 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
12 | # load dictionary
13 | dictionary_path = os.path.join(os.path.dirname(__file__),"dictionary_final.txt")
14 | term_index = 0 # column of the term in the dictionary text file
15 | count_index = 1 # column of the term frequency in the dictionary text file
16 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
17 | print("Dictionary file not found")
18 | return
19 |
20 | # a sentence without any spaces
21 | input_term = "bangalore"
22 | # input_term = "thequickbrownfoxjumpsoverthelazydog"
23 | # input_term = 'universitycollegesbangalore'
24 | result = sym_spell.word_segmentation(input_term)
25 | x = result.corrected_string.split(' ')
26 | # display suggestion term, term frequency, and edit distance
27 | print(x)
28 | print("{}, {}, {}".format(result.corrected_string, result.distance_sum,
29 | result.log_prob_sum))
30 |
31 |
32 | if __name__ == "__main__":
33 | main()
--------------------------------------------------------------------------------
/left_words.txt:
--------------------------------------------------------------------------------
1 | gamble
2 | ambition
3 | personal
4 | beyond
5 | activity
6 | ultimate
7 | late
8 | emphasis
9 | chase
10 | transmitted
11 | brakes
12 | wits
13 | desire
14 | issue
15 | aah
16 | risks
17 | detail
18 | blogs
19 | capability
20 | body
21 | duty
22 | home
23 | demand
24 | flying
25 | change
26 | secure
27 | ear
28 | keeping
29 | look
30 | joy
31 | rate
32 | name
33 | posts
34 | idea
35 | young
36 | cash
37 | text
38 | safe
39 | bond
40 | god
41 | sleep
42 | including
43 | solid
44 | little
45 | sum
46 | specific
47 | glorious
48 | tips
49 | blank
50 | gold
51 | block
52 | technique
53 | smile
54 | support
55 | inspired
56 | duties
57 | abilities
58 | wings
59 | inwards
60 | learn
61 | manage
62 | task
63 | real
64 | word
65 | scope
66 | object
67 | supreme
68 | said
69 | mutual
70 | bad
71 | help
72 | differently
73 | key
74 | use
75 | making
76 | view
77 | bag
78 | time
79 | step
80 | sure
81 | total
82 | light
83 | wake
84 | improvement
85 | record
86 | special
87 | crack
88 | live
89 | great
90 | black
91 | share
92 | bed
93 | progress
94 | pass
95 | created
96 | means
97 | dream
98 | case
99 | thought
100 | aware
101 | ran
102 | aspire
103 | purpose
104 | perfect
105 | grant
106 | next
107 | dress
108 | check
109 | call
110 | carry
111 | edge
112 | soft
113 | pure
114 | smart
115 | built
116 | think
117 | ask
118 | genius
119 | door
120 | yes
121 | made
122 | self
123 | free
124 | usage
125 | lead
126 | hope
127 | complex
128 | path
129 | insist
130 | good
131 | go
132 | heavy
133 | stop
134 | gonna
135 | living
136 | peace
137 | important
138 | make
139 | action
140 | big
141 | full
142 | best
143 | get
144 | useful
145 | catch
146 | greater
147 | related
148 | nice
149 | classic
150 | happy
151 | work
--------------------------------------------------------------------------------
/add_words_dictionary.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import math
3 | import collections
4 |
5 | data1 = pd.read_csv('D:/ML/QNA_project/CSV_files/final_words_keys.csv') # Keywords
6 |
7 | data2 = pd.read_csv('D:/ML/QNA_project/CSV_files/final_words_total2.csv') # Questions
8 |
9 | count = data2['Total_words'].value_counts()
10 | """file = open('D:/ML/QNA_project/dictionary_words.txt','w') """
11 |
12 | for i in range(len(data1)):
13 | print(i)
14 |     # skip rows whose keyword is NaN (row 79 in the original CSV)
15 |     if pd.isna(data1['Final_filters'][i]):
16 |         continue
21 | else:
22 | try:
23 | x = count[data1['Final_filters'][i]]
24 | except:
25 | x=0
26 |
27 | x = x + 2022459848
28 | x = str(x)
29 | s = data1['Final_filters'][i] + " " + x
30 | file.write(s)
31 | file.write('\n')
32 |
33 | file.close()
34 |
35 |
36 | """file = open('D:/ML/QNA_project/dictionary_words.txt','r')"""
37 | data = file.read().split('\n')
38 | file.close()
39 | m = {}
40 | for i in range(len(data)-1):
41 | print("s {}".format(i))
42 |
43 | w = data[i].split(' ')
44 | m[w[0]]=w[1]
45 |
46 | sorted_x = sorted(m.items(), key=lambda kv: int(kv[1]))
47 | print('sorted by frequency')
48 | sorted_dict = collections.OrderedDict(sorted_x)
49 |
50 | """file2 = open('D:/ML/QNA_project/dictionary_words.txt','w')"""
51 |
52 | for key , value in sorted_dict.items():
53 | file2.write(key+" "+value)
54 | file2.write('\n')
55 |     print('complete')
56 | file2.close()
57 |
58 | """file1 = open('D:/ML/QNA_project/dictionary_words.txt','r')"""
59 | """file2 = open('D:/ML/QNA_project/final_dictionary.txt','w')"""
60 | data = file1.read().split('\n')
61 | file1.close()
62 | for i in range(len(data)-1,-1,-1):
63 | file2.write(data[i])
64 | file2.write('\n')
65 |
66 | file2.close()
--------------------------------------------------------------------------------
/extract_database.py:
--------------------------------------------------------------------------------
1 | import pymysql
2 | import pandas as pd
3 |
4 | db = pymysql.connect("localhost","root","12343249","sparsh" )
5 | cursor = db.cursor()
6 |
7 | def ex_questions_answers(query,file_path):
8 |
9 | sql = query
10 |
11 | try:
12 | cursor.execute(sql)
13 | data = cursor.fetchall()
14 | db.commit()
15 | except:
16 | db.rollback()
17 | #
18 | # file_ques = open('D:/ML/QNA_project/text_files/questions.txt','w')
19 | #
20 | # for i in range(len(data)):
21 | # d = ('{}. '.format(i + 1) + data[i][0]).encode('utf-8')
22 | # file_ques.write(str(d))
23 | # file_ques.write('\n')
24 | #
25 | # file_ques.close()
26 | #
27 | o_d = [data[i][0] for i in range(len(data))]
28 | df = pd.DataFrame(o_d,columns=['Answers'])
29 | df.to_csv(file_path)
30 |
31 |
32 | def extract_keywords_filter(query , file_path):
33 | sql = query
34 | try:
35 |
36 | cursor.execute(sql)
37 | data = cursor.fetchall()
38 | db.commit()
39 | except:
40 | db.rollback()
41 |
42 | o_d = [data[i] for i in range(len(data))]
43 | df = pd.DataFrame(o_d,columns=['Entity','Keywords'])
44 | df.to_csv(file_path)
45 |
46 | def ex_view_count(query,file_path):
47 |
48 | sql = query
49 |
50 | try:
51 | cursor.execute(sql)
52 | data = cursor.fetchall()
53 | db.commit()
54 | except:
55 | db.rollback()
56 |
57 | df = pd.DataFrame(data, columns=['ID', 'Answers', 'question_id', 'modified_on', 'upvote_count', 'comment_count'])
58 | df.to_csv(file_path)
59 |
60 |
61 | # ex_questions_answers('select text from sparsh.question_answers','D:/ML/QNA_project/CSV_files/answers.csv')
62 | # ex_questions_answers('select title from sparsh.questions','D:/ML/QNA_project/CSV_files/answers.csv')
63 | # extract_keywords_filter('select entity_type , keyword from sparsh.keywords ','D:/ML/QNA_project/CSV_files/keywords.csv')
64 | ex_view_count('select id ,text,question_id,modified_on , upvote_count , comment_count from sparsh.question_answers ' , 'D:/ML/QNA_project/CSV_files/answers.csv')
65 |
66 | db.close()
--------------------------------------------------------------------------------
/seperate_words.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import re
3 | import time
4 |
5 |
6 | def original():
7 |
8 | data = pd.read_csv("D:/ML/QNA_project/CSV_files/words_total.csv")
9 |
10 | # t1 = time.time()
11 | s = ""
12 | # s = data.head()['tokens'].sum()
13 | print(data.head())
14 |     for i in range(len(data.head())):  # NOTE: head() limits this loop to the first 5 rows
15 | print(i)
16 | s = s + data['tokens'][i]+" "
17 | s = s[:-1]
18 |
19 | print(s)
20 | # t2 = time.time()
21 |
22 | # print(t2-t1)
23 |
24 | s = re.sub('\[|\]|\,|\'', '', s)
25 | print(s)
26 | words = s.split(' ')
27 | print(words)
28 | df = pd.DataFrame(words,columns=['Total_words'])
29 | df.to_csv('D:/ML/QNA_project/CSV_files/final_words_total.csv')
30 |
31 |
32 | words = list(set(words))
33 |
34 | df2 = pd.DataFrame(words,columns=['Final_words'])
35 | df2.to_csv('D:/ML/QNA_project/CSV_files/final_words_total_rd.csv')
36 |
37 |
38 |
39 | def keywords_filters(path_to_data):
40 | # data = pd.read_csv("D:/ML/QNA_project/CSV_files/words_filters.csv")
41 | data = pd.read_csv(path_to_data)
42 | # s = data.head()['tokens'].sum()
43 | s = ""
44 | print(data.head())
45 | for i in range(len(data)):
46 | print(i)
47 | s = s + data['tokens'][i]+" "
48 | s = s[:-1]
49 | # print(t2-t1)
50 |
51 | s = re.sub('\[|\]|\,|\'', '', s)
52 | words = s.split(' ')
53 | words = list(set(words))
54 |
55 | s2 = ""
56 | for i in range(len(data)):
57 | print(i)
58 | s2 = s2+data['Entity'][i]+" "
59 |
60 | s2=s2[:-1]
61 |
62 |
63 | words2 = s2.split(' ')
64 | words2 = list(set(words2))
65 | final = words +words2
66 | final = list(set(final))
67 | return final
68 |
69 | def combine():
70 | word_k = keywords_filters('D:/ML/QNA_project/CSV_files/words_keywords.csv')
71 | word_f = keywords_filters('D:/ML/QNA_project/CSV_files/words_filters.csv')
72 | print(word_k)
73 | print(word_f)
74 | total = word_f + word_k
75 | total = list(set(total))
76 | df = pd.DataFrame(total,columns=['Final_filters'])
77 | df.to_csv('D:/ML/QNA_project/CSV_files/final_words_keys2.csv')
78 |
79 |
80 | # original()
81 | combine()
--------------------------------------------------------------------------------
/correction_count.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | from symspellpy.symspellpy import SymSpell, Verbosity # import the module
4 |
5 | def main():
6 | # maximum edit distance per dictionary precalculation
7 | max_edit_distance_dictionary = 2
8 | prefix_length = 9
9 | # data = pd.read_csv('D:/ML/QNA_project/CSV_files/final_words_total_rd2.csv')
10 |
11 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
12 |
13 | dictionary_path = os.path.join(os.path.dirname(__file__),"dictionary_final.txt")
14 | term_index = 0 # column of the term in the dictionary text file
15 | count_index = 1 #
16 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
17 | print("Dictionary file not found")
18 | return
19 | # lookup suggestions for single-word input strings
20 |
21 |     # input_term = "agricultr" # misspelling of "agriculture"
22 | # max edit distance per lookup
23 | # (max_edit_distance_lookup <= max_edit_distance_dictionary)
24 | max_edit_distance_lookup = 2
25 |
26 | suggestion_verbosity = Verbosity.CLOSEST # TOP, CLOSEST, ALL
27 | s = ""
28 | # print('original')
29 | # print(len(words))
30 | # for i in range(len(data)):
31 | # # print(i)
32 | # if i==0 or i==51124 or i==65070:
33 | # continue
34 | # input_term = data['Final_words'][i]
35 | # suggestions = sym_spell.lookup(input_term, suggestion_verbosity,
36 | # max_edit_distance_lookup)
37 | # print(i)
38 | # try:
39 | # s = s + str(suggestions[0].term)+" "
40 | # except:
41 | # s = s+ input_term
42 | #
43 | # s = s[:-1]
44 | # words = s.split(' ')
45 | # # print(len(words))
46 | # print('After')
47 | # print(len(words))
48 | # for suggestion in suggestions:
49 | # print("{}, {}, {}".format(suggestion.term, suggestion.distance,
50 | # suggestion.count))
51 |
52 | # input_term = ("whereis th elove hehad dated forImuch of thepast who "
53 | # "couqdn'tread in sixtgrade and ins pired him")
54 | input_term = 'live'
55 | # max_edit_distance_lookup = 2
56 | suggestions = sym_spell.lookup_compound(input_term,max_edit_distance_lookup)
57 | for suggestion in suggestions:
58 | print("{}, {}, {}".format(suggestion.term, suggestion.distance,
59 | suggestion.count))
60 | if __name__ == "__main__":
61 | main()
--------------------------------------------------------------------------------
/Notes.txt:
--------------------------------------------------------------------------------
1 | Embeddings
2 | word2vec and GloVe
3 | Tokenizer
4 | Lemmatizer
5 | NLTK
6 | Stopwords
7 | Spell Correction
8 | RASA
9 | TAG Extraction
10 | Finding Similar Sentences
11 | Intent and Entity Recognition
12 | Levenshtein distance
13 |
14 |
15 |
16 | Norvig's algorithm ........ from collections import Counter  https://towardsdatascience.com/correcting-your-spelling-error-with-4-operations-50bcfd519bb8
17 | Symspell https://towardsdatascience.com/essential-text-correction-process-for-nlp-tasks-f731a025fcc3
18 |
19 | N-gram Analysis
20 | Dictionary Lookup https://pdfs.semanticscholar.org/c64f/1bd3a1bd7f7fe4cadc469b4b94c45ad12b5d.pdf
21 |
22 | A simple spell checker ................https://blog.usejournal.com/a-simple-spell-checker-built-from-word-vectors-9f28452b6f26
23 |
24 | python pix2pix.py --mode train --output_dir facades_train --max_epochs 200 --input_dir facades/train --which_direction BtoA
25 |
26 | python pix2pix.py --mode test --output_dir facades_test --input_dir facades/val --checkpoint facades_train
27 |
28 |
29 | view_count
30 | modified_on
31 | answer_count
32 | comment_count
33 |
34 |
35 | keywords - only on similar cosine ranking
36 |
37 |
38 | final_ranking = w1*cosine_similarity + w2*(view_count_diff) + w3() .....
39 |
40 |
41 | modified_on
42 | upvote_count
43 | comment_count
44 |
45 |
46 | ..extract_database.py
47 | matching.py
48 | word2vec.py
49 | ..seperate_words.py
50 | ..add_words_dictionary.py
51 | ..processing_original_question.py
52 |
53 | Parameters
54 | margin = 0.02
55 | keyword_weight = 50
56 | sum_weight = 50
57 | date
58 |
59 |
60 | id,question,answers,modified_on,similarity,common_keyword,sum3,final_score
61 |
62 | results_file = open("Results.txt","w")
63 |
64 | for i in range(len(results_final)):
65 | print("{} Question".format(i+1))
66 | results_file.write("{} Question\n".format(i+1))
67 | print(results_final.iloc[i]["Question"])
68 | results_file.write(results_final.iloc[i]["Question"])
69 | print("Similarity Score : {}".format(results_final.iloc[i]['final_score']))
70 | results_file.write("Similarity Score : {}".format(results_final.iloc[i]['final_score']))
71 | print('Answer')
72 | results_file.write('Answer')
73 | print(results_final.iloc[i]['Answers'])
74 | results_file.write(results_final.iloc[i]['Answers'])
75 |
76 |
77 | I am weak in mathmatcs but i-want to....do civilengineering!!!!....then how can i manage....????
78 |
79 |
80 |
81 |
--------------------------------------------------------------------------------
/correction.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | from symspellpy.symspellpy import SymSpell, Verbosity # import the module
4 |
5 | def main():
6 | # maximum edit distance per dictionary precalculation
7 | max_edit_distance_dictionary = 2
8 | prefix_length = 9
9 | data = pd.read_csv('D:/ML/QNA_project/CSV_files/final_words_total_rd2.csv')
10 |
11 | # create object
12 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
13 | # load dictionary
14 | dictionary_path = os.path.join(os.path.dirname(__file__),"frequency_dictionary_en_82_765.txt")
15 | term_index = 0 # column of the term in the dictionary text file
16 | count_index = 1 # column of the term frequency in the dictionary text file
17 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
18 | print("Dictionary file not found")
19 | return
20 | # lookup suggestions for single-word input strings
21 |
22 |     # input_term = "agricultr" # misspelling of "agriculture"
23 | # max edit distance per lookup
24 | # (max_edit_distance_lookup <= max_edit_distance_dictionary)
25 | max_edit_distance_lookup = 2
26 |
27 | suggestion_verbosity = Verbosity.CLOSEST # TOP, CLOSEST, ALL
28 | s = ""
29 | print('original')
30 | # print(len(words))
31 | for i in range(len(data)):
32 | # print(i)
33 |         if i==0 or i==51124 or i==65070:   # skip rows that break the lookup (likely NaN entries)
34 | continue
35 | input_term = data['Final_words'][i]
36 | suggestions = sym_spell.lookup(input_term, suggestion_verbosity,
37 | max_edit_distance_lookup)
38 | print(i)
39 | try:
40 | s = s + str(suggestions[0].term)+" "
41 | except:
42 | s = s+ input_term
43 |
44 | s = s[:-1]
45 | words = s.split(' ')
46 | # print(len(words))
47 | print('After')
48 | print(len(words))
49 | # for suggestion in suggestions:
50 | # print("{}, {}, {}".format(suggestion.term, suggestion.distance,
51 | # suggestion.count))
52 |
53 | # lookup suggestions for multi-word input strings (supports compound
54 | # splitting & merging)
55 | # input_term = ("whereis th elove hehad dated forImuch of thepast who "
56 | # "couqdn'tread in sixtgrade and ins pired him")
57 | # # input_term = 'he lives in bngalre'
58 | # max_edit_distance_lookup = 2
59 | # suggestions = sym_spell.lookup_compound(input_term,
60 | # max_edit_distance_lookup)
61 | # for suggestion in suggestions:
62 | # print("{}, {}, {}".format(suggestion.term, suggestion.distance,
63 | # suggestion.count))
64 | if __name__ == "__main__":
65 | main()
--------------------------------------------------------------------------------
/processing_original_question.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from nltk.tokenize import RegexpTokenizer
3 | from nltk.corpus import stopwords
4 | import re
5 |
6 | data = pd.read_csv('D:/ML/QNA_project/CSV_files/questions.csv')
7 |
8 | stop_words = set(stopwords.words('english'))
9 |
10 | def remove_stopwords(words):
11 | wx = [w for w in words if not w in stop_words] ## Removing Stopwords
12 | return wx
13 |
14 |
15 | def remove_duplicates(my_list):
16 | return list(set(my_list))
17 |
18 |
19 | def standardize_text(df, text_field):
20 | df[text_field] = df[text_field].str.lower()
21 | df[text_field] = df[text_field].apply(lambda elem: re.sub(r"http\S+", "", str(elem))) # get rid of URLs
22 | df[text_field] = df[text_field].apply(lambda elem: re.sub('[0-9]', "", str(elem)))
23 | df[text_field] = df[text_field].apply(lambda elem: re.sub(r'[{}@_*>()\\#%+=\[\]\-]',' ', str(elem)))
24 | df[text_field] = df[text_field].apply(lambda elem: re.sub('\(|\)|\[|\]',' ', str(elem)))
25 | df[text_field] = df[text_field].apply(lambda elem: re.sub('a0','', str(elem)))
26 | df[text_field] = df[text_field].apply(lambda elem: re.sub('\.','. ', str(elem)))
27 | df[text_field] = df[text_field].apply(lambda elem: re.sub('\!','! ', str(elem)))
28 | df[text_field] = df[text_field].apply(lambda elem: re.sub('\?','? ', str(elem)))
29 | df[text_field] = df[text_field].apply(lambda elem: re.sub(' +',' ', str(elem)))
30 | return df
31 |
32 |
33 | def from_questions_csv():
34 |
35 | data = pd.read_csv('D:/ML/QNA_project/CSV_files/questions.csv')
36 | clean_questions = standardize_text(data, "Question")
37 | # print(clean_questions.head())
38 | tokenizer = RegexpTokenizer(r'\w+')
39 | clean_questions["tokens"] = clean_questions["Question"].apply(tokenizer.tokenize) #Tokenization
40 | clean_questions.to_csv('D:/ML/QNA_project/CSV_files/words_total.csv')
41 |
42 | clean_questions['tokens'] = clean_questions['tokens'].apply(remove_duplicates) # Removing Duplicates
43 | clean_questions.to_csv('D:/ML/QNA_project/CSV_files/words_after_removing_duplicates.csv')
44 |
45 | clean_questions['tokens'] = clean_questions['tokens'].apply(remove_stopwords) # Removing Stopwords
46 | clean_questions.to_csv('D:/ML/QNA_project/CSV_files/words_after_removing_stopwords.csv')
47 |
48 | print(clean_questions.head())
49 |
50 |
51 | def from_keywords_filters_csv():
52 |     # NOTE: expects a DataFrame with 'Entity' and 'Filters' columns (the questions.csv loaded above has neither)
53 | clean_questions = standardize_text(data, "Entity")
54 | clean_questions = standardize_text(data, "Filters")
55 |
56 | # print(clean_questions.head())
57 | tokenizer = RegexpTokenizer(r'\w+')
58 |
59 | clean_questions["tokens"] = clean_questions["Filters"].apply(tokenizer.tokenize) # Tokenization
60 |
61 | clean_questions['tokens'] = clean_questions['tokens'].apply(remove_duplicates) # Removing Duplicates
62 |
63 | clean_questions['tokens'] = clean_questions['tokens'].apply(remove_stopwords) # Removing Stopwords
64 |
65 | clean_questions.to_csv('D:/ML/QNA_project/CSV_files/words_filters.csv')
66 |
67 | print(clean_questions.head())
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # QNA Project (A major NLP Project):
2 |
3 | As we all know, after passing 12th grade, one has to appear for and clear the respective entrance examination for higher studies (JEE for Engineering, NEET for Medical, BBA, MTech, and so on).
4 |
5 | A lot of questions come to mind, and we do extensive Google searching to get our answers: previous-year ranks, placements, fees, which college is best for our rank, and thousands of other questions.
6 |
7 | So here is an NLP-based bot, trained on a huge question/answer dataset, that gives the most appropriate answer to questions about colleges in every field.
8 |
9 |
10 | # Process:
11 | 1. Data preprocessing (most important).
12 | 2. Storing vectors of the questions and words into a dictionary.
13 | 3. Spell correction and word segmentation on the input question.
14 | 4. Finding the cosine similarity between vectors (to shortlist the top 30 candidate questions).
15 | 5. The best answer is then picked by the weighted score (a short sketch follows this list):
16 |        w1*(Similarity) + w2*(Upvote count on answers) + w3*(number of common keywords)
17 |
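A minimal, illustrative sketch of steps 4-5 with toy vectors and made-up weights (this is not the project code; the real pipeline lives in word2vec.py, matching.py and server.py):

    import numpy as np
    from scipy import spatial

    def average_vector(word_vectors, dim=300):
        # mean of the non-zero word vectors; all-zero vectors stand for out-of-vocabulary words
        vs = [np.asarray(v) for v in word_vectors if np.any(v)]
        return np.mean(vs, axis=0) if vs else np.zeros(dim)

    def final_score(similarity, upvotes_norm, common_keywords_norm,
                    w1=1.0, w2=0.01, w3=0.01):
        # w2/w3 are placeholders; the repo derives them from margin = 0.02 and
        # keyword/sum weights of 50/50 (see Notes.txt and server.py)
        return w1 * similarity + w2 * upvotes_norm + w3 * common_keywords_norm

    # toy 4-dimensional "word vectors" for the input question and one stored question
    query_vec  = average_vector([[1, 0, 0, 1], [0, 1, 0, 1]], dim=4)
    stored_vec = average_vector([[1, 0, 0, 1], [0, 1, 1, 0]], dim=4)

    similarity = 1 - spatial.distance.cosine(query_vec, stored_vec)             # step 4
    print(final_score(similarity, upvotes_norm=0.8, common_keywords_norm=0.5))  # step 5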
18 |
19 | # Files :
20 |
21 | add_words_dictionary.py : Adds important domain-specific words from the dataset into the dictionary used for spell correction and word segmentation. dictionary_final.txt contains the original English words plus the words from the dataset (sample dictionary lines are shown after this list).
22 |
23 | correction_count.py : To check the accuracy of SymSpell Correction /
24 | Word Segmentation after training on the dataset (Accuracy : 85%)
25 |
26 | extract_database.py : To extract CSV files from the SQL database.
27 |
28 | matching.py : Main script for answering a single question.
29 |
30 | testing.py : Main file which calls all the other files (includes the main code).
31 |              Run command : python testing.py
32 |
33 | one_time.py : Run this file to automatically retrain on an updated dataset.
34 |
35 | processing_original_question.py : Data (question/answer) preprocessing.
36 |
37 | server.py : Flask application (deploys the model as a web server).
38 |
39 | seperate_words.py : Data processing.
40 |
41 | word2vec.py : Converts each question into a single vector.
42 |              Download the Google word2vec model (GoogleNews-vectors-negative300.bin), trained on a news dataset (4.92 GB).
43 |
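For reference, dictionary_final.txt uses SymSpell's plain "term count" format: one term and its frequency per line, loaded with term_index = 0 and count_index = 1 (see word_seg.py). The lines below only illustrate that format and are not copied from the real file; the very large counts on dataset words come from the constant offset added in add_words_dictionary.py, which gives domain words a very high frequency:

    vellore 2022460113
    counselling 2022459997
    the 23135851162
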
44 | # Results:
45 |
46 | Some results are shown below. The same question is asked in different ways, with a lot of spelling mistakes (the misspellings in the sample questions are intentional).
47 |
48 | a) Question :
49 |
50 | 1. What is the total fee of VIT VELLORE of 4 year
51 | 2. annual fees of vit velor..??????
52 | 3. can you tellme totalfees of vitvellore in 2019.
53 | 4. VIT VELLLOre1 totalannual fees..of full year....?????
54 |
55 | Answer :
56 | Total fees will be around 12-13 lacs while u r in category 1 it's including hostel and mess fees. And for category 2 it will be around 16-17 lacs. 1-2 lac increases as category increases...
57 |
58 |
59 |
60 | b) Question :
61 |
62 | 1. What is the registration date of bba entrance
63 | 2. entrance registration date(BBA)..??
64 | 3. can you tell me the regstrtion examdate of bba.....
65 |
66 | Answer :
67 | The application form will be tenatively available in third week of February 2019.Application closes on first week of May 2019.The application mode will be available online . Thanks
68 |
69 |
70 |
71 |
72 | c) Question
73 | 1. what is the fees structure for B tech , computer science including hostel fee in lmnit jaipur
74 | 2. Feesstructure (B.tech) in computerscience in LMNIT jaipurrr..???
75 | 3. btech fees struture in LMNIT jaipur coputer scince...???
76 |
77 | Answer :
78 | It is genereally same fee for all branches in a college.And it is 1,78,000 for first semester.Please go through the below link for the fee structure of LNMIIT, Jaipur:- https://www.lnmiit.ac.in/Admissions/ugadmissions/Fee_Structure.html . Hope you found this helpful...!!!. All the best...!!!
79 |
80 |
81 |
82 | d) Question :
83 |
84 | 1. where can i get the refund list of neet 2019
85 | 2. if i dont want to take admissssssion , How can i get the reefund after neet counceling.... ..???
86 |
87 | Answer:
88 |
89 | Refund Procedure : If candidate do not wish to pursue the study in college after 2nd round of counselling in this case only refund is initiated.Aspirants get their amount refunded on the same account through which they had submitted their fee.
90 | And once all the rounds of Counselling are completed, MCI will release the list of the Xandidated eligible for the refund of the Security deposit at their official website at mcc.nic.in.
91 | However candidate has to contact on MCC tollfree no and drop a mail to their finance department.This is the only way a candidate may get a refund. Hope this help you aspirants.
92 |
93 |
94 | e) Question :
95 |
96 | 1. Previous year rank od computer science branch in different NIT's
97 | 2. tell me the previos year ranks of computerscience brnch in all (NIT))))>>>...??????????
98 |
99 | Answer:
100 |
101 | Following is the 2018 JEE MAIN cutoff for Computer Science for some NITs:-
102 | NIT Warangal : 1745
103 | NIT Suratkhal : 1767
104 | NIT Trichy : 1140
105 | MNNIT Allahbad : 3504
106 | NIT Rourkela : 3576
107 | NIT Calicut : 4822
108 | NIT Durgapur : 8516
109 | NIT Hamirpur : 15821
110 | MANIT Bhopal : 6827
111 | MANIT Jaipur : 4875
112 | Hope this helps :)
113 | Best of luck!
114 |
115 | f) Question :
116 |
117 | 1. my mat score September 735.50 get my chance of good MBA college
118 | 2. what are the chancesof geting gud mba colllegee ,,,, my score is around 700.
119 |
120 | Answer :
121 |
122 | Yes you have a good MAT score and you stand a good chance in getting colleges in Bengaluru (Your profile looks great, wait for the moment )
123 |
124 | https://bschool.careers360.com/articles/mat-cutoff
125 | Cutoff reports and college information shared above.
126 |
127 |
128 | # Conclusion :
129 |
130 | Results are pretty good.
131 |
132 | Average time for a question with enough information: 13-18 sec
133 |
134 | Average time for a question with little information: 25-30 sec
135 |
--------------------------------------------------------------------------------
/one_time.py:
--------------------------------------------------------------------------------
1 | import pymysql
2 | import pandas as pd
3 | from nltk.tokenize import RegexpTokenizer
4 | from nltk.corpus import stopwords
5 | import re
6 | import math
7 | import collections
8 |
9 | db = pymysql.connect("localhost","root","12343249","sparsh" )
10 | cursor = db.cursor()
11 |
12 | def extract_from_database(query,column_list):
13 | sql = query
14 | try:
15 | cursor.execute(sql)
16 | data = cursor.fetchall()
17 | db.commit()
18 | except:
19 | db.rollback()
20 |
21 | df = pd.DataFrame(data, columns=column_list)
22 | return df
23 |
24 | org_ques_data = extract_from_database('select id ,title,answer_count , comment_count,view_count,modified_on from sparsh.questions',
25 | ['ID','Question','answer_count','comment_count','view_count','modified_on'])
26 | org_ans_data = extract_from_database('select id ,text,question_id,modified_on , upvote_count , comment_count from sparsh.question_answers ',
27 | ['ID', 'Answers', 'question_id', 'modified_on', 'upvote_count', 'comment_count'])
28 | org_keyword_data = extract_from_database('select entity_type , keyword from sparsh.keywords' , ['Entity','Keywords'])
29 | org_filter_data = extract_from_database('select entity_type , filters from sparsh.filters' ,['Entity','Filters'])
30 |
31 |
32 | stop_words = set(stopwords.words('english'))
33 | def remove_stopwords(words):
34 | wx = [w for w in words if not w in stop_words] ## Removing Stopwords
35 | return wx
36 |
37 |
38 | def remove_duplicates(my_list):
39 | return list(set(my_list))
40 |
41 |
42 | def standardize_text(df, text_field):
43 | df[text_field] = df[text_field].str.lower()
44 | df[text_field] = df[text_field].apply(lambda elem: re.sub(r"http\S+", "", str(elem))) # get rid of URLs
45 | df[text_field] = df[text_field].apply(lambda elem: re.sub('[0-9]', "", str(elem)))
46 | df[text_field] = df[text_field].apply(lambda elem: re.sub(r'[{}@_*>()\\#%+=\[\]\-]',' ', str(elem)))
47 | df[text_field] = df[text_field].apply(lambda elem: re.sub('\(|\)|\[|\]',' ', str(elem)))
48 | df[text_field] = df[text_field].apply(lambda elem: re.sub('a0','', str(elem)))
49 | df[text_field] = df[text_field].apply(lambda elem: re.sub('\.','. ', str(elem)))
50 | df[text_field] = df[text_field].apply(lambda elem: re.sub('\!','! ', str(elem)))
51 | df[text_field] = df[text_field].apply(lambda elem: re.sub('\?','? ', str(elem)))
52 | df[text_field] = df[text_field].apply(lambda elem: re.sub(' +',' ', str(elem)))
53 | return df
54 |
55 |
56 | def words_from_keywords_filters(data,column_name):
57 |
58 | clean_questions = standardize_text(data, column_name)
59 | clean_questions = standardize_text(clean_questions,'Entity')
60 | tokenizer = RegexpTokenizer(r'\w+')
61 | clean_questions["tokens"] = clean_questions[column_name].apply(tokenizer.tokenize) #Tokenization
62 | clean_questions['Entity'] = clean_questions['Entity'].apply(tokenizer.tokenize)
63 | clean_questions['tokens'] = clean_questions['tokens'] + clean_questions['Entity']
64 | clean_questions['tokens'] = clean_questions['tokens'].apply(remove_duplicates) # Removing Duplicates
65 |
66 | clean_questions['tokens'] = clean_questions['tokens'].apply(remove_stopwords) # Removing Stopwords
67 |
68 | return clean_questions
69 |
70 |
71 | def words_from_question(data,column_name):
72 |
73 | clean_questions = standardize_text(data, column_name)
74 |
75 | tokenizer = RegexpTokenizer(r'\w+')
76 |
77 | clean_questions["tokens"] = clean_questions[column_name].apply(tokenizer.tokenize) #Tokenization
78 |
79 | clean_questions['tokens'] = clean_questions['tokens'].apply(remove_duplicates) # Removing Duplicates
80 |
81 | clean_questions['tokens'] = clean_questions['tokens'].apply(remove_stopwords) # Removing Stopwords
82 |
83 | return clean_questions
84 |
85 |
86 | words_keywords = words_from_keywords_filters(org_keyword_data , 'Keywords')
87 | words_filters = words_from_keywords_filters(org_filter_data , 'Filters')
88 | words_ques = words_from_question(org_ques_data,'Question')
89 |
90 | def original(data):
91 |     w = []
92 |     for i in range(len(data)):
93 |         w = w + data['tokens'][i]
94 |
95 |     # keep every occurrence so value_counts() in add_to_dictionary() gives
96 |     # real word frequencies; the column name must match 'Total_words' there
97 |     df = pd.DataFrame(w, columns=['Total_words'])
98 |     return df
99 |
100 |
101 | def keywords_filters(data):
102 | w = []
103 | for i in range(len(data)):
104 | print(i)
105 | w = w + data['tokens'][i]
106 | words = list(set(w))
107 | return words
108 |
109 |
110 | def combine():
111 | word_k = keywords_filters(words_keywords)
112 | word_f = keywords_filters(words_filters)
113 | total = word_f + word_k
114 | total = list(set(total))
115 | df = pd.DataFrame(total,columns=['Final_filters'])
116 | df.to_csv('D:/ML/QNA_project/CSV_files/final_words_keys2.csv')
117 | return df
118 |
119 | total_words_data = original(words_ques) # TODO : data2
120 | keys_data = combine() # TODO : data1
121 |
122 |
123 | def add_to_dictionary():
124 |
125 | count = total_words_data['Total_words'].value_counts()
126 |
127 | w_keys = []
128 | for i in range(len(keys_data)):
129 | print(i)
130 |         if pd.isna(keys_data['Final_filters'][i]):   # skip empty / NaN keyword rows
131 |             continue
133 | else:
134 | try:
135 | x = count[keys_data['Final_filters'][i]]
136 | except:
137 | x=0
138 |
139 | x = x + 2022459848
140 | x = str(x)
141 | s = keys_data['Final_filters'][i] + " " + x
142 | w_keys.append(s)
143 |
144 | m = {}
145 | for i in range(len(w_keys)-1):
146 | print("s {}".format(i))
147 |
148 | w = w_keys[i].split(' ')
149 | m[w[0]]=w[1]
150 |
151 | sorted_x = sorted(m.items(), key=lambda kv: int(kv[1]))
152 | sorted_dict = collections.OrderedDict(sorted_x)
153 |
154 | file = open('D:/ML/QNA_project/dictionary_words.txt','w')
155 |
156 | for key , value in sorted_dict.items():
157 | file.write(key+" "+value)
158 | file.write('\n')
159 |         print('complete')
160 | file.close()
161 |
162 | file1 = open('D:/ML/QNA_project/dictionary_words.txt' , 'r')
163 | data = file1.read().split('\n')
164 | file1.close()
165 |
166 | file1 = open('D:/ML/QNA_project/frequency_dictionary.txt' , 'r')
167 |     data2 = file1.read().split('\n')
168 | file1.close()
169 |
170 | file2 = open('D:/ML/QNA_project/dictionary_final.txt' , 'w')
171 | for i in range(len(data)-1,-1,-1):
172 | file2.write(data[i])
173 | file2.write('\n')
174 |
175 | for i in range(len(data2)):
176 | file2.write(data2[i])
177 | file2.write('\n')
178 | file2.close()
179 |
180 | db.close()
--------------------------------------------------------------------------------
/word2vec.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from nltk.tokenize import RegexpTokenizer
4 | from nltk.corpus import stopwords
5 | import re
6 | import os
7 | from symspellpy.symspellpy import SymSpell, Verbosity
8 | from gensim.models import KeyedVectors
9 | import time
10 |
11 | data = pd.read_csv('D:/ML/QNA_project/CSV_files/questions.csv')
12 |
13 | def standardize_text(df, text_field):
14 | df[text_field] = df[text_field].str.lower()
15 | df[text_field] = df[text_field].apply(lambda elem: re.sub(r"http\S+", "", str(elem))) # get rid of URLs
16 | df[text_field] = df[text_field].apply(lambda elem: re.sub('[0-9]', "", str(elem)))
17 | df[text_field] = df[text_field].apply(lambda elem: re.sub(r'[{}@_*>()\\#%+=\[\]\-]',' ', str(elem)))
18 | df[text_field] = df[text_field].apply(lambda elem: re.sub('\(|\)|\[|\]',' ', str(elem)))
19 | df[text_field] = df[text_field].apply(lambda elem: re.sub('a0','', str(elem)))
20 | df[text_field] = df[text_field].apply(lambda elem: re.sub('\.','. ', str(elem)))
21 | df[text_field] = df[text_field].apply(lambda elem: re.sub('\!','! ', str(elem)))
22 | df[text_field] = df[text_field].apply(lambda elem: re.sub('\?','? ', str(elem)))
23 | df[text_field] = df[text_field].apply(lambda elem: re.sub(' +',' ', str(elem)))
24 | return df
25 |
26 |
27 | def remove_duplicates(my_list):
28 | return list(set(my_list))
29 |
30 |
31 | def remove_stopwords(words):
32 | stop_words = set(stopwords.words('english'))
33 | wx = [w for w in words if not w in stop_words] ## Removing Stopwords
34 | return wx
35 |
36 |
37 | def spell_correction(words):
38 | s = ""
39 | print(words)
40 | for i in words:
41 | suggestions = sym_spell.lookup(i, suggestion_verbosity,max_edit_distance_lookup)
42 | try:
43 | # print('hello')
44 | # print(suggestions[0].term)
45 | s = s + suggestions[0].term + " "
46 | except:
47 | # print('vhjyfhfy')
48 | s = s + i + " "
49 | s = s[:-1]
50 | print(s)
51 | w = s.split(' ')
52 | w = list(set(w))
53 | return w
54 |
55 |
56 | def word_segmentation(words):
57 |     print('started')
58 |     # start from the original tokens and append every segmented piece,
59 |     # so the segmentations of all words are kept (not only the last one)
60 |     final = list(words)
61 |     for i in words:
62 |         try:
63 |             result = sym_spell.word_segmentation(i)
64 |             w = result.corrected_string.split(' ')
65 |             print(w)
66 |             final = final + w
67 |         except:
68 |             print('fail')
69 |     final = list(set(final))
70 |     print(final)
71 |     return final
78 |
79 |
80 | def vectors(words):
81 |
82 | w = []
83 | for i in words:
84 | try:
85 | vector = model[i]
86 | except:
87 | vector = np.zeros(300)
88 | vector = vector.tolist()
89 | # print(vector.shape)
90 | w.append(vector)
91 |
92 | return w
93 |
94 |
95 | def average_vector(vectors):
96 | v = np.zeros(300)
97 | x = np.zeros(300)
98 | n = len(vectors)
99 | for i in vectors:
100 | i = np.array(i)
101 | if (i==x).all():
102 | n = n - 1
103 | else:
104 | v = v + i
105 | v = v/n
106 | v = v.tolist()
107 |
108 | return v
109 |
110 | if __name__ == '__main__':
111 | clean_questions = standardize_text(data.head(), "Question")
112 |
113 |
114 | tokenizer = RegexpTokenizer(r'\w+')
115 | clean_questions["tokens"] = clean_questions["Question"].apply(tokenizer.tokenize)
116 |
117 |
118 | clean_questions['tokens'] = clean_questions['tokens'].apply(remove_duplicates) # Removing Duplicates
119 |
120 |
121 | stop_words = set(stopwords.words('english'))
122 | clean_questions['tokens'] = clean_questions['tokens'].apply(remove_stopwords)
123 |
124 |
125 | max_edit_distance_dictionary = 2
126 | prefix_length = 9
127 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
128 | dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary_final.txt")
129 | term_index = 0 # column of the term in the dictionary text file
130 | count_index = 1 # column of the term frequency in the dictionary text file
131 |
132 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
133 | print("Dictionary file not found")
134 |
135 | max_edit_distance_lookup = 2
136 | suggestion_verbosity = Verbosity.CLOSEST
137 | clean_questions['tokens'] = clean_questions['tokens'].apply(spell_correction)
138 |
139 | # clean_questions.to_csv('D:/ML/QNA_project/CSV_files/main_spell1.csv')
140 | print('spell1 done')
141 | # clean_questions = pd.read_csv('D:/ML/QNA_project/CSV_files/main_spell1.csv')
142 | max_edit_distance_dictionary = 0
143 | prefix_length = 7
144 | # create object
145 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
146 | # load dictionary
147 | dictionary_path = os.path.join(os.path.dirname(__file__),"dictionary_final.txt")
148 | term_index = 0 # column of the term in the dictionary text file
149 | count_index = 1 # column of the term frequency in the dictionary text file
150 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
151 | print("Dictionary file not found")
152 | clean_questions['tokens'] = clean_questions['tokens'].apply(word_segmentation)
153 | # clean_questions.to_csv('D:/ML/QNA_project/CSV_files/main_word_seg.csv')
154 | print('wordseg done')
155 |
156 | max_edit_distance_dictionary = 2
157 | prefix_length = 9
158 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
159 | dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary_final.txt")
160 | term_index = 0 # column of the term in the dictionary text file
161 | count_index = 1 # column of the term frequency in the dictionary text file
162 |
163 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
164 | print("Dictionary file not found")
165 |
166 | max_edit_distance_lookup = 2
167 | suggestion_verbosity = Verbosity.CLOSEST
168 | clean_questions['tokens'] = clean_questions['tokens'].apply(spell_correction)
169 |
170 |
171 | clean_questions['tokens'] = clean_questions['tokens'].apply(remove_stopwords)
172 | clean_questions['processed_words'] = clean_questions['tokens']
173 | # clean_questions.to_csv('D:/ML/QNA_project/CSV_files/main_spell2.csv')
174 |
175 | t1 =time.time()
176 | model = KeyedVectors.load_word2vec_format('D:/ML/QNA_project/model/GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin', binary=True)
177 | t2 = time.time()
178 | print('model loaded in {} seconds'.format(t2-t1))
179 | clean_questions['vectors'] = clean_questions['tokens'].apply(vectors)
180 | # clean_questions.to_csv('D:/ML/QNA_project/CSV_files/main_vectors.csv')
181 |
182 |
183 | clean_questions['Average_vector'] = clean_questions['vectors'].apply(average_vector)
184 |
185 | clean_questions = clean_questions.drop(['vectors','tokens'],axis=1)
186 | clean_questions.to_csv('D:/ML/QNA_project/CSV_files/main_average.csv')
187 |
188 |     print(clean_questions['processed_words'])
189 |     print(type(clean_questions['processed_words'][0]))
190 |
191 | print(clean_questions.head())
192 |
193 |
194 |
--------------------------------------------------------------------------------
/matching.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from nltk.tokenize import RegexpTokenizer
4 | from nltk.corpus import stopwords
5 | import re
6 | import os
7 | from symspellpy.symspellpy import SymSpell, Verbosity
8 | from gensim.models import KeyedVectors
9 | import time
10 | from scipy import spatial
11 | from datetime import datetime
12 |
13 | text = "Why upsee is making compulsory for the students to get admission in allotted college if they want to take part in fifth round counselling.."
14 | text_cw = text
15 |
16 | t1 = time.time()
17 | def remove_duplicates(my_list):
18 | return list(set(my_list))
19 |
20 | def remove_stopwords(words):
21 | stop_words = set(stopwords.words('english'))
22 | wx = [w for w in words if not w in stop_words] ## Removing Stopwords
23 | return wx
24 |
25 | def spell_correction(words):
26 | s = ""
27 | print(words)
28 | for i in words:
29 | suggestions = sym_spell.lookup(i, suggestion_verbosity,max_edit_distance_lookup)
30 | try:
31 | # print('hello')
32 | # print(suggestions[0].term)
33 | s = s + suggestions[0].term + " "
34 | except:
35 | # print('vhjyfhfy')
36 | s = s + i + " "
37 | s = s[:-1]
38 | print(s)
39 | w = s.split(' ')
40 | w = list(set(w))
41 | return w
42 |
43 |
44 | def word_segmentation(words):
45 | print('started')
46 | final = []
47 | for i in words:
48 | input_term = i
49 | try:
50 | result = sym_spell.word_segmentation(input_term)
51 | w = (result.corrected_string).split(' ')
52 |
53 | print(w)
54 | final = final +w
55 |
56 | except:
57 | print('fail')
58 | pass
59 | final = list(set(final))
60 | return final
61 |
62 | def preprocessing(text):
63 | text = text.lower()
64 | text = re.sub(r"http\S+", "", text) # get rid of URLs
65 | text = re.sub('[0-9]', "", text)
66 | text = re.sub(r'[{}@_*>()\\#%+=\[\]\-]',' ', text)
67 | text = re.sub('\(|\)|\[|\]',' ', text)
68 | text = re.sub('a0','', text)
69 | text = re.sub('\.','. ', text)
70 | text = re.sub('\!','! ', text)
71 | text = re.sub('\?','? ', text)
72 | text = re.sub(' +',' ', text)
73 | return text
74 |
75 |
76 | def vectors(words):
77 |
78 | w = []
79 | for i in words:
80 | try:
81 | vector = model[i]
82 | except:
83 | vector = np.zeros(300)
84 | vector = vector.tolist()
85 | # print(vector.shape)
86 | w.append(vector)
87 |
88 | return w
89 |
90 |
91 | def average_vector(vectors):
92 | v = np.zeros(300)
93 | x = np.zeros(300)
94 | n = len(vectors)
95 | for i in vectors:
96 | i = np.array(i)
97 | if (i==x).all():
98 | n = n - 1
99 | else:
100 | v = v + i
101 | v = v/n
102 | v = v.tolist()
103 |
104 | return v
105 |
106 | def match(s):
107 | s = re.sub('\[|\]|\,|\'', '', s)
108 | words = s.split(' ')
109 | vec1 = []
110 | for i in words:
111 | vec1.append(float(i))
112 |
113 | result = 1 - spatial.distance.cosine(vec1,avg_v)
114 | # print(result)
115 | return result
116 |
117 |
118 | keyword_data = pd.read_csv('D:/ML/QNA_project/CSV_files/keywords.csv')
119 | filter_data = pd.read_csv('D:/ML/QNA_project/CSV_files/filters.csv')
120 | def common_keywords(text_q):
121 |
122 | text_q = text_q.lower()
123 | w = text_q.split(' ')
124 |
125 | max_edit_distance_dictionary = 2
126 | prefix_length = 9
127 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
128 | dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary_final.txt")
129 | term_index = 0 # column of the term in the dictionary text file
130 | count_index = 1 # column of the term frequency in the dictionary text file
131 |
132 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
133 | print("Dictionary file not found")
134 |
135 | max_edit_distance_lookup = 2
136 | suggestion_verbosity = Verbosity.CLOSEST
137 |
138 | ques = ""
139 | for input in w:
140 |         suggestions = sym_spell.lookup(input, suggestion_verbosity, max_edit_distance_lookup)
141 | try:
142 | ques = ques + suggestions[0].term + " "
143 | except:
144 | ques = ques + input + " "
145 | ques = ques + text_q
146 | print(ques)
147 |
148 | input_ques = text_cw
149 | input_ques = input_ques.lower()
150 | words_input = input_ques.split(' ')
151 |
152 | in_ques = ''
153 | for input in words_input:
154 |         suggestions = sym_spell.lookup(input, suggestion_verbosity, max_edit_distance_lookup)
155 | try:
156 | in_ques = in_ques + suggestions[0].term + " "
157 | except:
158 | in_ques = in_ques + input + " "
159 | in_ques = in_ques + input_ques
160 | print(in_ques)
161 |
162 |     w1 = []
163 |     w2 = []
164 |     for i in range(len(keyword_data)):
165 |         kw = keyword_data['Keywords'][i].lower()
166 |         # a keyword counts if it appears anywhere in the corrected question text
167 |         if kw in ques:
168 |             w1.append(kw)
169 |         if kw in in_ques:
170 |             w2.append(kw)
171 |
172 |     for i in range(len(filter_data)):
173 |         kw = filter_data['Filters'][i].lower()
174 |         if kw in ques:
175 |             w1.append(kw)
176 |         if kw in in_ques:
177 |             w2.append(kw)
178 |
179 |     # keywords found in both questions appear twice in `common`,
180 |     # so the duplicate count is the number of shared keywords
181 |     common = w2 + w1
182 |     common_d = list(set(common))
183 |     x = len(common) - len(common_d)
184 |     return x
187 |
188 |
189 | def getting_answer(final):
190 | answer_data = pd.read_csv('D:/ML/QNA_project/CSV_files/answers.csv')
191 | answers_list = []
192 | for j in range(len(final)):
193 | id = final.iloc[j]['ID']
194 | id = int(id)
195 | req = answer_data.loc[answer_data['question_id'] == id]
196 |
197 | max = -1
198 | id = req.iloc[0]['ID']
199 |
200 | for i in range(len(req)):
201 | up_c = int(req.iloc[i]['upvote_count'])
202 | cm_c = int(req.iloc[i]['comment_count'])
203 | if up_c + cm_c > max:
204 | max = up_c + cm_c
205 | id = req.iloc[i]['ID']
206 |
207 | ans = req.loc[req['ID'] == id]['Answers']
208 | answers_list.append(ans)
209 | return answers_list
210 |
211 |
212 |
213 | text = preprocessing(text)
214 |
215 | tokenizer = RegexpTokenizer(r'\w+')
216 | words = tokenizer.tokenize(text)
217 |
218 | words = remove_duplicates(words)
219 |
220 | words = remove_stopwords(words)
221 |
222 | max_edit_distance_dictionary = 2
223 | prefix_length = 9
224 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
225 | dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary_final.txt")
226 | term_index = 0 # column of the term in the dictionary text file
227 | count_index = 1 # column of the term frequency in the dictionary text file
228 |
229 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
230 | print("Dictionary file not found")
231 |
232 | max_edit_distance_lookup = 2
233 | suggestion_verbosity = Verbosity.CLOSEST
234 |
235 | words = spell_correction(words)
236 |
237 |
238 |
239 | max_edit_distance_dictionary = 0
240 | prefix_length = 7
241 | # create object
242 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
243 | # load dictionary
244 | dictionary_path = os.path.join(os.path.dirname(__file__),"dictionary_final.txt")
245 | term_index = 0 # column of the term in the dictionary text file
246 | count_index = 1 # column of the term frequency in the dictionary text file
247 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
248 | print("Dictionary file not found")
249 |
250 | words = word_segmentation(words)
251 |
252 |
253 |
254 | max_edit_distance_dictionary = 2
255 | prefix_length = 9
256 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
257 | dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary_final.txt")
258 | term_index = 0 # column of the term in the dictionary text file
259 | count_index = 1 # column of the term frequency in the dictionary text file
260 |
261 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
262 | print("Dictionary file not found")
263 |
264 | max_edit_distance_lookup = 2
265 | suggestion_verbosity = Verbosity.CLOSEST
266 |
267 | words = spell_correction(words)
268 |
269 | words = remove_stopwords(words)
270 |
271 |
272 | # t1 =time.time()
273 | model = KeyedVectors.load_word2vec_format('D:/ML/QNA_project/model/GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin', binary=True)
274 | # t2 = time.time()
275 | print('model loaded ')
276 | vs = vectors(words)
277 |
278 |
279 | avg_v = average_vector(vs)
280 |
281 | vector_data = pd.read_csv('D:/ML/QNA_project/CSV_files/final_question_vector.csv')
282 | print('sparsh')
283 |
284 | #TODO : 1. correction. 2, Make Generic thru user input
285 | now_date = datetime.now()
286 | now_date = now_date.strftime("%d/%m/%Y %H:%M:%S")
287 | filter_date_q = now_date
288 |
289 | vector_data['Similarity'] = vector_data['Average_vector'].apply(match)
290 |
291 | # vector_data = vector_data[vector_data['modified_on'] < filter_date_q]
--------------------------------------------------------------------------------
/server.py:
--------------------------------------------------------------------------------
81 |     text = re.sub(r'[{}@_*>()\\#%+=\[\]\-]', ' ', text)
82 | text = re.sub('\(|\)|\[|\]', ' ', text)
83 | text = re.sub('a0', '', text)
84 | text = re.sub('\.', '. ', text)
85 | text = re.sub('\!', '! ', text)
86 | text = re.sub('\?', '? ', text)
87 | text = re.sub(' +', ' ', text)
88 | return text
89 |
90 | def vectors(words):
91 |
92 | w = []
93 | for i in words:
94 | try:
95 | vector = model[i]
96 | except:
97 | vector = np.zeros(300)
98 | vector = vector.tolist()
99 | # print(vector.shape)
100 | w.append(vector)
101 |
102 | return w
103 |
104 | def average_vector(vectors):
105 | v = np.zeros(300)
106 | x = np.zeros(300)
107 | n = len(vectors)
108 | for i in vectors:
109 | i = np.array(i)
110 | if (i == x).all():
111 | n = n - 1
112 | else:
113 | v = v + i
114 | v = v / n
115 | v = v.tolist()
116 |
117 | return v
118 |
119 | def match(s):
120 | s = re.sub('\[|\]|\,|\'', '', s)
121 | words = s.split(' ')
122 | vec1 = []
123 | for i in words:
124 | vec1.append(float(i))
125 |
126 | result = 1 - spatial.distance.cosine(vec1, avg_v)
127 | # print(result)
128 | return result
129 |
130 | def common_keywords(text):
131 | keyword_data = pd.read_csv('D:/ML/QNA_project/CSV_files/keywords.csv')
132 | filter_data = pd.read_csv('D:/ML/QNA_project/CSV_files/filters.csv')
133 |
134 | # text = "he lives in bangalor1"
135 | text = text.lower()
136 | w = text.split(' ')
137 | print(w)
138 |
139 | max_edit_distance_dictionary = 2
140 | prefix_length = 9
141 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
142 | dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary_final.txt")
143 | term_index = 0 # column of the term in the dictionary text file
144 | count_index = 1 # column of the term frequency in the dictionary text file
145 |
146 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
147 | print("Dictionary file not found")
148 |
149 | max_edit_distance_lookup = 2
150 | suggestion_verbosity = Verbosity.CLOSEST
151 |
152 | ques = ""
153 | for input in w:
154 |         suggestions = sym_spell.lookup(input, suggestion_verbosity, max_edit_distance_lookup)
155 | try:
156 | ques = ques + suggestions[0].term + " "
157 | except:
158 | ques = ques + input + " "
159 | ques = ques + text
160 | # print(ques)
161 |     words = []
162 |     for i in range(len(keyword_data)):
163 |         kw = keyword_data['Keywords'][i].lower()
164 |         # a keyword counts if it appears anywhere in the corrected question text
165 |         if kw in ques:
166 |             words.append(kw)
167 |
168 |     for i in range(len(filter_data)):
169 |         kw = filter_data['Filters'][i].lower()
170 |         if kw in ques:
171 |             words.append(kw)
172 |
173 |     return len(words)
175 |
176 | def getting_answer(final):
177 | answer_data = pd.read_csv('D:/ML/QNA_project/CSV_files/answers.csv')
178 |
179 | now_date_a = datetime.now()
180 | now_date_a = now_date_a.strftime("%d/%m/%Y %H:%M:%S")
181 | filter_date_a = now_date_a
182 |
183 | answer_data = answer_data[answer_data['modified_on'] < filter_date_a]
184 |
185 | answers_list = []
186 | for j in range(len(final)):
187 | id = final.iloc[j]['ID']
188 | id = int(id)
189 | req = answer_data.loc[answer_data['question_id'] == id]
190 |
191 | max = -1
192 | id = req.iloc[0]['ID']
193 | date = req.iloc[0]['modified_on']
194 |
195 | for i in range(len(req)):
196 | up_c = int(req.iloc[i]['upvote_count'])
197 | cm_c = int(req.iloc[i]['comment_count'])
198 | d1 = req.iloc[i]['modified_on']
199 | if up_c + cm_c > max:
200 | max = up_c + cm_c
201 | id = req.iloc[i]['ID']
202 |
203 | ans = req.loc[req['ID'] == id]['Answers']
204 | answers_list.append(ans)
205 | return answers_list
206 |
207 |
208 | text = request.form.to_dict()
209 | text = text['question']
210 | print(text)
211 |
212 | text = preprocessing(text)
213 |
214 | tokenizer = RegexpTokenizer(r'\w+')
215 | words = tokenizer.tokenize(text)
216 |
217 | words = remove_duplicates(words)
218 |
219 | words = remove_stopwords(words)
220 |
221 | max_edit_distance_dictionary = 2
222 | prefix_length = 9
223 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
224 | dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary_final.txt")
225 | term_index = 0 # column of the term in the dictionary text file
226 | count_index = 1 # column of the term frequency in the dictionary text file
227 |
228 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
229 | print("Dictionary file not found")
230 |
231 | max_edit_distance_lookup = 2
232 | suggestion_verbosity = Verbosity.CLOSEST
233 |
234 | words = spell_correction(words)
235 |
236 | max_edit_distance_dictionary = 0
237 | prefix_length = 7
238 | # create object
239 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
240 | # load dictionary
241 | dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary_final.txt")
242 | term_index = 0 # column of the term in the dictionary text file
243 | count_index = 1 # column of the term frequency in the dictionary text file
244 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
245 | print("Dictionary file not found")
246 |
247 | words = word_segmentation(words)
248 |
249 | max_edit_distance_dictionary = 2
250 | prefix_length = 9
251 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
252 | dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary_final.txt")
253 | term_index = 0 # column of the term in the dictionary text file
254 | count_index = 1 # column of the term frequency in the dictionary text file
255 |
256 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
257 | print("Dictionary file not found")
258 |
259 | max_edit_distance_lookup = 2
260 | suggestion_verbosity = Verbosity.CLOSEST
261 |
262 | words = spell_correction(words)
263 |
264 | words = remove_stopwords(words)
265 |
266 | t1 = time.time()  # start timing; the elapsed time is printed just before rendering the result
267 | model = KeyedVectors.load_word2vec_format(
268 | 'D:/ML/QNA_project/model/GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin',
269 | binary=True)
270 | # t2 = time.time()
271 | print('model loaded ')
272 | vs = vectors(words)
273 |
274 | avg_v = average_vector(vs)
275 |
276 | vector_data = pd.read_csv('D:/ML/QNA_project/CSV_files/final_question_vector.csv')
277 | print('sparsh')
278 |
279 | now_date = datetime.now()
280 | now_date = now_date.strftime("%d/%m/%Y %H:%M:%S")
281 | filter_date_q = now_date
282 |
283 | vector_data['Similarity'] = vector_data['Average_vector'].apply(match)
284 | # print(vector_data.nlargest(5,['Similarity']))
285 | #
286 | dummy = vector_data.nlargest(30, ['Similarity'])
287 | # print(dummy['Question'].head(5))
288 | # print(time.time()-t1)
289 |
290 | dummy = dummy[dummy['modified_on'] < filter_date_q]
291 |
292 | dummy['common_keyword'] = dummy['Question'].apply(common_keywords)
293 | min_kw = min(dummy['common_keyword'])
294 | max_kw = max(dummy['common_keyword'])
295 | dummy['common_keyword'] = (dummy['common_keyword'] - min_kw) / (max_kw - min_kw)  # min-max normalisation; yields NaN/inf when all counts are equal
296 |
297 | dummy['sum3'] = dummy['view_count'] + dummy['answer_count'] + dummy['comment_count']
298 | min_s = min(dummy['sum3'])
299 | max_s = max(dummy['sum3'])
300 | dummy['sum3'] = (dummy['sum3'] - min_s) / (max_s - min_s)
301 |
302 | margin = 0.02
303 | keyword_wt = 50
304 | sum_wt = 50
305 | w1 = 1
306 | w2 = margin * keyword_wt / 100
307 | w3 = margin * sum_wt / 100
308 |
309 | dummy['final_score'] = (w1 * dummy['Similarity']) + (w2 * dummy['common_keyword']) + (w3 * dummy['sum3'])
310 |
311 | final = dummy.nlargest(10, ['final_score'])
312 | print(final.head())
313 | print(final['Question'].head())
314 |
315 | ans_list = getting_answer(final)
316 | final['Answers'] = ans_list
317 | print(final['Answers'].head())
318 |
319 | final = final.drop(['Unnamed: 0', 'Unnamed: 0.1', 'Average_vector', 'answer_count', 'comment_count', 'view_count'], axis=1)
320 |
321 | dic = final.head().to_dict()
322 | print(time.time()-t1)
323 |
324 | return render_template('result.html', answer=dic)
325 |
326 | if __name__ == '__main__':
327 | app.run(debug=True)
328 |
329 |
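
A note on the ranking logic used in server.py above: the input question's words are averaged into a single 300-dimensional vector, every stored question is scored by cosine similarity against it, and the shortlisted candidates are re-ranked by adding small, min-max-normalised keyword-overlap and engagement terms. The following is a minimal standalone sketch of that combination, not code from the repository; the function names are illustrative, and margin, keyword_wt and sum_wt mirror the weights set in the code above.

    import numpy as np
    from scipy import spatial

    def average_vector(word_vectors, dim=300):
        # Average only the non-zero vectors; zero vectors stand in for out-of-vocabulary words.
        vecs = [np.asarray(v) for v in word_vectors if np.any(v)]
        return np.mean(vecs, axis=0) if vecs else np.zeros(dim)

    def similarity(stored_vec, query_vec):
        # Cosine similarity, as computed by match() above.
        return 1 - spatial.distance.cosine(stored_vec, query_vec)

    def final_score(sim, keyword_score, engagement_score,
                    margin=0.02, keyword_wt=50, sum_wt=50):
        # keyword_score and engagement_score are assumed to be min-max normalised to [0, 1].
        w1 = 1
        w2 = margin * keyword_wt / 100  # 0.01 with the defaults used in server.py
        w3 = margin * sum_wt / 100      # 0.01 with the defaults used in server.py
        return w1 * sim + w2 * keyword_score + w3 * engagement_score

With these weights the cosine similarity dominates the final score; keyword overlap and engagement only break near-ties among the shortlisted questions.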
--------------------------------------------------------------------------------
/testing.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from nltk.tokenize import RegexpTokenizer
4 | from nltk.corpus import stopwords
5 | import re
6 | import os
7 | from symspellpy.symspellpy import SymSpell, Verbosity
8 | from gensim.models import KeyedVectors
9 | import time
10 | from scipy import spatial
11 | from datetime import datetime
12 |
13 | import sys
14 |
15 | print(sys.stdout.encoding)
16 | print(u"Stöcker".encode(sys.stdout.encoding, errors='replace'))
17 | print(u"Стоескер".encode(sys.stdout.encoding, errors='replace'))
18 |
19 | model = KeyedVectors.load_word2vec_format('D:/ML/QNA_project/model/GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin', binary=True)
20 | print('Model Loaded')
21 | keyword_data = pd.read_csv('D:/ML/QNA_project/CSV_files/keywords.csv')
22 | filter_data = pd.read_csv('D:/ML/QNA_project/CSV_files/filters.csv')
23 | keys_data = pd.read_csv('D:/ML/QNA_project/CSV_files/final_words_keys3.csv')
24 | answer_data = pd.read_csv('D:/ML/QNA_project/CSV_files/answers.csv')
25 | vector_data = pd.read_csv('D:/ML/QNA_project/CSV_files/final_question_vector.csv')
26 | print('All CSV files read')
27 |
28 |
29 | def remove_duplicates(my_list):
30 | return list(set(my_list))
31 |
32 |
33 | def remove_stopwords(words):
34 | stop_words = set(stopwords.words('english'))
35 | wx = [w for w in words if w not in stop_words]  # remove stopwords
36 | return wx
37 |
38 |
39 | def spell_correction(words):
40 | s = ""
41 | print(words)
42 | for i in words:
43 | suggestions = sym_spell.lookup(i, suggestion_verbosity, max_edit_distance_lookup)
44 | try:
45 | s = s + suggestions[0].term + " "
46 | except IndexError:  # no suggestion found: keep the original word
47 | s = s + i + " "
48 | s = s[:-1]
49 | print(s)
50 | w = s.split(' ')
51 | w = list(set(w))
52 | return w
53 |
54 |
55 | def word_segmentation(words):
56 | final = []
57 | for i in words:
58 | input_term = i
59 | try:
60 | result = sym_spell.word_segmentation(input_term)
61 | w = (result.corrected_string).split(' ')
62 | final = final + w
63 | except Exception:  # skip tokens that cannot be segmented
64 | pass
65 | final = list(set(final))
66 | print('Segmented')
67 | print(final)
68 | return final
69 |
70 |
71 | def preprocessing(text):
72 | text = text.lower()
73 | text = re.sub(r"http\S+", "", text) # get rid of URLs
74 | text = re.sub('[0-9]', "", text)
75 | text = re.sub(r'[{}@_*>()\\#%+=\[\]\-]', ' ', text)
76 | text = re.sub('\(|\)|\[|\]', ' ', text)
77 | text = re.sub('a0', '', text)
78 | text = re.sub('\.', '. ', text)
79 | text = re.sub('\!', '! ', text)
80 | text = re.sub('\?', '? ', text)
81 | text = re.sub(' +', ' ', text)
82 | return text
83 |
84 |
85 | def vectors(words):
86 | w = []
87 | for i in words:
88 | try:
89 | vector = model[i]
90 | except KeyError:  # out-of-vocabulary word: fall back to a zero vector
91 | vector = np.zeros(300)
92 | vector = vector.tolist()
93 | w.append(vector)
94 | return w
95 |
96 |
97 | def average_vector(vectors):
98 | v = np.zeros(300)
99 | x = np.zeros(300)
100 | n = len(vectors)
101 | for i in vectors:
102 | i = np.array(i)
103 | if (i == x).all():
104 | n = n - 1
105 | else:
106 | v = v + i
107 | v = v / n
108 | v = v.tolist()
109 | return v
110 |
111 |
112 | def match(s):
113 | s = re.sub('\[|\]|\,|\'', '', s)
114 | words = s.split(' ')
115 | vec1 = []
116 | for i in words:
117 | vec1.append(float(i))
118 |
119 | result = 1 - spatial.distance.cosine(vec1, avg_v)
120 | return result
121 |
122 | def common_keywords(text_q):
123 | print(w_in_ques)
124 | # text_q = text_q.lower()
125 | # w = text_q.split(' ')
126 | #
127 | # max_edit_distance_dictionary = 2
128 | # prefix_length = 9
129 | # sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
130 | # dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary_final.txt")
131 | # term_index = 0 # column of the term in the dictionary text file
132 | # count_index = 1 # column of the term frequency in the dictionary text file
133 | #
134 | # if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
135 | # print("Dictionary file not found")
136 | #
137 | # max_edit_distance_lookup = 2
138 | # suggestion_verbosity = Verbosity.CLOSEST
139 | #
140 | # ques = ""
141 | # for input in w:
142 | # suggestions = sym_spell.lookup(input, max_edit_distance_lookup)
143 | # try:
144 | # ques = ques + suggestions[0].term + " "
145 | # except:
146 | # ques = ques + input + " "
147 | # ques = ques + text_q
148 | # print(ques)
149 |
150 | # input_ques = text_cw
151 | # input_ques = input_ques.lower()
152 | # words_input = input_ques.split(' ')
153 | #
154 | # in_ques = ''
155 | # for input in words_input:
156 | # suggestions = sym_spell.lookup(input, max_edit_distance_lookup)
157 | # try:
158 | # in_ques = in_ques + suggestions[0].term + " "
159 | # except:
160 | # in_ques = in_ques + input + " "
161 | # in_ques = in_ques + input_ques
162 | # print(in_ques)
163 |
164 | text_q = preprocessing(text_q)
165 |
166 | tokenizer = RegexpTokenizer(r'\w+')
167 | words_cw = tokenizer.tokenize(text_q)
168 |
169 | words_cw = remove_duplicates(words_cw)
170 |
171 | words_cw = remove_stopwords(words_cw)
172 |
173 | max_edit_distance_dictionary = 2
174 | prefix_length = 9
175 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
176 | dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary_final.txt")
177 | term_index = 0 # column of the term in the dictionary text file
178 | count_index = 1 # column of the term frequency in the dictionary text file
179 |
180 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
181 | print("Dictionary file not found")
182 |
183 | max_edit_distance_lookup = 2
184 | suggestion_verbosity = Verbosity.CLOSEST
185 |
186 | words_cw = spell_correction(words_cw)
187 |
188 | max_edit_distance_dictionary = 0
189 | prefix_length = 7
190 | # create object
191 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
192 | # load dictionary
193 | dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary_final.txt")
194 | term_index = 0 # column of the term in the dictionary text file
195 | count_index = 1 # column of the term frequency in the dictionary text file
196 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
197 | print("Dictionary file not found")
198 |
199 | words_cw = word_segmentation(words_cw)
200 |
201 | max_edit_distance_dictionary = 2
202 | prefix_length = 9
203 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
204 | dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary_final.txt")
205 | term_index = 0 # column of the term in the dictionary text file
206 | count_index = 1 # column of the term frequency in the dictionary text file
207 |
208 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
209 | print("Dictionary file not found")
210 |
211 | max_edit_distance_lookup = 2
212 | suggestion_verbosity = Verbosity.CLOSEST
213 |
214 | words_cw = spell_correction(words_cw)
215 |
216 | words_cw = remove_stopwords(words_cw)
217 |
218 | w_ques = words_cw
219 | print(w_ques)
220 | w1 = []
221 | w2 = []
222 | # for i in range(len(keyword_data)):
223 | # str = keyword_data['Keywords'][i]
224 | # str = str.lower()
225 | # if (ques.find(str, 0, len(str)) != -1):
226 | # w1.append(str)
227 | # if (in_ques.find(str, 0, len(str)) != -1):
228 | # w2.append(str)
229 | #
230 | # for i in range(len(filter_data)):
231 | # str = filter_data['Filters'][i]
232 | # str = str.lower()
233 | # if (ques.find(str, 0, len(str)) != -1):
234 | # w1.append(str)
235 | # if (in_ques.find(str, 0, len(str)) != -1):
236 | # w2.append(str)
237 |
238 | for i in range(len(keys_data)):
239 | ss = keys_data['Final_filters'][i]
240 | # print(i)
241 | ss = str(ss)
242 | ss = ss.lower()
243 | # w_ques = ques.split()
244 | # w_ques = list(set(w_ques))
245 | # w_in_ques = in_ques.split()
246 | # w_in_ques = list(set(w_in_ques))
247 | if ss in w_ques:
248 | w1.append(ss)
249 | if ss in w_in_ques:
250 | w2.append(ss)
251 |
252 |
253 | common = w2 + w1
254 | common_d = list(set(common))
255 | x = len(common) - len(common_d)
256 | print(common)
257 | print(common_d)
258 | print(x)
259 | return x
260 |
261 |
262 | def common_keyword_new(s):
263 | s = re.sub('\[|\]|\,|\'', '', s)
264 | w_ques = s.split(' ')
265 | print(w_ques)
266 | print(w_in_ques)
267 | w1=[]
268 | w2=[]
269 | for i in range(len(keys_data)):
270 | ss = keys_data['Final_filters'][i]
271 | # print(i)
272 | ss = str(ss)
273 | ss = ss.lower()
274 | if ss in w_ques:
275 | w1.append(ss)
276 | if ss in w_in_ques:
277 | w2.append(ss)
278 |
279 |
280 | common = w2 + w1
281 | common_d = list(set(common))
282 | x = len(common) - len(common_d)
283 | print(common)
284 | print(common_d)
285 | print(x)
286 | return x
287 |
288 | def getting_answer(final):
289 | answers_list = []
290 | for j in range(len(final)):
291 | id = final.iloc[j]['ID']
292 | id = int(id)
293 | try:
294 | req = answer_data.loc[answer_data['question_id'] == id]
295 | max_votes = -1  # avoid shadowing the built-in max
296 | id = req.iloc[0]['ID']  # default to the first answer for this question
297 |
298 | for i in range(len(req)):
299 | up_c = int(req.iloc[i]['upvote_count'])
300 | cm_c = int(req.iloc[i]['comment_count'])
301 | if up_c + cm_c > max_votes:
302 | max_votes = up_c + cm_c
303 | id = req.iloc[i]['ID']
304 |
305 | ans = req.loc[id-1]['Answers']  # assumes the answers DataFrame index equals ID-1; server.py selects the row by its ID column instead
306 | answers_list.append(ans)
307 | except:
308 | answers_list.append("Answer not available")
309 | return answers_list
310 |
311 |
312 | def str_to_list(s):
313 | s = re.sub('\[|\]|\,|\'', '', s)
314 | words = s.split(' ')
315 | return words
316 |
317 |
318 | def length_of_c_KW(l_words):
319 | rd = list(set(l_words))
320 | return len(l_words)-len(rd)
321 |
322 | def add(words):
323 | words = words + w_in_ques
324 | return words
325 |
326 | # results_file = open("Results4.txt", "w")
327 |
328 | j=36
329 | while True:
330 | # text = "Why upsee is making compulsory for the students to get admission in allotted college if they want to take part in fifth round counselling.."
331 | text = input("Enter Your Question: ")
332 |
333 | # results_file.write('{} Test Question'.format(j))
334 | # results_file.write('\n')
335 | # results_file.write(text + '\n')
336 | # results_file.write('\n')
337 |
338 | t1 = time.time()
339 | text_cw = text
340 |
341 |
342 | text = preprocessing(text)
343 |
344 | tokenizer = RegexpTokenizer(r'\w+')
345 | words = tokenizer.tokenize(text)
346 |
347 | words = remove_duplicates(words)
348 |
349 | words = remove_stopwords(words)
350 |
351 | max_edit_distance_dictionary = 2
352 | prefix_length = 9
353 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
354 | dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary_final.txt")
355 | term_index = 0 # column of the term in the dictionary text file
356 | count_index = 1 # column of the term frequency in the dictionary text file
357 |
358 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
359 | print("Dictionary file not found")
360 |
361 | max_edit_distance_lookup = 2
362 | suggestion_verbosity = Verbosity.CLOSEST
363 |
364 | words = spell_correction(words)
365 |
366 | max_edit_distance_dictionary = 0
367 | prefix_length = 7
368 | # create object
369 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
370 | # load dictionary
371 | dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary_final.txt")
372 | term_index = 0 # column of the term in the dictionary text file
373 | count_index = 1 # column of the term frequency in the dictionary text file
374 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
375 | print("Dictionary file not found")
376 |
377 | words = word_segmentation(words)
378 |
379 | max_edit_distance_dictionary = 2
380 | prefix_length = 9
381 | sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
382 | dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary_final.txt")
383 | term_index = 0 # column of the term in the dictionary text file
384 | count_index = 1 # column of the term frequency in the dictionary text file
385 |
386 | if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
387 | print("Dictionary file not found")
388 |
389 | max_edit_distance_lookup = 2
390 | suggestion_verbosity = Verbosity.CLOSEST
391 |
392 | words = spell_correction(words)
393 |
394 | words = remove_stopwords(words)
395 | w_in_ques = words
396 | vs = vectors(words)
397 |
398 | avg_v = average_vector(vs)
399 |
400 | # TODO: 1. correction; 2. make generic through user input
401 | now_date = datetime.now()
402 | now_date = now_date.strftime("%d/%m/%Y %H:%M:%S")
403 | filter_date_q = now_date
404 |
405 |
406 |
407 |
408 |
409 |
410 | vector_data['processed_words_list'] = vector_data['processed_words'].apply(str_to_list)
411 |
412 | vector_data['processed_words_list'] = vector_data['processed_words_list'].apply(add)
413 |
414 | print('adding done')
415 |
416 | vector_data['length'] = vector_data['processed_words_list'].apply(length_of_c_KW)
417 |
418 | new_vector_data = vector_data[vector_data['length'] != 0].copy()  # copy so the new columns below do not trigger pandas SettingWithCopyWarning
419 |
420 | print('NEW LENGTH {}'.format(len(new_vector_data)))
421 |
422 |
423 | new_vector_data['Similarity'] = new_vector_data['Average_vector'].apply(match)
424 |
425 | # vector_data = vector_data[vector_data['modified_on']