├── data
│   ├── README
│   └── glove
│       └── README
├── fetch_glove_data.sh
├── README
├── glove_predict.py
├── ck12_wiki_predict.py
└── utils.py

/data/README:
--------------------------------------------------------------------------------
train and test files should be put in this directory
--------------------------------------------------------------------------------
/data/glove/README:
--------------------------------------------------------------------------------
glove data should be put in this directory
--------------------------------------------------------------------------------
/fetch_glove_data.sh:
--------------------------------------------------------------------------------
#!/bin/bash

url=http://nlp.stanford.edu/data/glove.6B.zip
fname=$(basename "$url")

wget "$url"
mv "$fname" data/glove
unzip "data/glove/$fname" -d data/glove/
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
Two simple solutions for the Allen Institute Kaggle competition.

1) GloVe-based solution.
The pre-trained wiki GloVe word vectors are used to calculate the cosine similarity between questions and answers.
Scores are around 0.31875.
Run:
- get GloVe data: fetch_glove_data.sh
- get a prediction: python glove_predict.py

2) IR-based solution.
Retrieve CK-12 topics, fetch a wiki page for each topic, and then rank the documents for each question-answer pair.
Scores are around 0.35375.
Run:
- to get wiki pages and a prediction: python ck12_wiki_predict.py --get_data 1
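
To check a score locally, run either script with --fname training_set.tsv and compare prediction.csv
against the labels. A minimal sketch, assuming the training file keeps the competition's
correctAnswer column:

    import pandas as pd

    # predictions written by glove_predict.py or ck12_wiki_predict.py
    pred = pd.read_csv('prediction.csv')
    # labelled training questions (correctAnswer column assumed present)
    truth = pd.read_csv('data/training_set.tsv', sep='\t')

    merged = truth.merge(pred, on='id', suffixes=('_true', '_pred'))
    print((merged['correctAnswer_true'] == merged['correctAnswer_pred']).mean())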
--------------------------------------------------------------------------------
/glove_predict.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from scipy import linalg
from nltk.corpus import stopwords
import argparse
from utils import tokenize


def predict_answers(data, word2vec, N):

    stop = stopwords.words('english')

    pred_answs = []
    for i in range(data.shape[0]):
        # sum word vectors for the question
        q_vec = np.zeros(N)
        for w in tokenize(data['question'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                q_vec += word2vec[w.lower()]
        q_vec = q_vec / linalg.norm(q_vec)

        # sum word vectors for each answer
        A_vec = np.zeros(N)
        B_vec = np.zeros(N)
        C_vec = np.zeros(N)
        D_vec = np.zeros(N)
        for w in tokenize(data['answerA'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                A_vec += word2vec[w.lower()]

        for w in tokenize(data['answerB'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                B_vec += word2vec[w.lower()]

        for w in tokenize(data['answerC'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                C_vec += word2vec[w.lower()]

        for w in tokenize(data['answerD'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                D_vec += word2vec[w.lower()]

        A_vec = A_vec / linalg.norm(A_vec)
        B_vec = B_vec / linalg.norm(B_vec)
        C_vec = C_vec / linalg.norm(C_vec)
        D_vec = D_vec / linalg.norm(D_vec)

        # choose the answer with the highest cosine similarity to the question
        idx = np.concatenate((A_vec, B_vec, C_vec, D_vec)).reshape(4, N).dot(q_vec).argmax()
        pred_answs.append(["A", "B", "C", "D"][idx])

    return pred_answs

if __name__ == '__main__':
    # parse input arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--fname', type=str, default='validation_set.tsv', help='file name with data')
    parser.add_argument('--N', type=int, default=300, help='embedding size (50, 100, 200, 300 only)')
    args = parser.parse_args()

    # read data
    data = pd.read_csv('data/' + args.fname, sep='\t')

    # read GloVe vectors into a word -> numpy array map
    word2vec = {}
    with open("data/glove/glove.6B." + str(args.N) + "d.txt") as f:
        for line in f:
            l = line.split()
            word2vec[l[0]] = np.array(l[1:], dtype=float)

    # predict
    pred_answs = predict_answers(data, word2vec, args.N)

    # save prediction
    pd.DataFrame({'id': list(data['id']), 'correctAnswer': pred_answs})[['id', 'correctAnswer']].to_csv('prediction.csv', index=False)
--------------------------------------------------------------------------------
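As a quick sanity check, predict_answers can be exercised on hypothetical 3-dimensional vectors
instead of real GloVe data. This is only a sketch: it assumes the NLTK stopwords corpus has been
downloaded and that the packages imported by utils.py are installed, since glove_predict imports
utils.

    import numpy as np
    import pandas as pd
    from glove_predict import predict_answers

    # hypothetical toy vectors, for illustration only
    toy_word2vec = {
        'photosynthesis': np.array([1.0, 0.0, 0.0]),
        'sunlight': np.array([0.9, 0.1, 0.0]),
        'gravity': np.array([0.0, 1.0, 0.0]),
        'magnetism': np.array([0.0, 0.9, 0.1]),
        'erosion': np.array([0.0, 0.0, 1.0]),
    }

    toy_data = pd.DataFrame({
        'id': [1],
        'question': ['Which process lets plants turn sunlight into sugar?'],
        'answerA': ['photosynthesis'],
        'answerB': ['gravity'],
        'answerC': ['magnetism'],
        'answerD': ['erosion'],
    })

    print(predict_answers(toy_data, toy_word2vec, 3))  # expected: ['A']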
/ck12_wiki_predict.py:
--------------------------------------------------------------------------------
import argparse
import utils
import numpy as np
import pandas as pd

# urls used to collect topics
ck12_url_topic = ['https://www.ck12.org/earth-science/', 'http://www.ck12.org/life-science/',
                  'http://www.ck12.org/physical-science/', 'http://www.ck12.org/biology/',
                  'http://www.ck12.org/chemistry/', 'http://www.ck12.org/physics/']
wiki_docs_dir = 'data/wiki_data'


def get_wiki_docs():
    # get keywords
    ck12_keywords = set()
    for url_topic in ck12_url_topic:
        keywords = utils.get_keyword_from_url_topic(url_topic)
        for kw in keywords:
            ck12_keywords.add(kw)

    # get and save wiki docs
    utils.get_save_wiki_docs(ck12_keywords, wiki_docs_dir)


def predict(data, docs_per_q):
    # index docs
    docs_tf, words_idf = utils.get_docstf_idf(wiki_docs_dir)

    res = []

    for index, row in data.iterrows():
        # get answer words
        w_A = set(utils.tokenize(row['answerA']))
        w_B = set(utils.tokenize(row['answerB']))
        w_C = set(utils.tokenize(row['answerC']))
        w_D = set(utils.tokenize(row['answerD']))

        sc_A = 0
        sc_B = 0
        sc_C = 0
        sc_D = 0

        q = row['question']

        # score each answer against the top documents retrieved for the question
        for d in zip(*utils.get_docs_importance_for_question(q, docs_tf, words_idf, docs_per_q))[0]:
            for w in w_A:
                if w in docs_tf[d]:
                    sc_A += 1. * docs_tf[d][w] * words_idf[w]
            for w in w_B:
                if w in docs_tf[d]:
                    sc_B += 1. * docs_tf[d][w] * words_idf[w]
            for w in w_C:
                if w in docs_tf[d]:
                    sc_C += 1. * docs_tf[d][w] * words_idf[w]
            for w in w_D:
                if w in docs_tf[d]:
                    sc_D += 1. * docs_tf[d][w] * words_idf[w]

        res.append(['A', 'B', 'C', 'D'][np.argmax([sc_A, sc_B, sc_C, sc_D])])

    return res

if __name__ == '__main__':
    # parse input arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--fname', type=str, default='validation_set.tsv', help='file name with data')
    parser.add_argument('--docs_per_q', type=int, default=10, help='number of top documents to use per question')
    parser.add_argument('--get_data', type=int, default=0, help='flag to get wiki data for IR')
    args = parser.parse_args()

    if args.get_data:
        get_wiki_docs()

    # read data
    data = pd.read_csv('data/' + args.fname, sep='\t')
    # predict
    res = predict(data, args.docs_per_q)
    # save result
    pd.DataFrame({'id': list(data['id']), 'correctAnswer': res})[['id', 'correctAnswer']].to_csv("prediction.csv", index=False)
--------------------------------------------------------------------------------
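The scoring above relies on the tf-idf index built by utils.get_docstf_idf and the ranking from
utils.get_docs_importance_for_question. A stripped-down sketch of the same arithmetic on toy
documents (hypothetical file names and word lists, not real wiki pages):

    from math import log

    toy_docs = {
        'photosynthesis.txt': ['plants', 'use', 'sunlight', 'to', 'make', 'sugar'],
        'gravity.txt': ['gravity', 'pulls', 'objects', 'toward', 'earth'],
        'cells.txt': ['plants', 'and', 'animals', 'are', 'made', 'of', 'cells'],
    }

    # term frequency per document
    docs_tf = {}
    for name, words in toy_docs.items():
        tf = {}
        for w in words:
            tf[w] = tf.get(w, 0) + 1.0 / len(words)
        docs_tf[name] = tf

    # inverse document frequency over the toy corpus
    vocab = set(w for words in toy_docs.values() for w in words)
    idf = {w: log(float(len(toy_docs)) / sum(1 for tf in docs_tf.values() if w in tf))
           for w in vocab}

    # rank documents for a toy question by the summed tf*idf of its words
    question_words = ['plants', 'sunlight']
    ranking = sorted(((name, sum(tf.get(w, 0) * idf[w] for w in question_words))
                      for name, tf in docs_tf.items()),
                     key=lambda x: x[1], reverse=True)
    print(ranking[0][0])  # expected: photosynthesis.txt

The same tf and idf numbers drive both the document ranking and the per-answer scores in predict().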
/utils.py:
--------------------------------------------------------------------------------
import os
import re
import wikipedia as wiki
from urllib2 import urlopen
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from math import log


def tokenize(review, remove_stopwords=True):
    # Convert a document to a sequence of words,
    # optionally removing stop words. Returns a list of words.
    # 1. Remove non-letters
    review_text = re.sub("[^a-zA-Z]", " ", review)
    # 2. Convert words to lower case and split them
    words = review_text.lower().split()
    # 3. Optionally remove stop words (true by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    # 4. Return a list of words
    return words

def ensure_dir(dir):
    if not os.path.exists(dir):
        os.makedirs(dir)


def get_keyword_from_url_topic(url_topic):
    # Topics include: Earth Science, Life Science, Physical Science, Biology, Chemistry and Physics
    lst_url = []
    html = urlopen(url_topic).read()
    soup = BeautifulSoup(html, 'html.parser')
    for tag_h3 in soup.find_all('h3'):
        url_res = ' '.join(tag_h3.li.a.get('href').strip('/').split('/')[-1].split('-'))
        lst_url.append(url_res)
    return lst_url


def get_save_wiki_docs(keywords, save_folder='data/wiki_data/'):

    ensure_dir(save_folder)

    n_total = len(keywords)
    for i, kw in enumerate(keywords):
        kw = kw.lower()
        print i, n_total, i * 1.0 / n_total, kw
        # reset content each iteration so a failed fetch is skipped instead of
        # reusing the previous page or raising a NameError
        content = None
        try:
            content = wiki.page(kw).content.encode('ascii', 'ignore')
        except wiki.exceptions.DisambiguationError:
            print 'DisambiguationError', kw
        except:
            print 'Error', kw
        if not content:
            continue
        with open(os.path.join(save_folder, '_'.join(kw.split()) + '.txt'), 'w') as f:
            f.write(content)



def get_docstf_idf(dir_data):
    """Index wiki pages:
    returns {document1: {word1: tf, word2: tf, ...}, ...},
            {word1: idf, word2: idf, ...}"""
    docs_tf = {}
    idf = {}
    vocab = set()

    for fname in os.listdir(dir_data):
        dd = {}
        total_w = 0
        path = os.path.join(dir_data, fname)
        for index, line in enumerate(open(path)):
            lst = tokenize(line)
            for word in lst:
                vocab.add(word)
                dd.setdefault(word, 0)
                dd[word] += 1
                total_w += 1

        # normalise raw counts to term frequencies
        for k, v in dd.iteritems():
            dd[k] = 1. * v / total_w

        docs_tf[fname] = dd

    for w in list(vocab):
        docs_with_w = 0
        for path, doc_tf in docs_tf.iteritems():
            if w in doc_tf:
                docs_with_w += 1
        # use float division so idf is not truncated by integer division
        idf[w] = log(1. * len(docs_tf) / docs_with_w)

    return docs_tf, idf


def get_docs_importance_for_question(question, docs_tf, word_idf, max_docs=None):
    question_words = set(tokenize(question))
    # go through each article
    doc_importance = []

    for doc, doc_tf in docs_tf.iteritems():
        doc_imp = 0
        for w in question_words:
            if w in doc_tf:
                doc_imp += doc_tf[w] * word_idf[w]
        doc_importance.append((doc, doc_imp))

    # sort docs by importance
    doc_importance = sorted(doc_importance, key=lambda x: x[1], reverse=True)
    if max_docs:
        return doc_importance[:max_docs]
    else:
        return doc_importance
--------------------------------------------------------------------------------