├── pa3
│   ├── base_classes
│   │   ├── __init__.py
│   │   ├── baseline_score.py
│   │   ├── query.py
│   │   ├── id_map.py
│   │   ├── document.py
│   │   ├── ndcg.py
│   │   └── embedding.py
│   ├── fig
│   │   └── IIR_fig_6.15.png
│   ├── environment.yml
│   ├── README.md
│   ├── pa3-learning-to-rank.ipynb
│   └── pa3-ranking.ipynb
├── .gitattributes
├── pa1
│   ├── .DS_Store
│   ├── environment.yml
│   └── README.md
├── pa2
│   ├── environment.yml
│   ├── README.md
│   └── pa2.ipynb
├── .gitignore
└── README.md
/pa3/base_classes/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /pa1/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lichuanro/Information-Retrieval-and-Web-Search/HEAD/pa1/.DS_Store -------------------------------------------------------------------------------- /pa3/fig/IIR_fig_6.15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lichuanro/Information-Retrieval-and-Web-Search/HEAD/pa3/fig/IIR_fig_6.15.png -------------------------------------------------------------------------------- /pa1/environment.yml: -------------------------------------------------------------------------------- 1 | name: cs276-pa1 2 | channels: 3 | - defaults 4 | dependencies: 5 | - jupyter=1.0.0 6 | - python=3.7.3 7 | - psutil=5.6.1 8 | -------------------------------------------------------------------------------- /pa2/environment.yml: -------------------------------------------------------------------------------- 1 | name: cs276-pa2 2 | channels: 3 | - defaults 4 | dependencies: 5 | - jupyter=1.0.0 6 | - python=3.7.3 7 | - tqdm=4.31.1 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *submission* 2 | *tmp* 3 | *dev* 4 | *data* 5 | *toy* 6 | *output* 7 | *autograding* 8 | *checkpoints* 9 | *pycache* 10 | *.pyc 11 | .DS_Store 12 | -------------------------------------------------------------------------------- /pa3/environment.yml: -------------------------------------------------------------------------------- 1 | name: cs276-pa3 2 | channels: 3 | - defaults 4 | - conda-forge 5 | dependencies: 6 | - jupyter=1.0.0 7 | - python=3.7.3 8 | - tqdm=4.31.1 9 | - scikit-learn=0.20.3 10 | - pypdf2=1.26.0 11 | - pip: 12 | - xgboost==0.82 13 | -------------------------------------------------------------------------------- /pa3/base_classes/baseline_score.py: -------------------------------------------------------------------------------- 1 | class BaselineScorer(AScorer): 2 | def __init__(self, idf): 3 | super().__init__(idf) 4 | 5 | def get_sim_score(self, q, d): 6 | q_vec = self.get_query_vector(q) 7 | d_vec = self.get_doc_vector(q, d) 8 | score = 0 9 | if 'body_hits' in d_vec.keys(): 10 | for term in d_vec['body_hits'].keys(): 11 | score += d_vec['body_hits'][term] 12 | return score 13 | -------------------------------------------------------------------------------- /pa2/README.md: -------------------------------------------------------------------------------- 1 | ## CS276 PA2 2 | 3 | ### Quick Start 4 | Run the following commands from
the root directory of the assignment: 5 | 1. `conda env create -f environment.yml` 6 | 2. `conda activate cs276-pa2` (or `source activate cs276-pa2`) 7 | 3. `jupyter notebook` 8 | 9 | This will launch a Jupyter Notebook session in your web browser, 10 | where you can finish the assignment by filling out `pa2-skeleton.ipynb`. 11 | All further instructions can be found in the Jupyter Notebook. 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Information Retrieval and Web Search 2 | 3 | This is my implementation of the [CS 276](http://web.stanford.edu/class/cs276/) homework. 4 | 5 | pa1 covers **Index Construction**, including an uncompressed and a compressed index and boolean conjunctive queries. It simulates real-world index construction when memory is limited by partitioning the data into multiple folders (blocks). 6 | 7 | pa2 covers **Spelling Correction**, including a Candidate Generator and a Candidate Scorer. 8 | 9 | pa3 covers webpage **ranking**, with implementations of different ranking functions. 10 | 11 | -------------------------------------------------------------------------------- /pa3/README.md: -------------------------------------------------------------------------------- 1 | ## CS276 PA3 2 | 3 | ### Quick Start 4 | Run the following commands from the root directory of the assignment: 5 | 1. `conda env create -f environment.yml` 6 | 2. `conda activate cs276-pa3` (or `source activate cs276-pa3`) 7 | 3. `jupyter notebook` 8 | 9 | (If you have problems with the virtual environment for pa3, have a look at https://piazza.com/class/jtkhpfdjfik115?cid=459.) 10 | 11 | This will launch a Jupyter Notebook session in your web browser, 12 | where you can finish the assignment by first filling out `pa3-ranking.ipynb`, 13 | followed by `pa3-learning-to-rank.ipynb`. 14 | All further instructions can be found in the Jupyter Notebook. 15 | -------------------------------------------------------------------------------- /pa3/base_classes/query.py: -------------------------------------------------------------------------------- 1 | class Query: 2 | """This class is used to store a query sequence.""" 3 | def __init__(self, query): 4 | """Constructs a query.""" 5 | self.query_words = query.split(" ") 6 | 7 | def __iter__(self): 8 | for w in self.query_words: 9 | yield w 10 | 11 | def __eq__(self, other): 12 | if not isinstance(other, Query): 13 | # don't attempt to compare against unrelated types 14 | return False 15 | return self.query_words == other.query_words 16 | 17 | def __hash__(self): 18 | return hash(str(self)) 19 | 20 | def __str__(self): 21 | return " ".join(self.query_words) 22 | 23 | __repr__ = __str__ 24 | -------------------------------------------------------------------------------- /pa1/README.md: -------------------------------------------------------------------------------- 1 | Getting started 2 | =============== 3 | 4 | Install Conda 5 | ------------- 6 | We will use Conda to obtain a standard installation of Python and related useful packages. If you aren't already using it, please install it using the platform-specific instructions provided on the [Conda installation page](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html). 7 | 8 | Create a New Environment 9 | ------------------------ 10 | We have provided an environment file for this assignment.
11 | To create a new environment with the correct packages, run 12 | 13 | conda env create -f environment.yml 14 | 15 | 16 | This will create a new environment named cs276-pa1. You can activate it by running 17 | 18 | conda activate cs276-pa1 19 | 20 | Once you are done working on the assignment, you can deactivate it by running 21 | 22 | conda deactivate 23 | 24 | If you’ve got conda v4.4+, you should use `conda activate cs276-pa1` and similarly `conda deactivate` 25 | 26 | 27 | Open Jupyter Notebook 28 | --------------------- 29 | We'll be using a Jupyter notebook to do our assignment. You can start a new session with 30 | 31 | jupyter notebook 32 | 33 | This will start a local server which you can then connect with using a local browser. 34 | -------------------------------------------------------------------------------- /pa3/base_classes/id_map.py: -------------------------------------------------------------------------------- 1 | class IdMap: 2 | """Helper class to store a mapping from strings to ids.""" 3 | def __init__(self): 4 | self.str_to_id = {} 5 | self.id_to_str = [] 6 | 7 | def __len__(self): 8 | """Return number of terms stored in the IdMap""" 9 | return len(self.id_to_str) 10 | 11 | def _get_str(self, i): 12 | """Returns the string corresponding to a given id (`i`).""" 13 | ### Begin your code 14 | #Out of range error will be thrown automatically, 15 | #no need to handle it separately 16 | return self.id_to_str[i] 17 | 18 | ### End your code 19 | 20 | def _get_id(self, s): 21 | """Returns the id corresponding to a string (`s`). 22 | If `s` is not in the IdMap yet, then assigns a new id and returns the new id. 23 | """ 24 | ### Begin your code 25 | if s not in self.str_to_id: 26 | self.str_to_id[s] = len(self.id_to_str) 27 | self.id_to_str.append(s) 28 | 29 | return self.str_to_id[s] 30 | ### End your code 31 | 32 | def __getitem__(self, key): 33 | """If `key` is a integer, use _get_str; 34 | If `key` is a string, use _get_id;""" 35 | if type(key) is int: 36 | return self._get_str(key) 37 | elif type(key) is str: 38 | return self._get_id(key) 39 | else: 40 | raise TypeError 41 | -------------------------------------------------------------------------------- /pa3/base_classes/document.py: -------------------------------------------------------------------------------- 1 | class Document: 2 | """The class is used to store useful information for a document. 
3 | You can also write the document to a string for debugging.""" 4 | def __init__(self, url): 5 | """Constructs a document with a String url.""" 6 | self.url = url 7 | self.title = None 8 | self.headers = None 9 | self.body_hits = None # term -> [list of positions] 10 | self.body_length = 0 11 | self.pagerank = 0 12 | self.anchors = None # term -> anchor_count 13 | self.debugStr = "" 14 | 15 | def __iter__(self): 16 | for u in self.url: 17 | yield u 18 | 19 | def __str__(self): 20 | result = []; 21 | NEW_LINE = "\n" 22 | # result.append("url: "+ self.url + NEW_LINE); 23 | if (self.title is not None): result.append("title: " + self.title + NEW_LINE); 24 | if (self.headers is not None): result.append("headers: " + str(self.headers) + NEW_LINE); 25 | if (self.body_hits is not None): result.append("body_hits: " + str(self.body_hits) + NEW_LINE); 26 | if (self.body_length != 0): result.append("body_length: " + str(self.body_length) + NEW_LINE); 27 | if (self.pagerank != 0): result.append("pagerank: " + str(self.pagerank) + NEW_LINE); 28 | if (self.anchors is not None): result.append("anchors: " + str(self.anchors) + NEW_LINE); 29 | return " ".join(result) 30 | 31 | __repr__ = __str__ 32 | -------------------------------------------------------------------------------- /pa3/base_classes/ndcg.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | from .query import Query 3 | from .document import Document 4 | import math 5 | 6 | class NDCG: 7 | def get_rel_scores(self, filename): 8 | self.rel_scores = {} 9 | query = "" 10 | with open(filename, 'r') as f: 11 | for line in f: 12 | if line.startswith("q"): 13 | query = line.split(":")[-1].strip() 14 | url_score = {} 15 | self.rel_scores[query] = url_score 16 | else: #urls 17 | tokens = line[line.index(":")+1:].strip().split(" ") 18 | url = tokens[0] 19 | rel = tokens[1] 20 | if float(rel) < 0: 21 | rel = 0 22 | if url_score is not None: 23 | url_score[url] = float(rel) 24 | 25 | def calc_ndcg(self, rels): 26 | local_sum = 0 27 | sorted_sum = 0 28 | for i in range(len(rels)): 29 | rel = rels[i] 30 | local_sum += (2**rel - 1) / (math.log(i + 1 + 1, 2)) 31 | sorted_rels = sorted(rels, reverse=True) 32 | for i in range(len(sorted_rels)): 33 | rel = sorted_rels[i] 34 | sorted_sum += (2**rel - 1) / (math.log(i + 1 + 1, 2)) 35 | 36 | if (sorted_sum == 0): 37 | return 0 38 | else: 39 | return local_sum/sorted_sum 40 | 41 | def read_ranking_calc(self, ranked_result_file): 42 | self.query_ndcg = {} 43 | self.query_docs = {} 44 | cur_q = "" 45 | cur_rels = [] 46 | 47 | with open(ranked_result_file, 'r') as f: 48 | for line in f: 49 | clean_l = line.strip().split(":") 50 | l_type = clean_l[0].strip() 51 | l_content = ":".join(clean_l[1:]).strip() 52 | if l_type == 'query': 53 | if len(cur_rels) > 0: 54 | self.query_ndcg[cur_q] = self.calc_ndcg(cur_rels) 55 | cur_q = l_content 56 | cur_rels = [] 57 | self.query_docs[cur_q] = [] 58 | elif l_type == 'url': 59 | doc = Document(l_content) 60 | self.query_docs[cur_q].append(doc) 61 | if (cur_q in self.rel_scores) and \ 62 | (doc.url in self.rel_scores[cur_q]): 63 | cur_rels.append(self.rel_scores[cur_q][doc.url]) 64 | else: 65 | print("Warning. 
Cannot find query %s with url %s"%(cur_q, doc.url)) 66 | elif l_type == 'title': 67 | doc.title = l_content 68 | # ignore debug line for now 69 | 70 | if len(cur_rels) > 0: 71 | self.query_ndcg[cur_q] = self.calc_ndcg(cur_rels) 72 | cur_q = l_content 73 | cur_rels = [] 74 | 75 | def get_avg_ndcg(self): 76 | sum_ndcg = 0 77 | for i in self.query_ndcg: 78 | sum_ndcg += self.query_ndcg[i] 79 | return sum_ndcg / len(self.query_ndcg) 80 | 81 | def write_ndcg_result(self, ndcg_result_file): 82 | with open(ndcg_result_file, 'w') as f: 83 | for query in self.query_ndcg: 84 | f.write("query: " + query + "\n") 85 | ndcg_score = self.query_ndcg[query] 86 | f.write("ndcg: " + str(ndcg_score) + "\n") 87 | 88 | for doc in self.query_docs[query]: 89 | f.write(" url: " + doc.url + "\n") 90 | f.write(" rating: " + str(self.rel_scores[query][doc.url]) + "\n") 91 | f.write(" title: " + doc.title + "\n") 92 | f.write(" debug:" + "\n") 93 | 94 | print("Write ndcg result to " + ndcg_result_file + " successfully!") 95 | -------------------------------------------------------------------------------- /pa3/base_classes/embedding.py: -------------------------------------------------------------------------------- 1 | """Download and pre-process GloVe. 2 | 3 | Author: 4 | Chris Chute (chute@stanford.edu) 5 | Ashwin Paranjape (ashwinp@cs.stanford.edu) 6 | """ 7 | 8 | import numpy as np 9 | import os 10 | import urllib.request 11 | 12 | from codecs import open 13 | from collections import Counter 14 | from tqdm import tqdm 15 | from zipfile import ZipFile 16 | 17 | def download_url(url, output_path, show_progress=True): 18 | class DownloadProgressBar(tqdm): 19 | def update_to(self, b=1, bsize=1, tsize=None): 20 | if tsize is not None: 21 | self.total = tsize 22 | self.update(b * bsize - self.n) 23 | 24 | if show_progress: 25 | # Download with a progress bar 26 | with DownloadProgressBar(unit='B', unit_scale=True, 27 | miniters=1, desc=url.split('/')[-1]) as t: 28 | urllib.request.urlretrieve(url, 29 | filename=output_path, 30 | reporthook=t.update_to) 31 | else: 32 | # Simple download with no progress bar 33 | urllib.request.urlretrieve(url, output_path) 34 | 35 | def url_to_data_path(url): 36 | return os.path.join('./data/', url.split('/')[-1]) 37 | 38 | class Embedding(): 39 | def __init__(self, url, dim, vocab=None): 40 | """ 41 | Initialize the Embedding class.
42 | 43 | Arguments 44 | --------- 45 | url: str 46 | Url to download the Embeddings file from 47 | 48 | vocab: Set[str] 49 | Set of tokens specifying the subset of embeddings to keep in memory 50 | 51 | Supports reading from glove-like file format and keeps a subset of embeddings in memory for fast-access 52 | """ 53 | 54 | self.url = url 55 | if vocab is not None: 56 | self.vocab = set(vocab) 57 | else: 58 | self.vocab = None 59 | self.OOV = "--OOV--" 60 | self.dim = dim 61 | self.download() 62 | self.load() 63 | 64 | def download(self): 65 | """Download the embeddings file and extract it 66 | """ 67 | 68 | url = self.url 69 | output_path = url.split('/')[-1] 70 | if not os.path.exists(output_path): 71 | print('Downloading {}...'.format(output_path)) 72 | download_url(url, output_path) 73 | 74 | if os.path.exists(output_path) and output_path.endswith('.zip'): 75 | extracted_path = output_path.replace('.zip', '') 76 | if not os.path.exists(extracted_path): 77 | print('Unzipping {}...'.format(output_path)) 78 | with ZipFile(output_path, 'r') as zip_fh: 79 | zip_fh.extractall(extracted_path) 80 | self.emb_folder = extracted_path 81 | self.emb_file = os.path.join(self.emb_folder,self.emb_folder+"."+str(self.dim)+"d.txt") 82 | 83 | def load(self): 84 | """Load a subset (self.vocab) of embeddings into memory""" 85 | print("Pre-processing {} vectors...".format(self.emb_file)) 86 | 87 | embedding_dict = {} 88 | with open(self.emb_file, "r") as fh: 89 | for line in fh: 90 | array = line.split() 91 | word = array[0] 92 | vector = np.array(list(map(float, array[1:]))) 93 | if self.vocab is not None: 94 | if word in self.vocab: 95 | embedding_dict[word] = vector 96 | else: 97 | embedding_dict[word] = vector 98 | if self.vocab is not None: 99 | print("{} / {} tokens have corresponding embedding vector".format(len(embedding_dict), len(self.vocab))) 100 | else: 101 | print("{} tokens have corresponding embedding vector".format(len(embedding_dict))) 102 | embedding_dict[self.OOV] = np.array([0. for _ in range(self.dim)]) 103 | 104 | self.embeddings = embedding_dict 105 | 106 | def __getitem__(self, token): 107 | if token in self.embeddings: 108 | return self.embeddings[token] 109 | else: 110 | return self.embeddings[self.OOV] 111 | 112 | if __name__ == '__main__': 113 | # Get command-line args 114 | glove_url = 'http://nlp.stanford.edu/data/glove.6B.zip' 115 | embedding = Embedding(glove_url, 100, vocab=set(['a', 'an'])) 116 | print(embedding['a']) 117 | print(embedding['an']) 118 | print(embedding['the']) 119 | print(embedding['laksdjflaskfdjalskdfj']) 120 | -------------------------------------------------------------------------------- /pa3/pa3-learning-to-rank.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# PA3 - Learning to Rank (52% of total PA3 grade)\n", 8 | "\n", 9 | "In the first part of this assignment, we examined various ways of ranking documents given a query; however, weights for different features were not learned automatically but set manually. As more and more ranking signals are investigated, integrating more features becomes challenging as it would be hard to come up with a single ranking function like BM25 for arbitrary features. 
\n", 10 | "\n", 11 | "In this assignment, you will be investigating different approaches to the learning to rank task that you have learned: (1) the pointwise approach using linear regression and (2) the pairwise approach employing gradient boosted decision trees. The goal is to let these algorithms learn weights automatically for various features. \n", 12 | "\n", 13 | "More specifically, it involves the following tasks (weights are for the programming assignment as a whole):\n", 14 | "* [Task 1: Pointwise Approach and Linear Regression (6%)](#Task-1:-Pointwise-Approach-and-Linear-Regression): Implement an instance of the pointwise approach with linear regression based on basic tf-idf features\n", 15 | "* [Task 2: Pairwise Approach and Gradient Boosted Decision Trees (6%)](#Task-2:-Pairwise-Approach-and-Gradient-Boosted-Decision-Trees): Implement an instance of the pairwise approach with the help of gradient boosted decision trees, using basic tf-idf features\n", 16 | "* [Task 3: Train Your Best Model (20%)](#Task-3:-Adding-More-Features) Train your best model, and experiment with more features such as BM25, Smallest Window, and PageRank\n", 17 | "* [Task 4: Report (20%)](#Task-4:-Report): Write up a summary report and answer some questions about the above tasks\n", 18 | "* [Extra Credit](#Extra-Credit): Up to 10% in extra credit will be awarded to the top performing models in the class\n", 19 | "\n", 20 | "(Note: 3% of your grade on this programming assignment is reserved for completing the query and ranking quizzes). \n", 21 | "\n", 22 | "__Grading for Tasks 1, 2 and 3__\n", 23 | "- Half of your grade will be based on your model's performance on an autograder test set. Your scores will be visible to you when you submit on Gradescope, but the test set will not. \n", 24 | "- The other half of your grade will be based on your model's performance on a hidden test set. Your scores will only be visible to you when grades for this assignment are released\n", 25 | "- You will get full credit for solutions that receive NDCG scores within reasonable range of the NDCG scores received by the teaching staff." 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "## Submission instructions\n", 33 | "\n", 34 | "1\\. The assignment is due at 4:00 pm PT on the due date (30 May, 2019)\n", 35 | "\n", 36 | "2\\. The notebook will automatically generate **python files** in submission folder. You'll have to upload them to the PA3-code assignment on gradescope. Note that you need to upload all the individual files in the submission folder without zipping it. \n", 37 | "\n", 38 | "3\\. While solving the assignment, do **NOT** change class and method names, autograder tests will fail otherwise. \n", 39 | "\n", 40 | "4\\. You'll also have to upload a **PDF version** of the notebook (which would be primarily used to grade your report section of the notebook) to PA3-PDF assignment on gradescope. Note that directly converting the PDF truncates code cells. To get a usable PDF version, first click on File > Print Preview, which will open in a new tab, then print to PDF using your browser's print functionality. \n", 41 | "\n", 42 | "5\\. Since there are two notebooks, we have included a script to help you merge them together before upload. Run\n", 43 | "```\n", 44 | "python pdfcat pa3-ranking.pdf pa3-learning-to-rank.pdf > pa3-solution.pdf\n", 45 | "``` \n", 46 | "to generate a single concatenated pdf file and upload `pa3-solution.pdf` to gradescope.\n", 47 | "\n", 48 | "6\\. 
After uploading the PDF make sure you **tag all the relevant pages to each question**. We will penalize for mistagged submissions. \n", 49 | "\n", 50 | "7\\. If you are solving the assignment in a team of two, add the other student as a group member after submitting the assignment. Do **NOT** submit the same assignment twice. " 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "## Setup" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 1, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "#Load the tee magic which saves a copy of the cell when executed\n", 67 | "%reload_ext autoreload\n", 68 | "%autoreload 2\n", 69 | "\n", 70 | "%reload_ext autograding_magics" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "The `submission` folder will contain all the files to be submitted, and `base_classes` contains other class definitions which you will not submit." 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 2, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "import os\n", 87 | "try: \n", 88 | " os.mkdir('submission')\n", 89 | "except FileExistsError:\n", 90 | " pass\n", 91 | "try: \n", 92 | " os.mkdir('base_classes')\n", 93 | "except FileExistsError:\n", 94 | " pass" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "You can add additional imports below as required." 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 3, 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "output_type": "stream", 111 | "name": "stdout", 112 | "text": "Overwriting submission/imports2.py\n" 113 | } 114 | ], 115 | "source": [ 116 | "%%tee submission/imports2.py\n", 117 | "\n", 118 | "# You can add additional imports here\n", 119 | "\n", 120 | "import sys\n", 121 | "import pickle as pkl\n", 122 | "import array\n", 123 | "import os\n", 124 | "import timeit\n", 125 | "import contextlib\n", 126 | "import numpy as np\n", 127 | "\n", 128 | "from sklearn.linear_model import LinearRegression\n", 129 | "from sklearn.metrics import mean_squared_error\n", 130 | "from collections import Counter\n", 131 | "from collections import OrderedDict\n", 132 | "import math\n", 133 | "\n", 134 | "import xgboost as xgb\n", 135 | "\n", 136 | "from base_classes.load_train_data import load_train_data\n", 137 | "from base_classes.id_map import IdMap\n", 138 | "from base_classes.ndcg import NDCG\n", 139 | "from base_classes.query import Query\n", 140 | "from base_classes.document import Document" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "# Data" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "**This dataset is the same as what you used in the first part of this Programming Assignment. You do not need to download it again.**\n", 155 | "\n", 156 | "As in the first part of this programming assignment, we have partitioned the data into two sets for you: \n", 157 | "1. Training set (pa3.(signal|rel).train)\n", 158 | "2. Development set (pa3.(signal|rel).dev)\n" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "## Loading previous code\n", 166 | "\n", 167 | "We load the AScorer class that you completed in the first part of Programming Assignment 3. 
Note that you may need to make updates to this class for completing the tasks in this notebook.\n", 168 | "\n", 169 | "We also load the Idf class that you can use to get document frequency values based on a corpus of ~100K documents and ~340K terms. You will also need to load the Rank class for the computation of NDCG scores on the tasks below." 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 4, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "from submission.ascore import AScorer\n", 179 | "from submission.build_idf import Idf\n", 180 | "from submission.rank import Rank" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "# Task 1: Pointwise Approach and Linear Regression (6%)" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "In ranking, each query $q_i$ will be associated with a set of documents, and for each document $j$, we extract a query-document feature vector $x_{i,j}$. There is also a label $y_{i,j}$ associated with each query-document vector $x_{i,j}$." 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "In the pointwise approach, such group structure in ranking is ignored, and we simply view our training data as $\\{(x_{i}, y_{i})\\}$, where each instance consists of a query-document feature vector $x_{i}$ and a label $y_{i}$ (which is a relevance score as in the first part of this programming assignment). The ranking problem amounts to learning a function $f$ such that $f(x_{i})$ closely matches $y_{i}$.\n", 202 | "\n", 203 | "In this task, we consider a very simple instance of the pointwise approach, the *linear regression* approach. That is, we will use a linear function $f$ which gives a score to each query-document feature vector $x$ as follows: $f(x) = wx+b$. Here, the weight vector ${w}$ and the bias term $b$ are parameters that we need to learn to minimize the loss function as defined below:\n", 204 | "\\begin{equation}\n", 205 | "\\sum_{i=1}^m (f(x_{i})-y_{i})^2\n", 206 | "\\end{equation}\n", 207 | "This formulation is also referred to as the *ordinary least squares* approach." 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "### 1.1: Designing Feature Vectors" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "Represent each query-document pair as a five-dimensional vector of query vector-document vector (tf-idf) scores. Each dimension corresponds to a document field -- url, title, header, body, and anchor. Specifically, given a query vector $q$ and a document vector $d_{f}$ of a document field $f$, the tf-idf score is the dot product $q \\cdot d_{f}$. \n", 222 | "\n", 223 | "To start with, use query and document vectors with lnn.ltc weighting (as represented in SMART notation ddd.qqq). In other words, begin by using:\n", 224 | "\n", 225 | "1) For the document vectors, \"lnn\":\n", 226 | " - logarithmic term frequency of query terms in documents\n", 227 | " - no document frequency \n", 228 | " - no normalization\n", 229 | "2) For the query vector, \"ltc\":\n", 230 | " - logarithmic term frequency for words in query\n", 231 | " - idf (inverse document frequency)\n", 232 | " - cosine (i.e., L2) normalization\n", 233 | " \n", 234 | "Then, experiment with a few weighting schemes other than lnn.ltc. 
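To make the default scheme concrete, here is a small worked sketch of one lnn.ltc field score for a single (query, document) pair. The term counts and idf values below are invented for illustration, and a real solution should obtain these quantities through your `AScorer` and the provided `Idf` class rather than from hard-coded dictionaries.

```python
import math
import numpy as np

# Toy example: query "stanford admission" scored against the body field of one document.
# All counts and idf values here are made up purely for illustration.
terms = ["stanford", "admission"]
query_tf = {"stanford": 1, "admission": 1}      # raw counts in the query
body_tf = {"stanford": 3, "admission": 0}       # raw counts in the document body
idf = {"stanford": 1.2, "admission": 2.7}       # would come from the Idf class

# Query side, "ltc": logarithmic tf, multiplied by idf, then cosine (L2) normalized.
q_weights = np.array([(1 + math.log(query_tf[t])) * idf[t] for t in terms])
q_weights = q_weights / np.linalg.norm(q_weights)

# Document side, "lnn": logarithmic tf, no document frequency, no normalization.
d_weights = np.array([1 + math.log(body_tf[t]) if body_tf[t] > 0 else 0.0 for t in terms])

# The body dimension of the five-dimensional feature vector is the dot product q . d_body.
body_feature = float(np.dot(q_weights, d_weights))
print(body_feature)
```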
Refer to http://web.stanford.edu/class/cs276/19handouts/lecture6-tfidf-1per.pdf for other possible weighting schemes. You will report which weighting scheme yields the best performance in Task 4.\n", 235 | "\n", 236 | "A few important notes:\n", 237 | "- Creating these vectors is similar to the exercise you performed in computing cosine similarity in the first part of this programming assignment\n", 238 | "- Make modifications to the AScorer class in order to try to implement other weighting mechanisms \n", 239 | "- **You will use these basic feature vectors for both Task 1 and Task 2. Do not use any other signals or features for Tasks 1 and 2; you will have the opportunity to use these features in Task 3.**\n", 240 | "\n" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 5, 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "output_type": "stream", 250 | "name": "stdout", 251 | "text": "Overwriting submission/features.py\n" 252 | } 253 | ], 254 | "source": [ 255 | "%%tee submission/features.py\n", 256 | "\n", 257 | "def get_features (signal_file, idf):\n", 258 | " '''\n", 259 | " Create a feature vector from the signal file and from the idf_dict. \n", 260 | "\n", 261 | " Args:\n", 262 | " signal_file: filepath to signal file\n", 263 | " idf: object of class Idf (with idf built)\n", 264 | "\n", 265 | " Returns:\n", 266 | " feature_vec (numpy array of dimension (N, 5)): N is the number of (query, document)\n", 267 | " pairs in the relevance file.\n", 268 | " '''\n", 269 | "\n", 270 | " # Experiment with different values of weighting below. Note that this uses dddqqq notation.\n", 271 | " # Make sure to set weighting to the best value prior to submitting your code.\n", 272 | " # You should be able to support lnn.ltc weighting, along with any other weighting that you experiment with\n", 273 | "\n", 274 | "\n", 275 | " WEIGHTING = 'lnnltc' \n", 276 | "\n", 277 | " assert len(WEIGHTING) == 6, \"Invalid weighting scheme.\" \n", 278 | "\n", 279 | " feature_vec = []\n", 280 | "\n", 281 | " ### Begin your code\n", 282 | " query_dict = load_train_data(signal_file)\n", 283 | " scorer = AScorer(idf)\n", 284 | " zones = ['url', 'title', 'headers', 'body_hits', 'anchor']\n", 285 | " for q, d_dict in query_dict.items():\n", 286 | " terms = Counter(q.query_words)\n", 287 | " q_vec = scorer.get_query_vector(q)\n", 288 | " q_vec = np.array([q_vec.get(w) for w in terms.keys()])\n", 289 | " q_vec /= np.linalg.norm(q_vec)\n", 290 | " for d in d_dict.values():\n", 291 | " d_vec = scorer.get_doc_vector(q, d)\n", 292 | " d_arr = []\n", 293 | " get_zone_value = lambda x: np.sum(q_vec * np.array([d_vec[x][w] for w in terms])) if x in d_vec else 0\n", 294 | " for z in zones:\n", 295 | " d_arr.append(get_zone_value(z))\n", 296 | " feature_vec.append(d_arr)\n", 297 | " feature_vec = np.array(feature_vec)\n", 298 | " ### End your code\n", 299 | "\n", 300 | " return feature_vec\n", 301 | "\n", 302 | "\n", 303 | "def get_relevance (relevance_file):\n", 304 | " '''\n", 305 | " Extract relevance scores from the relevance file. This should be a simple wrapper (<10 lines) over\n", 306 | " the get_rel_scores() function in the NDCG class.\n", 307 | "\n", 308 | " Args:\n", 309 | " relevance_file: filepath to relevance file\n", 310 | "\n", 311 | " Returns:\n", 312 | " relevance_vec (numpy array of dimension (N,)): N is the number of (query, document)\n", 313 | " pairs in the relevance file.
\n", 314 | " ndcg_obj: NDCG object which contains relevance scores\n", 315 | " ''' \n", 316 | "\n", 317 | "\n", 318 | " relevance_vec = []\n", 319 | " ndcg_obj = NDCG()\n", 320 | "\n", 321 | " ### Begin your code\n", 322 | " ndcg_obj.get_rel_scores(relevance_file)\n", 323 | " for urls in ndcg_obj.rel_scores.values():\n", 324 | " relevance_vec += list(urls.values())\n", 325 | " relevance_vec = np.array(relevance_vec)\n", 326 | " ### End your code\n", 327 | "\n", 328 | " return relevance_vec, ndcg_obj\n", 329 | "\n", 330 | "\n", 331 | "\n", 332 | " \n", 333 | " \n", 334 | " " 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "### 1.2: Training a Linear Regression Model\n", 342 | "\n", 343 | "Implement the PointwiseLearner class below. You may use the LinearRegression class from the sklearn package. If you use the LinearRegression class, set fit_intercept to true and normalize to False." 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 6, 349 | "metadata": {}, 350 | "outputs": [ 351 | { 352 | "output_type": "stream", 353 | "name": "stdout", 354 | "text": "Overwriting submission/linear_regression.py\n" 355 | } 356 | ], 357 | "source": [ 358 | "%%tee submission/linear_regression.py\n", 359 | "\n", 360 | "class PointwiseLearner:\n", 361 | " \n", 362 | " def __init__(self):\n", 363 | " self.model = None\n", 364 | "\n", 365 | " def train_model (self, x, y):\n", 366 | " \n", 367 | " '''\n", 368 | " - Train your linear regression model using the LinearRegression class \n", 369 | "\n", 370 | " Args:\n", 371 | " x (numpy array of dimension (N, 5)): Feature vector for each query, document pair. \n", 372 | " Dimension is N x 5, where N is the number of query, document pairs. \n", 373 | " Is the independent variable for linear regression. \n", 374 | "\n", 375 | " y (numpy array of dimension (N,)): Relevance score for each query, document pair. \n", 376 | " Is the dependent variable for linear regresion.\n", 377 | "\n", 378 | " Returns: none\n", 379 | " '''\n", 380 | " ### Begin your code\n", 381 | " self.model = LinearRegression()\n", 382 | " self.model.fit(x, y)\n", 383 | " ### End your code\n", 384 | " \n", 385 | " def predict_model (self, x):\n", 386 | " \n", 387 | " '''\n", 388 | " - Output predicted scores based on the trained model.\n", 389 | "\n", 390 | " Args:\n", 391 | " x (numpy array of dimension (N, 5)): Feature vector for each query, document pair. \n", 392 | " Dimension is N x 5, where N is the number of (query, document) pairs. 
\n", 393 | " Predictions are made on this input feature array.\n", 394 | "\n", 395 | " Returns:\n", 396 | " y_pred (numpy array of dimension (N,)): Predicted relevance scores for each query, document pair\n", 397 | " based on the trained linear regression model.\n", 398 | " '''\n", 399 | " ### Begin your code\n", 400 | " return self.model.predict(x)\n", 401 | " ### End your code\n", 402 | " " 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 7, 408 | "metadata": {}, 409 | "outputs": [ 410 | { 411 | "output_type": "stream", 412 | "name": "stdout", 413 | "text": "Total Number of Docs is 98998\nTotal Number of Terms is 347071\n" 414 | } 415 | ], 416 | "source": [ 417 | "lm = PointwiseLearner()\n", 418 | "\n", 419 | "idf = Idf()\n", 420 | "\n", 421 | "#Get train features and relevance\n", 422 | "\n", 423 | "train_signal_file = \"pa3-data/pa3.signal.train\"\n", 424 | "train_rel_file = \"pa3-data/pa3.rel.train\"\n", 425 | "train_features = get_features(train_signal_file, idf)\n", 426 | "train_relevance, train_ndcg = get_relevance(train_rel_file)\n", 427 | "assert train_features.shape[1] == 5, 'Train features are of incorrect shape. They should be 5 dimensions, but got {}'.format(train_predicts.shape[1])\n", 428 | "\n", 429 | "#Train linear regression model\n", 430 | "\n", 431 | "lm.train_model(train_features, train_relevance)\n", 432 | "\n", 433 | "# Get predictions on dev set.\n", 434 | "dev_signal_file = \"pa3-data/pa3.signal.dev\"\n", 435 | "dev_rel_file = \"pa3-data/pa3.rel.dev\"\n", 436 | "dev_features = get_features(dev_signal_file, idf)\n", 437 | "dev_relevance, dev_ndcg = get_relevance(dev_rel_file)\n", 438 | "dev_predicts = lm.predict_model(dev_features)\n", 439 | "\n", 440 | "" 441 | ] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": {}, 446 | "source": [ 447 | "Make sure your code passes the sanity check below." 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": 8, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "assert dev_features.shape[1] == 5, 'Train features are of incorrect shape. They should be 5 dimensions, but got {}'.format(train_predicts.shape[1])\n", 457 | "\n", 458 | "assert dev_relevance.shape[0]== 1187, 'Relevance vector is of incorrect shape. Expected 1187, but got {}'.format(dev_relevance.shape[0])\n", 459 | "\n", 460 | "assert dev_predicts.shape[0] == 1187, 'Predictions are of incorrect shape. Expected 1187, but got {}'.format(dev_predicts.shape[0])" 461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "metadata": {}, 466 | "source": [ 467 | "## Evaluation\n", 468 | "\n", 469 | "Using the predictions from your trained model, compute the mean squared error and NDCG score that you receive. Include the score you received in your report. For the development data set, the course staff received an NDCG score of ~0.83." 
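If it helps to sanity-check your evaluation numbers, the short snippet below reproduces the NDCG formula used in `base_classes/ndcg.py` on a made-up list of relevance ratings. It is only meant to make the metric concrete; for the assignment you should still go through the provided `NDCG` class.

```python
import math

def ndcg(rels):
    # Mirrors base_classes/ndcg.py: gain (2**rel - 1), discount log2(rank + 1),
    # normalized by the DCG of the ideal (descending) ordering of the same ratings.
    dcg = sum((2 ** r - 1) / math.log(i + 2, 2) for i, r in enumerate(rels))
    ideal = sum((2 ** r - 1) / math.log(i + 2, 2)
                for i, r in enumerate(sorted(rels, reverse=True)))
    return dcg / ideal if ideal > 0 else 0

# Toy ratings listed in the order the model ranked the documents (values are invented).
print(ndcg([3, 1, 2]))   # imperfect ordering -> slightly below 1.0
print(ndcg([3, 2, 1]))   # ideal ordering -> exactly 1.0
```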
470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": 9, 475 | "metadata": {}, 476 | "outputs": [], 477 | "source": [ 478 | "def NDCG_calc_for_LTR (dev_ndcg, dev_predicts, out_file=\"ranked_result_default\"):\n", 479 | "\n", 480 | " ''' We provide this function to calculate the average NDCG score given a predicted score and a ground truth score.\n", 481 | " Note that the code below calls rank_with_score() in the Rank class, so the correct value for NDCG \n", 482 | " depends on the correct implmementation of that function.\n", 483 | " Args:\n", 484 | " dev_ndcg (type NDCG): Object that contains the \"ground truth\" relevance scores in dev_ndcg.rel_scores \n", 485 | " dev_predicts: numpy array of dimension (N,) which contains predicted scores for a dataset.\n", 486 | " out_file: filename to which the ranked_result_file is written\n", 487 | " \n", 488 | " Returns: avg_ndcg_score: Scalar that averages NDCG score across all queries. \n", 489 | " \n", 490 | " '''\n", 491 | " idx = 0\n", 492 | " dev_predicts_dict = {}\n", 493 | "\n", 494 | " #Converts the dev_predicts vector into query->url->score dict\n", 495 | " for query, url_dict in dev_ndcg.rel_scores.items():\n", 496 | " query_obj = Query(query) #Converts str to Query object\n", 497 | " dev_curr_dict = {}\n", 498 | " for url in url_dict.keys():\n", 499 | " dev_curr_dict[url] = dev_predicts[idx]\n", 500 | " idx+=1\n", 501 | " dev_predicts_dict[query_obj] = dev_curr_dict\n", 502 | "\n", 503 | " #Orders dev_predicts_dict. This remains a Query->url->score dict after ordering.\n", 504 | " #Note that this depends on your implementation of the rank_with_score() function in the Rank class.\n", 505 | " r = Rank()\n", 506 | " dev_predicts_dict_ordered = r.rank_with_score(dev_predicts_dict)\n", 507 | "\n", 508 | " #Creates a Query->Document->score dict called dev_predicts_ranks that will be written to file.\n", 509 | " dev_data = load_train_data(dev_signal_file) #Query->Document dict\n", 510 | "\n", 511 | " dev_predicts_ranked = {} #The Query->Document->Score dict that will be written to file.\n", 512 | " for query in dev_predicts_dict_ordered:\n", 513 | " doc_to_score = {}\n", 514 | " for url in dev_predicts_dict_ordered[query]:\n", 515 | " doc = dev_data[query][url]\n", 516 | " doc_to_score[doc] = dev_predicts_dict_ordered[query][url]\n", 517 | " dev_predicts_ranked[query] = doc_to_score\n", 518 | "\n", 519 | " #Writes dev_predicts_ranked to file.\n", 520 | " ranked_result_file = os.path.join(\"output\", out_file)\n", 521 | " r.write_ranking_to_file(dev_predicts_ranked, ranked_result_file)\n", 522 | "\n", 523 | " #Uses the NDCG class to get the NDCG score\n", 524 | " dev_ndcg.read_ranking_calc(ranked_result_file)\n", 525 | " avg_ndcg_score = dev_ndcg.get_avg_ndcg()\n", 526 | " return avg_ndcg_score\n", 527 | "" 528 | ] 529 | }, 530 | { 531 | "cell_type": "code", 532 | "execution_count": 10, 533 | "metadata": {}, 534 | "outputs": [ 535 | { 536 | "output_type": "stream", 537 | "name": "stdout", 538 | "text": "Mean Squared Error: 0.8238012404188285\nWrite ranking result to output/ranked_result_pointwise sucessfully!\nAverage NDCG score: 0.7972582415705513\n" 539 | } 540 | ], 541 | "source": [ 542 | "# Compute mean squared error and NDCG Score\n", 543 | "\n", 544 | "mse = mean_squared_error(dev_relevance, dev_predicts)\n", 545 | "\n", 546 | "print (\"Mean Squared Error:\", mse)\n", 547 | "\n", 548 | "print (\"Average NDCG score:\", NDCG_calc_for_LTR(dev_ndcg, dev_predicts, \"ranked_result_pointwise\"))\n", 549 | "\n", 550 | "\n", 
551 | "" 552 | ] 553 | }, 554 | { 555 | "cell_type": "markdown", 556 | "metadata": {}, 557 | "source": [ 558 | "### Task 2: Gradient Boosted Decision Trees (6%)" 559 | ] 560 | }, 561 | { 562 | "cell_type": "markdown", 563 | "metadata": {}, 564 | "source": [ 565 | "We next use the LambdaMART algorithm to implement Gradient Boosted Decision Trees. \n", 566 | "\n", 567 | "LambdaMART is the boosted tree version of an earlier algorithm, LambdaRank. The full evolution of algorithms from RankNet through LambdaRANK, MART and LambdaMART is presented below (Page 16 and 17 are particularly important). \n", 568 | "https://pdfs.semanticscholar.org/0df9/c70875783a73ce1e933079f328e8cf5e9ea2.pdf\n", 569 | "\n", 570 | "The relevant lecture notes can be found here: http://web.stanford.edu/class/cs276/19handouts/lecture15-learning-ranking-1per.pdf" 571 | ] 572 | }, 573 | { 574 | "cell_type": "markdown", 575 | "metadata": {}, 576 | "source": [ 577 | "We can use the XGBoost package to implement LambdaMART. You may find it helpful to read the documentation here: https://xgboost.readthedocs.io/en/latest/get_started.html" 578 | ] 579 | }, 580 | { 581 | "cell_type": "markdown", 582 | "metadata": {}, 583 | "source": [ 584 | "#### Parameter description (not exhaustive, see here for more details): https://xgboost.readthedocs.io/en/latest/parameter.html\n", 585 | "\n", 586 | "General Parameters (**make sure to use the following values**):\n", 587 | "- \"booster\": use \"gbtree\". Uses a tree-based model for boosting\n", 588 | "- \"objective\": use \"rank:pairwise\". Uses the LambdaMART algorithm to minimize pairwise loss. \n", 589 | "- \"eval_metric: use \"ndcg\" (while we will be evaluating your performance solely based on ndcg, feel free to test performance on other metrics)\n", 590 | "\n", 591 | "Hyperparamters to be tuned (not exhaustive):\n", 592 | "- \"eta\": Learning rate\n", 593 | "- \"gamma\": Minimum loss reduction required to make a further partition on a leaf node of the tree\n", 594 | "- \"max_depth\": Maximum depth of a tree\n", 595 | "- \"subsample\": Subsample ratio of training instances to prevent overfitting\n", 596 | "\n", 597 | "When training, you should also experiment with early stopping to prevent overfitting. 
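For example, a minimal sketch of such a setup is shown below; the hyperparameter values are placeholders to tune, and `dtrain`/`ddev` are assumed to be the grouped `DMatrix` objects built in the cells that follow. Monitoring NDCG on the held-out dev set (the last entry of `evals`) makes early stopping reflect generalization rather than training fit.

```python
# Illustrative sketch only -- hyperparameter values are placeholders, not recommendations.
params = {
    "booster": "gbtree",
    "objective": "rank:pairwise",   # LambdaMART-style pairwise objective
    "eval_metric": "ndcg",
    "eta": 0.1,
    "max_depth": 6,
    "gamma": 0.1,
    "subsample": 0.8,
}
evallist = [(dtrain, "train"), (ddev, "dev")]   # the last entry drives early stopping
bst = xgb.train(params, dtrain,
                num_boost_round=200,
                evals=evallist,
                early_stopping_rounds=20)
# best_ntree_limit is available once early stopping is enabled.
preds = bst.predict(ddev, ntree_limit=bst.best_ntree_limit)
```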
Take a look at the description of early stopping here: https://xgboost.readthedocs.io/en/latest/python/python_intro.html" 598 | ] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "execution_count": 11, 603 | "metadata": {}, 604 | "outputs": [], 605 | "source": [ 606 | "train_query_dict = load_train_data(train_signal_file)\n", 607 | "train_groups = []\n", 608 | "for query, url_dict in train_query_dict.items():\n", 609 | " train_groups.append(len(url_dict))\n", 610 | " \n", 611 | "assert sum(train_groups) == 7026, 'Expected 7026 (query, doc) pairs, but got {}'.format(sum(train_groups))\n", 612 | "assert len(train_groups) == 731, 'Expected 731 queries, but got {}'.format(len(train_groups))\n", 613 | "\n", 614 | "\n", 615 | "dev_query_dict = load_train_data(dev_signal_file)\n", 616 | "dev_groups = []\n", 617 | "for query, url_dict in dev_query_dict.items():\n", 618 | " dev_groups.append(len(url_dict))\n", 619 | " \n", 620 | "assert sum(dev_groups) == 1187, 'Expected 1187 (query, doc) pairs, but got {}'.format(sum(train_groups))\n", 621 | "assert len(dev_groups) == 124, 'Expected 124 queries, but got {}'.format(len(train_groups))\n", 622 | "\n", 623 | "dtrain = xgb.DMatrix(train_features, label = train_relevance)\n", 624 | "dtrain.set_group(train_groups)\n", 625 | "ddev = xgb.DMatrix(dev_features, label = dev_relevance) \n", 626 | "ddev.set_group(dev_groups)\n", 627 | "\n", 628 | "" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": 17, 634 | "metadata": {}, 635 | "outputs": [ 636 | { 637 | "output_type": "stream", 638 | "name": "stdout", 639 | "text": "Overwriting submission/gbdt.py\n" 640 | } 641 | ], 642 | "source": [ 643 | "%%tee submission/gbdt.py\n", 644 | "\n", 645 | "class GBDTLearner:\n", 646 | " \n", 647 | " def __init__(self):\n", 648 | " self.params = None\n", 649 | " self.model = None\n", 650 | "\n", 651 | " def train_model (self, dtrain, evallist):\n", 652 | " \n", 653 | " '''\n", 654 | " - Specifies parameters for XGBoost training\n", 655 | " - Trains model\n", 656 | "\n", 657 | " Args:\n", 658 | " dtrain (type DMatrix): DMatrix is a internal data structure that used by XGBoost \n", 659 | " which is optimized for both memory efficiency and training speed.\n", 660 | " \n", 661 | " evallist (array of tuples): The datasets on which the algorithm reports performance as training takes place\n", 662 | " \n", 663 | "\n", 664 | " Returns: none\n", 665 | " '''\n", 666 | " num_rounds = 10 #Experiment with different values of this parameter\n", 667 | " \n", 668 | " ### Begin your code\n", 669 | " self.params = {\n", 670 | " \"booster\":\"gbtree\",\n", 671 | " \"objective\":\"rank:pairwise\",\n", 672 | " \"eval_metric\":\"ndcg\",\n", 673 | " \"eta\":0.01,\n", 674 | " \"max_depth\":10,\n", 675 | " \"gamma\":0.01,\n", 676 | " \"subsample\":1,\n", 677 | " }\n", 678 | " self.model = xgb.train(self.params, dtrain=dtrain, evals=evallist, early_stopping_rounds=num_rounds, num_boost_round=num_rounds)\n", 679 | " ### End your code\n", 680 | " \n", 681 | " def predict_model (self, dtest):\n", 682 | " \n", 683 | " '''\n", 684 | " - Output predicted scores based on the trained model.\n", 685 | "\n", 686 | " Args:\n", 687 | " dtest (type DMatrix): DMatrix that contains the dev/test signal data\n", 688 | "\n", 689 | " Returns:\n", 690 | " y_pred (numpy array of dimension (N,)): Predicted relevance scores for each query, document pair\n", 691 | " based on the trained model.\n", 692 | " '''\n", 693 | " ### Begin your code\n", 694 | " return self.model.predict(dtest, 
self.model.best_ntree_limit)\n", 695 | " ### End your code\n", 696 | "\n", 697 | "\n", 698 | "" 699 | ] 700 | }, 701 | { 702 | "cell_type": "code", 703 | "execution_count": 18, 704 | "metadata": {}, 705 | "outputs": [ 706 | { 707 | "output_type": "stream", 708 | "name": "stdout", 709 | "text": "[0]\ttrain-ndcg:0.830106\nWill train until train-ndcg hasn't improved in 10 rounds.\n[1]\ttrain-ndcg:0.85287\n[2]\ttrain-ndcg:0.865899\n[3]\ttrain-ndcg:0.868449\n[4]\ttrain-ndcg:0.870436\n[5]\ttrain-ndcg:0.872452\n[6]\ttrain-ndcg:0.876593\n[7]\ttrain-ndcg:0.879496\n[8]\ttrain-ndcg:0.880033\n[9]\ttrain-ndcg:0.880491\n" 710 | } 711 | ], 712 | "source": [ 713 | "#Train a gradient boosted decision trees model.\n", 714 | "\n", 715 | "model = GBDTLearner()\n", 716 | "evallist = [(dtrain, 'train')]\n", 717 | "model.train_model(dtrain, evallist)\n", 718 | "\n", 719 | "# Get predictions on dev set.\n", 720 | "\n", 721 | "dev_predicts_gbdt = model.predict_model(ddev)" 722 | ] 723 | }, 724 | { 725 | "cell_type": "code", 726 | "execution_count": 19, 727 | "metadata": {}, 728 | "outputs": [], 729 | "source": [ 730 | "assert dev_predicts_gbdt.shape[0] == 1187, 'Predictions are of incorrect shape. Expected 1187, but got {}'.format(dev_predicts.shape[0])" 731 | ] 732 | }, 733 | { 734 | "cell_type": "code", 735 | "execution_count": 20, 736 | "metadata": {}, 737 | "outputs": [ 738 | { 739 | "output_type": "stream", 740 | "name": "stdout", 741 | "text": "Write ranking result to output/ranked_result_gbdt sucessfully!\nAverage NDCG score: 0.7918533475222841\n" 742 | } 743 | ], 744 | "source": [ 745 | "print (\"Average NDCG score:\", NDCG_calc_for_LTR(dev_ndcg, dev_predicts_gbdt, \"ranked_result_gbdt\"))" 746 | ] 747 | }, 748 | { 749 | "cell_type": "markdown", 750 | "metadata": {}, 751 | "source": [ 752 | "# Task 3: Train your best model (20%)\n", 753 | "\n", 754 | "Putting it all together! In this part, train your best model - and feel free to use additional features! Experiment with the following to see which yields the best performance on the dev set:\n", 755 | "\n", 756 | "1. Using smallest window feature from the first part of this programming assignment\n", 757 | "2. Using BM-25 from the first part of this programming assignment\n", 758 | "3. Using Pagerank from the idf file\n", 759 | "\n", 760 | "In addition, you may also choose to experiment with using word vectors. We provide GLoVE embeddings for the words in our vocabulary, which you can download with the help of embedding.py in the base_classes folder.\n", 761 | "\n", 762 | "The grader will interface exclusively with the train_and_predict function, but you may choose to write several helper functions as required." 
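If you do experiment with word vectors, the `Embedding` class in `base_classes/embedding.py` can be used much as its `__main__` block demonstrates. The sketch below turns it into one possible extra feature (cosine similarity between an averaged query vector and an averaged field vector); the feature definition and the example tokens are illustrative only, not a required design.

```python
import numpy as np
from base_classes.embedding import Embedding

# Downloads and caches the GloVe zip on first use, keeping only the listed vocabulary in memory.
glove = Embedding('http://nlp.stanford.edu/data/glove.6B.zip', 100,
                  vocab={'stanford', 'admission', 'office'})

def avg_vector(tokens, emb):
    # Average the token embeddings; unknown tokens fall back to the all-zero OOV vector.
    return np.mean([emb[t] for t in tokens], axis=0)

def embedding_similarity(query_tokens, field_tokens, emb):
    q, d = avg_vector(query_tokens, emb), avg_vector(field_tokens, emb)
    denom = np.linalg.norm(q) * np.linalg.norm(d)
    return float(np.dot(q, d) / denom) if denom > 0 else 0.0

print(embedding_similarity(['stanford', 'admission'], ['admission', 'office'], glove))
```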
763 | ] 764 | }, 765 | { 766 | "cell_type": "code", 767 | "execution_count": null, 768 | "metadata": {}, 769 | "outputs": [], 770 | "source": [ 771 | "%%tee submission/best_model.py\n", 772 | "\n", 773 | "class BestModel:\n", 774 | " \n", 775 | " def __init__(self):\n", 776 | " ### Begin your code\n", 777 | "\n", 778 | " ### End your code\n", 779 | " \n", 780 | " # You may choose to write other helper functions below \n", 781 | " # (such as to augment feature array with additional features)\n", 782 | " \n", 783 | " ### Begin your code\n", 784 | "\n", 785 | " ### End your code\n", 786 | " \n", 787 | " \n", 788 | " def train_and_predict(self, train_signal_file, train_rel_file, test_signal_file, idf):\n", 789 | " \n", 790 | " '''\n", 791 | " - Receives the training signal and relevance files as parameters\n", 792 | " - Creates a feature vector associated with the signal file\n", 793 | " - Trains the best possible model on the training data\n", 794 | " - Using the trained model, makes a prediction on the test_signal_file\n", 795 | " \n", 796 | " - \n", 797 | "\n", 798 | " Args:\n", 799 | " train_signal_file: filename of training signal\n", 800 | " train_rel_file: filename of training relevance file\n", 801 | " test_signal_file: filename containing dev/test signal\n", 802 | " idf: object of class IDF, containing a fully built idf dictionary\n", 803 | " \n", 804 | "\n", 805 | " Returns: none\n", 806 | " '''\n", 807 | " test_predictions = []\n", 808 | " \n", 809 | " ### Begin your code\n", 810 | "\n", 811 | " ### End your code\n", 812 | " \n", 813 | " return test_predictions" 814 | ] 815 | }, 816 | { 817 | "cell_type": "code", 818 | "execution_count": null, 819 | "metadata": {}, 820 | "outputs": [], 821 | "source": [ 822 | "model = BestModel()\n", 823 | "idf = Idf()\n", 824 | "train_signal_file = \"pa3-data/pa3.signal.train\"\n", 825 | "train_rel_file = \"pa3-data/pa3.rel.train\"\n", 826 | "dev_signal_file = \"pa3-data/pa3.signal.dev\"\n", 827 | "\n", 828 | "dev_predicts_best = model.train_and_predict(train_signal_file, train_rel_file, dev_signal_file, idf)\n", 829 | "\n", 830 | "dev_rel_file = \"pa3-data/pa3.rel.dev\"\n", 831 | "dev_relevance, dev_ndcg = get_relevance(dev_rel_file)\n", 832 | "\n", 833 | "print (\"Average NDCG score:\", NDCG_calc_for_LTR(dev_ndcg, dev_predicts_best, \"ranked_result_best\"))" 834 | ] 835 | }, 836 | { 837 | "cell_type": "markdown", 838 | "metadata": {}, 839 | "source": [ 840 | "## Code submission\n", 841 | "\n", 842 | "You are now ready to submit the code for your assignment. Refer to [submission instructions section](#Submission-instructions). " 843 | ] 844 | }, 845 | { 846 | "cell_type": "markdown", 847 | "metadata": {}, 848 | "source": [ 849 | "# Task 4: Written Report (20%)" 850 | ] 851 | }, 852 | { 853 | "cell_type": "markdown", 854 | "metadata": {}, 855 | "source": [ 856 | "This section is meant to be relatively more open-ended as you describe the model choices you made in this assignment. Please keep your report concise. Be sure to document any design decisions you made, and provide a brief rationale for them. \n", 857 | "\n", 858 | "You may choose to insert cells below to generate tables or plots if required." 859 | ] 860 | }, 861 | { 862 | "cell_type": "markdown", 863 | "metadata": {}, 864 | "source": [ 865 | "### A. 
Design of feature vectors (Task 1 and 2) (3%)\n", 866 | "\n", 867 | "For each (query, document) pair, in designing your feature vector from query vector and document vectors, you had various possible options for (i) term frequency, (ii) document frequency and (iii) normalization. The default option we recommended you start with for the feature vector is lnn.ltc (using the SMART notation ddd.qqq).\n", 868 | "\n", 869 | "What other choices did you experiment with? How did the performance compare across these choices? What might be the rationale for this difference in performance across the various models?" 870 | ] 871 | }, 872 | { 873 | "cell_type": "markdown", 874 | "metadata": {}, 875 | "source": [ 876 | "> Your Answer Here" 877 | ] 878 | }, 879 | { 880 | "cell_type": "markdown", 881 | "metadata": {}, 882 | "source": [ 883 | "### B. Hyperparameter tuning (Task 2) (3%)\n", 884 | "\n", 885 | "Briefly describe the hyperparameters you tuned for your implementation of XGBoost. \n", 886 | "Which hyperparameters were most consequential to the performance of the model?\n", 887 | "\n", 888 | "Provide an intuition, based on your understanding of the LambdaMART algorithm, for why the performance of the model varied as it did with the hyperparameters you tuned." 889 | ] 890 | }, 891 | { 892 | "cell_type": "markdown", 893 | "metadata": {}, 894 | "source": [ 895 | "> Your answer here. " 896 | ] 897 | }, 898 | { 899 | "cell_type": "markdown", 900 | "metadata": {}, 901 | "source": [ 902 | "### C. Model Design and Ablation Analysis (Task 3) (7%)\n", 903 | "\n", 904 | "You had the option to include various additional features in your model design. Which features did you experiment with? Which features did you end up using in your final model, and why? \n", 905 | "\n", 906 | "We expect ablation analysis on which features provided useful signals and which ones did not. Please include at least two plots and/or tables for this question." 907 | ] 908 | }, 909 | { 910 | "cell_type": "markdown", 911 | "metadata": {}, 912 | "source": [ 913 | "> Your answer here" 914 | ] 915 | }, 916 | { 917 | "cell_type": "markdown", 918 | "metadata": {}, 919 | "source": [ 920 | "### D. Error Analysis (Task 3) (7%)\n", 921 | "\n", 922 | "Analyze your errors for the best performing model you trained. Please include at least two plots and/or tables for this assignment. " 923 | ] 924 | }, 925 | { 926 | "cell_type": "markdown", 927 | "metadata": {}, 928 | "source": [ 929 | "> Your Answer Here" 930 | ] 931 | }, 932 | { 933 | "cell_type": "markdown", 934 | "metadata": {}, 935 | "source": [ 936 | "\n", 937 | "# Extra Credit (up to 10%)" 938 | ] 939 | }, 940 | { 941 | "cell_type": "markdown", 942 | "metadata": {}, 943 | "source": [ 944 | "We will give extra credit for best ranking systems in the entire class submitted in Task 3. This is based on the NDCG scores computed on our hidden test data. Include a writeup below that describes the model used, the extensions employed, other models tried, and a hypothesis for why the model used works best. \n", 945 | "\n", 946 | "Extra credit will be provided as follows:\n", 947 | "\n", 948 | "We will provide:\n", 949 | " - 10% for the top few systems in the class (of which 5% is for writeup quality)\n", 950 | " - 5% for the next few systems in the class (of which 5% is for writeup quality)" 951 | ] 952 | }, 953 | { 954 | "cell_type": "markdown", 955 | "metadata": {}, 956 | "source": [ 957 | "> Your writeup here." 
958 | ] 959 | } 960 | ], 961 | "metadata": { 962 | "kernelspec": { 963 | "display_name": "Python 3.7.3 64-bit ('cs276-pa3': conda)", 964 | "language": "python", 965 | "name": "python37364bitcs276pa3conda5d59678e0f834eb89f8c66ff589d5169" 966 | }, 967 | "language_info": { 968 | "codemirror_mode": { 969 | "name": "ipython", 970 | "version": 3 971 | }, 972 | "file_extension": ".py", 973 | "mimetype": "text/x-python", 974 | "name": "python", 975 | "nbconvert_exporter": "python", 976 | "pygments_lexer": "ipython3", 977 | "version": "3.7.3-final" 978 | }, 979 | "toc": { 980 | "base_numbering": 1, 981 | "nav_menu": {}, 982 | "number_sections": true, 983 | "sideBar": true, 984 | "skip_h1_title": false, 985 | "title_cell": "Table of Contents", 986 | "title_sidebar": "Contents", 987 | "toc_cell": false, 988 | "toc_position": {}, 989 | "toc_section_display": true, 990 | "toc_window_display": false 991 | } 992 | }, 993 | "nbformat": 4, 994 | "nbformat_minor": 2 995 | } -------------------------------------------------------------------------------- /pa2/pa2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# CS 276 Programming Assignment 2: Spelling Corrector" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## I. Overview\n", 15 | "\n", 16 | "In this assignment, we will build a probabilistic spelling corrector to automatically correct errors in queries. More formally, given a (possibly corrupt) raw query $R$, our goal is to find the intended query $Q$ which maximizes the probability $P(Q\\mid R)$. That is, we want to guess the query which the user probably meant to submit. By Bayes' Theorem we have\n", 17 | "$$\n", 18 | " P(Q\\mid R) = \\frac{P(R\\mid Q)P(Q)}{P(R)}\\propto P(R\\mid Q)P(Q).\n", 19 | "$$\n", 20 | "Since our goal is to find the value of $Q$ which maximizes $P(Q\\mid R)$, this shows it is sufficient to maximize $P(R\\mid Q)P(Q)$. With the above formulation in mind, we will build a probabilistic spelling corrector consisting of 4 parts:\n", 21 | " 1. **Language Model.**\n", 22 | " Estimates the prior distribution of unigrams and bigrams, allowing us to estimate $P(Q)$. We will use maximum-likelihood estimation, which counts the occurrences of token unigrams and bigrams in the training corpus in order to determine their prior probabilities.\n", 23 | " 2. **Edit Probability Model.**\n", 24 | " Estimates the likelihood of errors that may occur in a query, which allows us to estimate $P(R\\mid Q)$. In particular, this component estimates the probability of characters being mistakenly deleted, inserted, substituted, or transposed in a query term.\n", 25 | " 3. **Candidate Generator.**\n", 26 | " Takes a raw query $R$ submitted by the user, and generates candidates for $Q$.\n", 27 | " 4. **Candidate Scorer.**\n", 28 | " Combines (1), (2), and (3) to compute $Q^{*} = \\arg\\max_{Q}P(Q\\mid R)$. That is, for each $Q$ generated by the candidate generator, the scorer uses the language model to estimate $P(Q)$ and uses the edit probability model to estimate $P(R\\mid Q)$, and finally chooses $Q$ which maximizes $P(Q)P(R\\mid Q)$." 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## II. Assignment Details\n", 36 | "\n", 37 | "The assignment is due at **4:00 PM PST on Tuesday, May 7th, 2019**. We have split the assignment up into the following parts:\n", 38 | " 1. 
[Task 1: Spelling Correction with Uniform Edit Costs](#uniform): **55%** of your total grade for this assignment depends on a correctly implemented solution for Task 1. Your solution will be evaluated on a hidden test set, and full credit will be given to models that are within 1% of the staff implementation's test-set accuracy or higher. We do not publish the test set queries or our accuracy on the test set. However, as a guideline for performance, the staff implementation with the uniform edit probability model gets **82.42% on the dev set.** We will give partial credit on a non-linear scale (which disproportionately favors models that are closer to our threshold for full credit, as an encouragement to squeeze out more performance improvements).\n", 39 | " 2. [Task 2: Spelling Correction with Empirical Edit Costs](#empirical): **25%** of your total grade is based on your implementation of Task 2. Full credit will be granted for accuracy levels within 1% of the staff implementation's test-set accuracy or higher. Again, we do not publish our test set accuracy, but the staff implementation with the empirical edit probability model gets **87.91% on the dev set.** As with Task 1, for lower accuracy levels we will give partial credit on a non-linear scale, with credit accruing more rapidly as your solution gets closer to the target.\n", 40 | " 3. [Written Report](#written): **20%** of your grade is based on the 1-2 page report that you will submit through Gradescope. See [Section VI](#written) for instructions and grading breakdown.\n", 41 | " 4. [Extra Credit (Optional)](#extra): **Up to 10%** extra credit will be awarded for implementing extensions, with an explanation in the report. It is not necessary for the extensions to radically improve accuracy to get credit. As described in [Section VII](#extra), you can also get a small amount of extra credit if your system is a top performer in terms of accuracy or running time.\n", 42 | "\n", 43 | "The submission procedure is the same as in PA1, but we repeat the instructions here for your reference:\n", 44 | " - This assignment should be done in teams of two or individually. Assignments are graded the same for one- and two-person teams.\n", 45 | " - The notebook will automatically generate Python files in the `submission` folder. To submit your assignment, **upload the Python files to the PA2-code assignment on Gradescope.** Note that you need to upload all the individual files in the `submission` folder without zipping it.\n", 46 | " - While solving the assignment, do **NOT** change class and method names, otherwise the autograder tests will fail.\n", 47 | " - You'll also have to **upload a PDF version of the notebook (which would be primarily used to grade your report section of the notebook) to the PA2-PDF assignment on Gradescope.** Note that directly converting the notebook to PDF truncates code cells. To get a usable PDF version, first click on `File > Print Preview`, which will open in a new tab, then print to PDF using your browser's print functionality.\n", 48 | " - After uploading the PDF, make sure you tag all the relevant pages to each question. We reserve the right to penalize for mistagged submissions.\n", 49 | " - If you are solving the assignment in a team of two, add the other student as a group member after submitting the assignment. Do **NOT** submit the same assignment twice."
50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "#### A Note on Numerical Stability\n", 57 | "\n", 58 | "Many of the probabilities we will encounter in this assignment are very small. When we multiply many small numbers together, there is a risk of [underflow](https://en.wikipedia.org/wiki/Arithmetic_underflow). Therefore, it is common practice to perform this type of probability calculation in log space. Recall that:\n", 59 | " 1. The log function is monotonically increasing, therefore $\arg\max p = \arg\max\log p$.\n", 60 | " 2. We have $\log(pq) = \log p + \log q$, and by extension $\log\left(\prod_{i} p_i\right) = \sum_{i}\log p_i$.\n", 61 | "\n", 62 | "As a result, if we want to maximize $P(\textbf{x}) = P(x_1)P(x_2)\cdots P(x_n)$, we can equivalently maximize $\log P(\textbf{x}) = \log P(x_1) + \log P(x_2) + \cdots + \log P(x_n)$. **For numerical stability, we recommend that you use this log-space formulation throughout the assignment.**" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "\n", 70 | "## III. Dataset\n", 71 | "\n", 72 | "The dataset you will be working with for this assignment is available as a zip file at [this link](http://web.stanford.edu/class/cs276/pa/pa2-data.zip). The unzipped data directory will contain the following subdirectories:\n", 73 | " - **Language Modeling Corpus (`pa2-data/corpus/`).** 99,904 documents crawled from the stanford.edu domain. The corpus is organized in a block structure found at `pa2-data/corpus/`, where you'll find 10 files. Each line in a file represents the text of a single document. You will use the tokens in these documents to build a language model.\n", 74 | " - **Query Training Set (`pa2-data/training_set/`).** 819,722 pairs of misspelled queries and their corresponding corrected versions, with each pair separated by an edit distance of at most one. The two queries are tab-separated in the file `pa2-data/training_set/edit1s.txt`. You will use this data to build a probability model for the \"noisy channel\" of spelling errors.\n", 75 | " - **Query Dev Set (`pa2-data/dev_set/`).** 455 pairs of misspelled and corrected queries, which you will use to measure the performance of your model. There are three files in `pa2-data/dev_set/`: the (possibly) misspelled queries are in `queries.txt`, corrected versions are in `gold.txt`, and Google's suggested spelling corrections are in `google.txt`.\n", 76 | " \n", 77 | "Run the following code blocks to import packages, download, and unzip the data."
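Before running those cells, here is a tiny standalone illustration (not part of the assignment code) of the underflow problem that the numerical-stability note above is guarding against:

```python
import math

probs = [1e-100] * 5                       # five very small probabilities
product = 1.0
for p in probs:
    product *= p
print(product)                             # 0.0 -- the product underflows

log_sum = sum(math.log(p) for p in probs)
print(log_sum)                             # about -1151.3, easily representable
# This is why candidate scores are compared in log space rather than converted back.
```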
78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 1, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "%reload_ext autograding_magics" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 2, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "output_type": "stream", 96 | "name": "stdout", 97 | "text": "Overwriting submission/imports.py\n" 98 | } 99 | ], 100 | "source": [ 101 | "%%tee submission/imports.py\n", 102 | "\n", 103 | "# Import modules\n", 104 | "import math\n", 105 | "import os\n", 106 | "import urllib.request\n", 107 | "import zipfile\n", 108 | "from collections import Counter\n", 109 | "from tqdm import tqdm" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 4, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "output_type": "stream", 119 | "name": "stdout", 120 | "text": "Data downloaded and unzipped to pa2-data...\n\nDirectory Structure:\npa2-data/\n - dev_set/\n - corpus/\n - training_set/\n" 121 | } 122 | ], 123 | "source": [ 124 | "# Download dataset\n", 125 | "data_dir = 'pa2-data'\n", 126 | "data_url = 'http://web.stanford.edu/class/cs276/pa/{}.zip'.format(data_dir)\n", 127 | "urllib.request.urlretrieve(data_url, '{}.zip'.format(data_dir))\n", 128 | "\n", 129 | "# Unzip dataset\n", 130 | "with zipfile.ZipFile('{}.zip'.format(data_dir), 'r') as zip_fh:\n", 131 | " zip_fh.extractall()\n", 132 | "print('Data downloaded and unzipped to {}...\\n'.format(data_dir))\n", 133 | "\n", 134 | "# Print the directory structure\n", 135 | "print('Directory Structure:')\n", 136 | "print(data_dir + os.path.sep)\n", 137 | "for sub_dir in os.listdir(data_dir):\n", 138 | " if not sub_dir.startswith('.'):\n", 139 | " print(' - ' + sub_dir + os.path.sep)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "\n", 147 | "## IV. Task 1: Spelling Correction with Uniform Edit Costs (55%)\n", 148 | "\n", 149 | "### IV.1. Language Model\n", 150 | "\n", 151 | "We will now build a language model to estimate $P(Q)$ from the training corpus. We will treat $Q$ as a sequence of terms $(w_1, \\ldots, w_n)$ whose probability is computed as\n", 152 | "$$\n", 153 | "P(w_1, \\ldots, w_n) = P(w_1)P(w_2\\mid w_1)\\cdots P(w_n\\mid w_{n-1}),\n", 154 | "$$\n", 155 | "where $P(w_1)$ is the unigram probability of term $w_1$, and $P(w_{i}\\mid w_{i-1})$ is the bigram probability of $(w_{i-1}, w_i)$ for $i \\in \\{2, \\ldots, n\\}$." 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "#### IV.1.1. Calculating Unigram and Bigram Probabilities\n", 163 | "\n", 164 | "Our language model will use the maximum likelihood estimates (MLE) for both probabilities, which turn out to be their observed frequencies:\n", 165 | "$$\n", 166 | "\\begin{align*}\n", 167 | " P_{\\text{MLE}}(w_i) & = \\frac{\\texttt{count}(w_i)}{T},\n", 168 | " &\n", 169 | " P_{\\text{MLE}}(w_i\\mid w_{i-1}) & = \\frac{\\texttt{count}((w_{i}, w_{i-1}))}{\\texttt{count}(w_{i-1})},\n", 170 | "\\end{align*}\n", 171 | "$$\n", 172 | "where $T$ is the total number of tokens in our corpus, and where $\\texttt{count}$ simply counts occurrences of unigrams or bigrams in the corpus. In summary, computing unigram probabilities $P(w_i)$ and bigram probabilities $P(w_{i}\\mid w_{i-1})$ is a simple matter of counting the unigrams and bigrams that appear throughout the corpus.\n", 173 | "\n", 174 | "Fill out the following code block to count the unigrams and bigrams in our corpus." 
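Before filling in the cell below, the following toy example (using a hypothetical two-document corpus, not the assignment data) shows what the maximum-likelihood estimates above look like once the counts are in hand:

```python
from collections import Counter

docs = ["stanford university campus", "stanford campus map"]  # hypothetical toy corpus

unigram_counts, bigram_counts, total_tokens = Counter(), Counter(), 0
for doc in docs:
    tokens = doc.split()
    unigram_counts.update(tokens)                  # count(w)
    bigram_counts.update(zip(tokens, tokens[1:]))  # count((w_{i-1}, w_i)) in corpus order
    total_tokens += len(tokens)

p_stanford = unigram_counts['stanford'] / total_tokens                 # 2/6
p_campus_given_stanford = (bigram_counts[('stanford', 'campus')]
                           / unigram_counts['stanford'])               # 1/2
print(p_stanford, p_campus_given_stanford)
```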
175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 3, 180 | "metadata": {}, 181 | "outputs": [ 182 | { 183 | "output_type": "stream", 184 | "name": "stdout", 185 | "text": "Overwriting submission/language_model_part1.py\n" 186 | } 187 | ], 188 | "source": [ 189 | "%%tee submission/language_model_part1.py\n", 190 | "\n", 191 | "class LanguageModel:\n", 192 | " \"\"\"Models prior probability of unigrams and bigrams.\"\"\"\n", 193 | "\n", 194 | " def __init__(self, corpus_dir='pa2-data/corpus', lambda_=0.1):\n", 195 | " \"\"\"Iterates over all whitespace-separated tokens in each file in\n", 196 | " `corpus_dir`, and counts the number of occurrences of each unigram and\n", 197 | " bigram. Also keeps track of the total number of tokens in the corpus.\n", 198 | "\n", 199 | " Args:\n", 200 | " corpus_dir (str): Path to directory containing corpus.\n", 201 | " lambda_ (float): Interpolation factor for smoothing by unigram-bigram\n", 202 | " interpolation. You only need to save `lambda_` as an attribute for now, and\n", 203 | " it will be used later in `LanguageModel.get_bigram_logp`. See Section\n", 204 | " IV.1.2. below for further explanation.\n", 205 | " \"\"\"\n", 206 | " self.lambda_ = lambda_\n", 207 | " self.total_num_tokens = 0 # Counts total number of tokens in the corpus\n", 208 | " self.unigram_counts = Counter() # Maps strings w_1 -> count(w_1)\n", 209 | " self.bigram_counts = Counter() # Maps tuples (w_1, w_2) -> count((w_1, w_2))\n", 210 | "\n", 211 | " ### Begin your code\n", 212 | " for file_path in os.listdir(corpus_dir):\n", 213 | " with open(os.path.join(corpus_dir, file_path), 'r') as f:\n", 214 | " for line in f.readlines():\n", 215 | " w_list = line.split()\n", 216 | " bi_gram = list(zip(w_list, w_list[1:]))\n", 217 | " self.unigram_counts.update(w_list)\n", 218 | " self.bigram_counts.update(bi_gram)\n", 219 | " self.total_num_tokens += len(w_list)\n", 220 | " ### End your code" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "Now that we have counted the unigrams and bigrams in our corpus, we will add methods for computing query probabilities. First, however, a note about handling bigrams which never occur in our corpus:\n", 228 | "\n", 229 | "\n", 230 | "#### IV.1.2. Smoothing by Interpolation\n", 231 | "\n", 232 | "The unigram probability model will also serve as our vocabulary, since we are making the assumption that our query language is derived from our document corpus. As a result, we do not need to perform [Laplace smoothing](https://en.wikipedia.org/wiki/Additive_smoothing) on our unigram probabilities, since our candidates will be drawn from this very vocabulary. However, even if we have two query terms that are both members of our query language, there is no guarantee that their corresponding *bigram* appears in our training corpus. To handle this data sparsity problem, we will *interpolate* unigram and bigram probabilities to get our final conditional probability estimates:\n", 233 | "$$\n", 234 | "P(w_2\\mid w_1) = \\lambda P_{\\text{MLE}}(w_2) + (1 - \\lambda)P_{\\text{MLE}}(w_2\\mid w_1).\n", 235 | "$$\n", 236 | "Try setting $\\lambda$ to a small value (say, 0.1) in the beginning, and experiment later with varying this parameter to see if you can get better correction accuracies on the development dataset. However, be careful not to overfit your development dataset. 
(You might consider reserving a small portion of your development data to tune the parameters).\n", 237 | "\n", 238 | "Fill out the functions below to complete our `LanguageModel` class." 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 4, 244 | "metadata": {}, 245 | "outputs": [ 246 | { 247 | "output_type": "stream", 248 | "name": "stdout", 249 | "text": "Overwriting submission/language_model_part2.py\n" 250 | } 251 | ], 252 | "source": [ 253 | "%%tee submission/language_model_part2.py\n", 254 | "\n", 255 | "# NOTE: Syntax on the following line just extends the `LanguageModel` class\n", 256 | "class LanguageModel(LanguageModel):\n", 257 | " def get_unigram_logp(self, unigram):\n", 258 | " \"\"\"Computes the log-probability of `unigram` under this `LanguageModel`.\n", 259 | "\n", 260 | " Args:\n", 261 | " unigram (str): Unigram for which to compute the log-probability.\n", 262 | "\n", 263 | " Returns:\n", 264 | " log_p (float): Log-probability of `unigram` under this\n", 265 | " `LanguageModel`.\n", 266 | " \"\"\"\n", 267 | " ### Begin your code\n", 268 | " return math.log(self.unigram_counts[unigram] / self.total_num_tokens)\n", 269 | " ### End your code\n", 270 | "\n", 271 | " def get_bigram_logp(self, w_1, w_2):\n", 272 | " \"\"\"Computes the log-probability of `unigram` under this `LanguageModel`.\n", 273 | "\n", 274 | " Note:\n", 275 | " Use self.lambda_ for the unigram-bigram interpolation factor.\n", 276 | "\n", 277 | " Args:\n", 278 | " w_1 (str): First word in bigram.\n", 279 | " w_2 (str): Second word in bigram.\n", 280 | "\n", 281 | " Returns:\n", 282 | " log_p (float): Log-probability of `bigram` under this\n", 283 | " `LanguageModel`.\n", 284 | " \"\"\"\n", 285 | " ### Begin your code\n", 286 | " p1 = self.lambda_ * self.unigram_counts[w_2] / self.total_num_tokens\n", 287 | " p2 = (1-self.lambda_) * self.bigram_counts.get((w_1, w_2,), 0) / self.unigram_counts[w_1]\n", 288 | " return math.log(p1+p2)\n", 289 | " ### End your code\n", 290 | "\n", 291 | " def get_query_logp(self, query):\n", 292 | " \"\"\"Computes the log-probability of `query` under this `LanguageModel`.\n", 293 | "\n", 294 | " Args:\n", 295 | " query (str): Whitespace-delimited sequence of terms in the query.\n", 296 | "\n", 297 | " Returns:\n", 298 | " log_p (float): Log-probability assigned to the query under this\n", 299 | " `LanguageModel`.\n", 300 | " \"\"\"\n", 301 | " ### Begin your code\n", 302 | " w_list = query.split()\n", 303 | " p = self.get_unigram_logp(w_list[0])\n", 304 | " for w_1, w_2 in zip(w_list, w_list[1:]):\n", 305 | " p += self.get_bigram_logp(w_1, w_2)\n", 306 | " return p\n", 307 | " ### End your code" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 5, 313 | "metadata": {}, 314 | "outputs": [ 315 | { 316 | "output_type": "stream", 317 | "name": "stdout", 318 | "text": "P(\"stanford university\") == 0.003335533975682097\nP(\"stanfrod universit\") == 1.0458878000447559e-14\nAll tests passed!\n" 319 | } 320 | ], 321 | "source": [ 322 | "# Make sure your implementation passes the following sanity checks\n", 323 | "# Note: Constructing the language model could take 30 seconds or longer\n", 324 | "# We suggest using `tqdm` to track progress in your `LanguageModel.__init__` function.\n", 325 | "lm = LanguageModel()\n", 326 | "\n", 327 | "assert len(lm.unigram_counts) == 347071, 'Invalid num. unigrams: {}'.format(len(lm.unigram_counts))\n", 328 | "# assert len(lm.bigram_counts) == 4497257, 'Invalid num. 
bigrams: {}'.format(len(lm.bigram_counts))\n", 329 | "assert lm.total_num_tokens == 25498340, 'Invalid num. tokens: {}'.format(lm.total_num_tokens)\n", 330 | "\n", 331 | "# Test a reasonable query with and without typos (you should try your own)!\n", 332 | "query_wo_typo = \"stanford university\"\n", 333 | "query_w_typo = \"stanfrod universit\"\n", 334 | "\n", 335 | "p_wo_typo = math.exp(lm.get_query_logp(query_wo_typo))\n", 336 | "p_w_typo = math.exp(lm.get_query_logp(query_w_typo))\n", 337 | "print('P(\"{}\") == {}'.format(query_wo_typo, p_wo_typo))\n", 338 | "print('P(\"{}\") == {}'.format(query_w_typo, p_w_typo))\n", 339 | "if p_wo_typo <= p_w_typo:\n", 340 | " print('Are you sure \"{}\" should be assigned higher probability than \"{}\"?'\n", 341 | " .format(query_w_typo, query_wo_typo))\n", 342 | "print('All tests passed!')" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "metadata": {}, 348 | "source": [ 349 | "### IV.2. Edit Probability Model\n", 350 | "\n", 351 | "The edit probability model attempts to estimate $P(R\\mid Q)$. That is, for a fixed candidate query $Q$, the edit probability model estimates the probability that a (possibly corrupt) raw query $R$ was submitted. We quantify the distance between the candidate query $Q$ and the actual input $R$ using the [Damerau-Levenshtein distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance). In Damerau-Levenshtein distance, the possible edits are **insertion**, **deletion**, **substitution**, and **transposition**, each involving single characters as operands. We have provided a base class for `EditCostModel`s below." 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 6, 357 | "metadata": {}, 358 | "outputs": [ 359 | { 360 | "output_type": "stream", 361 | "name": "stdout", 362 | "text": "Overwriting submission/base_edit_probability_model.py\n" 363 | } 364 | ], 365 | "source": [ 366 | "%%tee submission/base_edit_probability_model.py\n", 367 | "\n", 368 | "class BaseEditProbabilityModel:\n", 369 | " def get_edit_logp(self, edited, original):\n", 370 | " \"\"\"Gets the log-probability of editing `original` to arrive at `edited`.\n", 371 | " The `original` and `edited` arguments are both single terms that are at\n", 372 | " most one edit apart.\n", 373 | " \n", 374 | " Note: The order of the arguments is chosen so that it reads like an\n", 375 | " assignment expression:\n", 376 | " > edited := EDIT_FUNCTION(original)\n", 377 | " or, alternatively, you can think of it as a (unnormalized) conditional probability:\n", 378 | " > log P(edited | original)\n", 379 | "\n", 380 | " Args:\n", 381 | " edited (str): Edited term.\n", 382 | " original (str): Original term.\n", 383 | "\n", 384 | " Returns:\n", 385 | " logp (float): Log-probability of `edited` given `original`\n", 386 | " under this `EditProbabilityModel`.\n", 387 | " \"\"\"\n", 388 | " raise NotImplementedError # Force subclass to implement this method" 389 | ] 390 | }, 391 | { 392 | "cell_type": "markdown", 393 | "metadata": {}, 394 | "source": [ 395 | "**It is important to understand that `get_edit_logp` will be called with `original` and `edited` each being single terms that are at most 1 edit apart.** Moreover, its outputs need not be normalized probabilities that sum to 1 over all possible edits to `original` (you can think of the return value more as a \"likelihood score\" than a true probability). We provide an example usage below for clarity:\n", 396 | "```python\n", 397 | "epm = EditProbabilityModelSubclass(...) 
# You will define such a subclass later\n", 398 | "original = 'stanford'\n", 399 | "edited = 'stanfrod' # Edited by transposing 'o' and 'r'\n", 400 | "score = epm.get_edit_logp(edited, original)\n", 401 | "```" 402 | ] 403 | }, 404 | { 405 | "cell_type": "markdown", 406 | "metadata": {}, 407 | "source": [ 408 | "#### IV.2.1. Uniform-Cost Edit Model\n", 409 | "\n", 410 | "As a first pass, we will implement a *uniform-cost edit model.* This model simplifies the computation of the edit probability by assuming that every individual edit in the Damerau-Levenshtein distance has the same probability. You should try a range of values for your uniform edit probability, but in the beginning 0.01 - 0.10 is appropriate. One important thing to remember in building your model is that the user's input query $R$ may indeed be the right one in a majority of cases (*i.e.,* $R = Q$). Thus we typically choose a high fixed probability for `edited == original`; a reasonable range is 0.90 - 0.95.\n", 411 | "\n", 412 | "The edit probability model that you construct here will be used when you rank candidates for query corrections. The candidate generator (described in the next section) will make one edit at a time, and it will call the edit probability model each time it makes a single edit to a term, summing log-probabilities for multi-edit changes. Therefore, all you need to do in this part is to calculate the probability of `edited` given that it is **at most one edit from `original`.** This means that `get_edit_logp` will be very simple in this case.\n", 413 | "\n", 414 | "Fill out the following class to implement a uniform-cost edit model." 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": 7, 420 | "metadata": {}, 421 | "outputs": [ 422 | { 423 | "output_type": "stream", 424 | "name": "stdout", 425 | "text": "Overwriting submission/uniform_edit_probability_model.py\n" 426 | } 427 | ], 428 | "source": [ 429 | "%%tee submission/uniform_edit_probability_model.py\n", 430 | "\n", 431 | "class UniformEditProbabilityModel(BaseEditProbabilityModel):\n", 432 | " def __init__(self, edit_prob=0.05):\n", 433 | " \"\"\"\n", 434 | " Args:\n", 435 | " edit_prob (float): Probability of a single edit occurring, where\n", 436 | " an edit is an insertion, deletion, substitution, or transposition,\n", 437 | " as defined by the Damerau-Levenshtein distance.\n", 438 | " \"\"\"\n", 439 | " self.edit_prob = edit_prob\n", 440 | "\n", 441 | " def get_edit_logp(self, edited, original):\n", 442 | " \"\"\"Gets the log-probability of editing `original` to arrive at `edited`.\n", 443 | " The `original` and `edited` arguments are both single terms that are at\n", 444 | " most one edit apart.\n", 445 | " \n", 446 | " Note: The order of the arguments is chosen so that it reads like an\n", 447 | " assignment expression:\n", 448 | " > edited := EDIT_FUNCTION(original)\n", 449 | " or, alternatively, you can think of it as a (unnormalized) conditional probability:\n", 450 | " > log P(edited | original)\n", 451 | "\n", 452 | " Args:\n", 453 | " edited (str): Edited term.\n", 454 | " original (str): Original term.\n", 455 | "\n", 456 | " Returns:\n", 457 | " logp (float): Log-probability of `edited` given `original`\n", 458 | " under this `EditProbabilityModel`.\n", 459 | " \"\"\"\n", 460 | " ### Begin your code\n", 461 | " if edited == original:\n", 462 | " return math.log(1 - self.edit_prob)\n", 463 | " return math.log(self.edit_prob)\n", 464 | " ### End your code" 465 | ] 466 | }, 467 | { 468 | "cell_type": "markdown", 
469 | "metadata": {}, 470 | "source": [ 471 | "Make sure you pass the following sanity checks:" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": 8, 477 | "metadata": {}, 478 | "outputs": [ 479 | { 480 | "output_type": "stream", 481 | "name": "stdout", 482 | "text": "All tests passed!\n" 483 | } 484 | ], 485 | "source": [ 486 | "EDIT_PROB = 0.05\n", 487 | "epm = UniformEditProbabilityModel(edit_prob=EDIT_PROB)\n", 488 | "\n", 489 | "# Test a basic edit\n", 490 | "edited, original = 'stanfrod', 'stanford'\n", 491 | "assert math.isclose(epm.get_edit_logp(edited, original), math.log(EDIT_PROB))\n", 492 | "\n", 493 | "# Test a non-edit\n", 494 | "assert math.isclose(epm.get_edit_logp(original, original), math.log(1. - EDIT_PROB))\n", 495 | "\n", 496 | "print('All tests passed!')" 497 | ] 498 | }, 499 | { 500 | "cell_type": "markdown", 501 | "metadata": {}, 502 | "source": [ 503 | "### IV.3. Candidate Generator\n", 504 | "\n", 505 | "Recall that the candidate generator takes a raw query $R$ submitted by the user, and generates candidates for the intended query $Q$. Since we know that more than 97% of spelling errors are found within an edit distance of 2 from the user's intended query, we encourage you to consider possible query corrections that are within distance 2 of $R$. This is the approach taken by Peter Norvig in [his essay on spelling correction](http://norvig.com/spell-correct.html). However, it is not tractable to use a pure \"brute force\" generator that produces all possible strings within distance 2 of $R$, because for any $R$ of non-trivial length, the number of candidates would be enormous. Thus we would have to evaluate the language and edit probability models on a huge number of candidates.\n", 506 | "\n", 507 | "\n", 508 | "#### IV.3.1. Candidate Generator with Restricted Search Space\n", 509 | "\n", 510 | "We can make the naïve approach tractable by aggressively narrowing down the search space while generating candidates. There are many valid approaches to efficient candidate generation, but here are a few basic ideas:\n", 511 | " - Begin by looking at *each individual term* in the query string $R$, and consider all possible edits that are distance 1 from that term.\n", 512 | " - Remember that you might consider hyphens and/or spaces as elements of your character set. This will allow you to consider some relatively common errors, like when a space is accidentally inserted in a word, or two terms in the query were mistakenly separated by a space when they should actually be joined.\n", 513 | " - Each time you generate an edit to a term, make sure that the edited term appears in the dictionary. (Remember that we have assumed that all words in a valid candidate query will be found in our training corpus, as mentioned above in [Section IV.1.2](#smoothing) above).\n", 514 | " - If you have generated possible edits to multiple individual terms, take the Cartesian product over these terms to produce a complete candidate query that includes edits to multiple terms. (But remember that you probably shouldn't go beyond a total edit distance of 2 for the query overall).\n", 515 | " \n", 516 | "Again, there are many possible extensions and variations on the strategies mentioned here. We encourage you to explore some different options, and then describe in your written report the strategies that you ultimately used, and how you optimized their performance. 
Note that **solutions that exhaustively generate and score all possible query candidates at edit distances 1 and 2 will run too slowly and will not receive full credit.**" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": 9, 522 | "metadata": {}, 523 | "outputs": [ 524 | { 525 | "output_type": "stream", 526 | "name": "stdout", 527 | "text": "Overwriting submission/candidate_generator.py\n" 528 | } 529 | ], 530 | "source": [ 531 | "%%tee submission/candidate_generator.py\n", 532 | "\n", 533 | "class CandidateGenerator:\n", 534 | " # Alphabet to use for insertion and substitution\n", 535 | " alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',\n", 536 | " 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',\n", 537 | " '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',\n", 538 | " ' ', ',', '.', '-']\n", 539 | "\n", 540 | " def __init__(self, lm, epm):\n", 541 | " \"\"\"\n", 542 | " Args:\n", 543 | " lm (LanguageModel): Language model to use for prior probabilities, P(Q).\n", 544 | " epm (EditProbabilityModel): Edit probability model to use for P(R|Q).\n", 545 | " \"\"\"\n", 546 | " self.lm = lm\n", 547 | " self.epm = epm\n", 548 | "\n", 549 | " def get_num_oov(self, query):\n", 550 | " \"\"\"Get the number of out-of-vocabulary (OOV) words in `query`.\"\"\"\n", 551 | " return sum(1 for w in query.strip().split()\n", 552 | " if w not in self.lm.unigram_counts)\n", 553 | "\n", 554 | " def filter_and_yield(self, query, lp):\n", 555 | " if query.strip() and self.get_num_oov(query) == 0:\n", 556 | " yield query, lp\n", 557 | "\n", 558 | " def get_candidates(self, query):\n", 559 | " \"\"\"Starts from `query`, and performs EDITS OF DISTANCE <=2 to get new\n", 560 | " candidate queries. To make scoring tractable, only returns/yields\n", 561 | " candidates that satisfy certain criteria (ideas for such criteria are\n", 562 | " described in bullet points above).\n", 563 | "\n", 564 | " Hint: We suggest you implement a helper function that takes a term and\n", 565 | " generates all possible edits of distance one from that term.\n", 566 | " It should probably only return edits that are in the vocabulary\n", 567 | " (i.e., edits for which `self.get_num_oov(edited) == 0`).\n", 568 | "\n", 569 | " Args:\n", 570 | " query (str): Starting query.\n", 571 | "\n", 572 | " Returns:\n", 573 | " Iterable over tuples (cdt, cdt_edit_logp) of candidates and\n", 574 | " their associated edit log-probabilities. 
Return value could be\n", 575 | " a list or a generator yielding tuples of this form.\n", 576 | " \"\"\"\n", 577 | " # Yield the unedited query first\n", 578 | " # We provide this line as an example of how to use `self.filter_and_yield`\n", 579 | " yield from self.filter_and_yield(query, self.epm.get_edit_logp(query, query))\n", 580 | "\n", 581 | " ### Begin your code\n", 582 | " from itertools import product\n", 583 | " from collections import defaultdict\n", 584 | "\n", 585 | " # start from one-term corrections\n", 586 | " one_edit_query = []\n", 587 | " one_edit_qualified = defaultdict(set)\n", 588 | " one_edit_query += self.merge_two_term(query)\n", 589 | " \n", 590 | " term_list = query.split()\n", 591 | " two_edit_candidate = []\n", 592 | " two_edit_query = []\n", 593 | " two_edit_qualified = defaultdict(set)\n", 594 | " for i in range(len(term_list)):\n", 595 | " t = term_list[i]\n", 596 | " if t.isdigit() or len(t) == 1:\n", 597 | " continue\n", 598 | " for c in self.get_candidate_term(t):\n", 599 | " if c in self.lm.unigram_counts:\n", 600 | " edit = ' '.join(term_list[:i]+[c]+term_list[i+1:])\n", 601 | " one_edit_query.append(edit)\n", 602 | " one_edit_qualified[i].add((c, self.lm.unigram_counts[c]))\n", 603 | " elif ' ' in c:\n", 604 | " c1, c2 = c.split(' ')\n", 605 | " if c1 in self.lm.unigram_counts and c2 in self.lm.unigram_counts:\n", 606 | " edit = ' '.join(term_list[:i]+[c]+term_list[i+1:])\n", 607 | " one_edit_query.append(edit)\n", 608 | " else:\n", 609 | " two_edit_candidate.append((c, i))\n", 610 | " for edit in one_edit_query:\n", 611 | " yield from self.filter_and_yield(edit, self.epm.get_edit_logp(edit, query))\n", 612 | " if not one_edit_query:\n", 613 | " # try two-edit-distance corrections\n", 614 | " for c, i in two_edit_candidate:\n", 615 | " for c2 in self.get_candidate_term(c):\n", 616 | " if c2 in self.lm.unigram_counts:\n", 617 | " edit = ' '.join(term_list[:i]+[c2]+term_list[i+1:])\n", 618 | " two_edit_query.append(edit)\n", 619 | " two_edit_qualified[i].add((c2, self.lm.unigram_counts[c2]))\n", 620 | " for edit in two_edit_query:\n", 621 | " yield from self.filter_and_yield(edit, self.epm.get_edit_logp(edit, query))\n", 622 | " if not two_edit_query:\n", 623 | " # try multi-word correction\n", 624 | " all_possible_candidates = []\n", 625 | " for i in range(len(term_list)):\n", 626 | " all_possible_candidates.append(\n", 627 | " [q[0] for q in sorted(list(one_edit_qualified[i] | two_edit_qualified[i]), key=lambda x: x[1], reverse=True)[:5]]\n", 628 | " )\n", 629 | " for ts in product(*all_possible_candidates):\n", 630 | " edit = ' '.join(ts)\n", 631 | " yield from self.filter_and_yield(edit, self.epm.get_edit_logp(edit, query))\n", 632 | " yield query, float('-inf')\n", 633 | "\n", 634 | " def get_candidate_term(self, term):\n", 635 | " candidates = set()\n", 636 | " for i in range(len(term)+1):\n", 637 | " # delete\n", 638 | " if i < len(term):\n", 639 | " candidates.add(term[:i]+term[i+1:])\n", 640 | " # transposition\n", 641 | " if i < len(term)-1:\n", 642 | " candidates.add(term[:i]+term[i:i+2][::-1]+term[i+2:])\n", 643 | " for c in CandidateGenerator.alphabet:\n", 644 | " # insert\n", 645 | " candidates.add(term[:i]+c+term[i:])\n", 646 | " # substitute\n", 647 | " candidates.add(term[:i]+c+term[i+1:])\n", 648 | " return candidates\n", 649 | "\n", 650 | " def merge_two_term(self, query):\n", 651 | " term_list = query.split()\n", 652 | " candidates = []\n", 653 | " if len(term_list) > 1:\n", 654 | " for i in range(len(term_list)-1):\n", 655 | " possible_merge = 
term_list[i]+term_list[i+1]\n", 656 | " if possible_merge in self.lm.unigram_counts:\n", 657 | " candidates.append(' '.join(term_list[:i]+[possible_merge]+term_list[i+2:]))\n", 658 | " return candidates\n", 659 | " ### End your code" 660 | ] 661 | }, 662 | { 663 | "cell_type": "markdown", 664 | "metadata": {}, 665 | "source": [ 666 | "Make sure your candidate generator passes the following sanity checks. Feel free to add more tests here as you see fit." 667 | ] 668 | }, 669 | { 670 | "cell_type": "code", 671 | "execution_count": 102, 672 | "metadata": { 673 | "tags": [ 674 | "outputPrepend", 675 | "outputPrepend", 676 | "outputPrepend", 677 | "outputPrepend", 678 | "outputPrepend", 679 | "outputPrepend" 680 | ] 681 | }, 682 | "outputs": [ 683 | { 684 | "output_type": "stream", 685 | "name": "stdout", 686 | "text": "All tests passed!\n" 687 | } 688 | ], 689 | "source": [ 690 | "cg = CandidateGenerator(lm, epm)\n", 691 | "query = 'stanford university'\n", 692 | "num_candidates = 0\n", 693 | "did_generate_original = False\n", 694 | "for candidate, candidate_logp in cg.get_candidates(query):\n", 695 | " num_candidates += 1\n", 696 | " if candidate == query:\n", 697 | " did_generate_original = True\n", 698 | "\n", 699 | " assert cg.get_num_oov(query) == 0, \\\n", 700 | " \"You should not generate queries with out-of-vocab terms ('{}' has OOV terms)\".format(candidate)\n", 701 | "\n", 702 | "assert 1e2 <= num_candidates <= 1e4, \\\n", 703 | " \"You should generate between 100 and 10,000 terms (generated {})\".format(num_candidates)\n", 704 | "\n", 705 | "assert did_generate_original, \"You should generate the original query ({})\".format(query)\n", 706 | "\n", 707 | "### Begin your code\n", 708 | "queries = ['3421393 42391',\n", 709 | " 'aims to provllde users with swrl unified theories+']\n", 710 | "for q in queries:\n", 711 | " num_candidates = 0\n", 712 | " did_generate_original = False\n", 713 | " for candidate, candidate_logp in cg.get_candidates(q):\n", 714 | " num_candidates += 1\n", 715 | " if candidate == query:\n", 716 | " did_generate_original = True\n", 717 | "\n", 718 | " assert cg.get_num_oov(query) == 0, \\\n", 719 | " \"You should not generate queries with out-of-vocab terms ('{}' has OOV terms)\".format(candidate)\n", 720 | "### End your code\n", 721 | "\n", 722 | "print('All tests passed!')" 723 | ] 724 | }, 725 | { 726 | "cell_type": "markdown", 727 | "metadata": {}, 728 | "source": [ 729 | "### IV.4. Candidate Scorer\n", 730 | "\n", 731 | "The candidate scorer's job is to find the most likely query $Q$ given the raw query $R$. It does this by combining the language model for $P(Q)$, the edit probability model for $P(R\\mid Q)$, and the candidate generator (to get candidates for $Q$). Formally, given raw query $R$, the candidate scorer outputs\n", 732 | "$$\n", 733 | " Q^{*} = \\arg\\max_{Q_{i}} P(Q_{i}\\mid R) = \\arg\\max_{Q_{i}} P(R\\mid Q_{i}) P(Q_{i}),\n", 734 | "$$\n", 735 | "where the max is taken over candidate queries $Q_{i}\\in\\{Q_1, \\ldots, Q_{n}\\}$ produced by the candidate generator given $R$." 736 | ] 737 | }, 738 | { 739 | "cell_type": "markdown", 740 | "metadata": {}, 741 | "source": [ 742 | "#### IV.4.1. 
Candidate Scorer with Weighting\n", 743 | "When combining probabilities from the language model and the edit probability model, we can use a parameter to weight the two models differently:\n", 744 | "$$\n", 745 | " P(Q\\mid R)\\propto P(R\\mid Q)P(Q)^{\\mu}.\n", 746 | "$$\n", 747 | "Start out with $\\mu = 1$, and then experiment later with different values of $\\mu$ to see which one gives you the best spelling correction accuracy. Again, be careful not to overfit your development dataset. \n", 748 | "\n", 749 | "Fill out the following class to complete the spelling corrector with uniform edit cost model." 750 | ] 751 | }, 752 | { 753 | "cell_type": "code", 754 | "execution_count": 10, 755 | "metadata": {}, 756 | "outputs": [ 757 | { 758 | "output_type": "stream", 759 | "name": "stdout", 760 | "text": "Overwriting submission/candidate_scorer.py\n" 761 | } 762 | ], 763 | "source": [ 764 | "%%tee submission/candidate_scorer.py\n", 765 | "\n", 766 | "class CandidateScorer:\n", 767 | " \"\"\"Combines the `LanguageModel`, `EditProbabilityModel`, and\n", 768 | " `CandidateGenerator` to produce the most likely query Q given a raw query R.\n", 769 | " Since the candidate generator already uses the edit probability model, we\n", 770 | " do not need to take the edit probability model as an argument in the constructor.\n", 771 | " \"\"\"\n", 772 | " def __init__(self, lm, cg, mu=1.):\n", 773 | " \"\"\"\n", 774 | " Args:\n", 775 | " lm (LanguageModel): Language model for estimating P(Q).\n", 776 | " cg (CandidateGenerator): Candidate generator for generating possible Q.\n", 777 | " mu (float): Weighting factor for the language model (see write-up).\n", 778 | " Remember that our probability computations are done in log-space.\n", 779 | " \"\"\"\n", 780 | " self.lm = lm\n", 781 | " self.cg = cg\n", 782 | " self.mu = mu\n", 783 | "\n", 784 | " def get_score(self, query, log_edit_prob):\n", 785 | " \"\"\"Uses the language model and `log_edit_prob` to compute the final\n", 786 | " score for a candidate `query`. Uses `mu` as weighting exponent for P(Q).\n", 787 | "\n", 788 | " Args:\n", 789 | " query (str): Candidate query.\n", 790 | " log_edit_prob (float): Log-probability of candidate query given\n", 791 | " original query (i.e., log(P(R|Q), where R is `query`).\n", 792 | "\n", 793 | " Returns:\n", 794 | " log_p (float): Final score for the query, i.e., the log-probability\n", 795 | " of the query.\n", 796 | " \"\"\"\n", 797 | " ### Begin your code\n", 798 | " return log_edit_prob + self.mu * self.lm.get_query_logp(query)\n", 799 | " ### End your code\n", 800 | "\n", 801 | " def correct_spelling(self, r):\n", 802 | " \"\"\"Corrects spelling of raw query `r` to get the intended query `q`.\n", 803 | "\n", 804 | " Args:\n", 805 | " r (str): Raw input query from the user.\n", 806 | "\n", 807 | " Returns:\n", 808 | " q (str): Spell-corrected query. 
That is, the query that maximizes\n", 809 | " P(R|Q)*P(Q) under the language model and edit probability model,\n", 810 | " restricted to Q's generated by the candidate generator.\n", 811 | " \"\"\"\n", 812 | " ### Begin your code\n", 813 | " best_score = None\n", 814 | " best_query = None\n", 815 | " for c_q, log_p in self.cg.get_candidates(r):\n", 816 | " try:\n", 817 | " score = self.get_score(c_q, log_p)\n", 818 | " except:\n", 819 | " score = float('-inf')\n", 820 | " if best_score is None or score > best_score:\n", 821 | " best_query = c_q\n", 822 | " best_score = score\n", 823 | " return best_query\n", 824 | " ### End your code" 825 | ] 826 | }, 827 | { 828 | "cell_type": "code", 829 | "execution_count": 104, 830 | "metadata": { 831 | "tags": [ 832 | "outputPrepend" 833 | ] 834 | }, 835 | "outputs": [ 836 | { 837 | "output_type": "stream", 838 | "name": "stdout", 839 | "text": "Building edit probability model...\nBuilding candidate generator...\nBuilding candidate scorer model...\nRunning spelling corrector...\n\t'stanfrod university' corrected to 'stanford university'\n\t'stanford unviersity' corrected to 'stanford university'\n\t'sanford university' corrected to 'stanford university'\nAll tests passed!\n" 840 | } 841 | ], 842 | "source": [ 843 | "# Assumes LanguageModel lm was already built above\n", 844 | "print('Building edit probability model...')\n", 845 | "epm = UniformEditProbabilityModel()\n", 846 | "print('Building candidate generator...')\n", 847 | "cg = CandidateGenerator(lm, epm)\n", 848 | "print('Building candidate scorer model...')\n", 849 | "cs = CandidateScorer(lm, cg, mu=1.0)\n", 850 | "print('Running spelling corrector...')\n", 851 | "\n", 852 | "# Add your own queries here to test your spelling corrector\n", 853 | "queries = [('stanfrod university', 'stanford university'),\n", 854 | " ('stanford unviersity', 'stanford university'),\n", 855 | " ('sanford university', 'stanford university')]\n", 856 | "for query, expected in queries:\n", 857 | " corrected = cs.correct_spelling(query)\n", 858 | " print(\"\\t'{}' corrected to '{}'\".format(query, corrected))\n", 859 | " assert corrected == expected, \"Expected '{}', got '{}'\".format(expected, corrected)\n", 860 | "print('All tests passed!')" 861 | ] 862 | }, 863 | { 864 | "cell_type": "markdown", 865 | "metadata": {}, 866 | "source": [ 867 | "#### IV.4.2. Dev Set Evaluation (Uniform)\n", 868 | "\n", 869 | "Now that we have constructed a basic spelling corrector, we will evaluate its performance on the held-out dev set. Recall that the dev set is stored across the files in `pa2-data/dev_set/`:\n", 870 | " - `queries.txt`: One raw query $R$ per line.\n", 871 | " - `google.txt`: Google's corrected queries $Q$ (one per line, same order as `queries.txt`).\n", 872 | " - `gold.txt`: Ground-truth queries $Q$ (again, one per line, same order).\n", 873 | " \n", 874 | "Run the following cells to evaluate your spelling corrector on the dev set using your uniform edit probability model. We will also evaluate your model on a private test set after submission. 
For full credit, your spelling corrector with uniform edit probability model should achieve accuracy within 1% of the staff implementation *on the test set.* **We do not provide test set queries, but as a guideline for performance, the staff implementation gets 82.42% accuracy on the dev set.**" 875 | ] 876 | }, 877 | { 878 | "cell_type": "code", 879 | "execution_count": 13, 880 | "metadata": {}, 881 | "outputs": [], 882 | "source": [ 883 | "def dev_eval(candidate_scorer, verbose=False):\n", 884 | " \"\"\"Evaluate `candidate_scorer` on the dev set.\"\"\"\n", 885 | " query_num = 1\n", 886 | " yours_correct = 0\n", 887 | " google_correct = 0\n", 888 | " # Read originals, ground-truths, Google's predictions\n", 889 | " dev_dir = 'pa2-data/dev_set/'\n", 890 | " with tqdm(total=455, unit=' queries') as pbar, \\\n", 891 | " open(os.path.join(dev_dir, 'queries.txt'), 'r') as query_fh, \\\n", 892 | " open(os.path.join(dev_dir, 'gold.txt'), 'r') as gold_fh, \\\n", 893 | " open(os.path.join(dev_dir, 'google.txt'), 'r') as google_fh:\n", 894 | " while True:\n", 895 | " # Read one line\n", 896 | " query = query_fh.readline().rstrip('\\n')\n", 897 | " if not query:\n", 898 | " # Finished all queries\n", 899 | " break\n", 900 | " corrected = candidate_scorer.correct_spelling(query)\n", 901 | " corrected = ' '.join(corrected.split()) # Squash multiple spaces\n", 902 | " gold = gold_fh.readline().rstrip('\\n')\n", 903 | " google = google_fh.readline().rstrip('\\n')\n", 904 | "\n", 905 | " # Count whether correct\n", 906 | " if corrected == gold:\n", 907 | " yours_correct += 1\n", 908 | " if google == gold:\n", 909 | " google_correct += 1\n", 910 | "\n", 911 | " # Print running stats\n", 912 | " yours_accuracy = yours_correct / query_num * 100\n", 913 | " google_accuracy = google_correct / query_num * 100\n", 914 | " if verbose:\n", 915 | " print('QUERY {:03d}'.format(query_num))\n", 916 | " print('---------')\n", 917 | " print('(original): {}'.format(query))\n", 918 | " print('(corrected): {}'.format(corrected))\n", 919 | " print('(google): {}'.format(google))\n", 920 | " print('(gold): {}'.format(gold))\n", 921 | " print('Google accuracy: {}/{} ({:5.2f}%)\\n'\n", 922 | " .format(google_correct, query_num, google_accuracy))\n", 923 | " print('Your accuracy: {}/{} ({:5.2f}%)'\n", 924 | " .format(yours_correct, query_num, yours_accuracy))\n", 925 | " \n", 926 | " pbar.set_postfix(google='{:5.2f}%'.format(google_accuracy),\n", 927 | " yours='{:5.2f}%'.format(yours_accuracy))\n", 928 | " pbar.update()\n", 929 | " query_num += 1" 930 | ] 931 | }, 932 | { 933 | "cell_type": "code", 934 | "execution_count": 106, 935 | "metadata": {}, 936 | "outputs": [ 937 | { 938 | "output_type": "stream", 939 | "name": "stderr", 940 | "text": "100%|██████████| 455/455 [03:50<00:00, 1.97 queries/s, google=83.08%, yours=77.36%]\n" 941 | } 942 | ], 943 | "source": [ 944 | "# Set verbose=True for debugging output\n", 945 | "# For reference, our implementation takes ~1 min, 40 sec to run and gets 82.42% accuracy\n", 946 | "dev_eval(cs, verbose=False)" 947 | ] 948 | }, 949 | { 950 | "cell_type": "markdown", 951 | "metadata": {}, 952 | "source": [ 953 | "\n", 954 | "## V. Task 2: Spelling Correction with Empirical Edit Costs (25%)\n", 955 | "\n", 956 | "\n", 957 | "### V.1. Improved Edit Probability Model\n", 958 | "\n", 959 | "Now that our spelling corrector is working correctly with a basic edit probability model, we will turn our attention to a somewhat more realistic approach to edit probabilities. 
In this task, we will learn these edit probabilities from the empirical error data provided in `pa2-data/training_set/edit1s.txt`." 960 | ] 961 | }, 962 | { 963 | "cell_type": "markdown", 964 | "metadata": {}, 965 | "source": [ 966 | "#### V.1.1. Empirical Edit Costs\n", 967 | "\n", 968 | "As outlined in [Section III](#dataset) above, you have been given a list of query pairs that are precisely edit distance 1 from each other. The first step for this task is to devise a simple algorithm to determine which specific edit exists between the two queries in each pair. By aggregating the counts of all such edits over all queries, you can estimate the probability of each individual edit. The edit probability calculation is described in more detail in the [lecture handout on spelling correction](http://web.stanford.edu/class/cs276/handouts/spell_correction.pdf). As an example, if you need to determine the probability of the letter 'e' being (mistakenly) replaced by the letter 'a' in a query, you should calculate:\n", 969 | "$$\n", 970 | " P(\texttt{sub}[a, e]) = \frac{\texttt{count}(\texttt{sub}[a, e])}{\texttt{count}(e)}.\n", 971 | "$$\n", 972 | "Note that the insertion and deletion operator probabilities are conditioned on the character before the character being operated on, which also means that you should devise an appropriate solution to handle the special case of insertions or deletions occurring at the beginning of a word. Finally, to account for the inevitable problem of data sparsity in our edit training file, you should apply Laplace add-one smoothing to the edit probabilities, as described in the lecture handout (linked above)." 973 | ] 974 | }, 975 | { 976 | "cell_type": "code", 977 | "execution_count": 11, 978 | "metadata": {}, 979 | "outputs": [ 980 | { 981 | "output_type": "stream", 982 | "name": "stdout", 983 | "text": "Overwriting submission/empirical_edit_probability_model.py\n" 984 | } 985 | ], 986 | "source": [ 987 | "%%tee submission/empirical_edit_probability_model.py\n", 988 | "\n", 989 | "class Edit:\n", 990 | " \"\"\"Represents a single edit in Damerau-Levenshtein distance.\n", 991 | " We use this class to count occurrences of different edits in the training data.\n", 992 | " \"\"\"\n", 993 | " INSERTION = 1\n", 994 | " DELETION = 2\n", 995 | " TRANSPOSITION = 3\n", 996 | " SUBSTITUTION = 4\n", 997 | "\n", 998 | " def __init__(self, edit_type, c1=None, c2=None):\n", 999 | " \"\"\"\n", 1000 | " Members:\n", 1001 | " edit_type (int): One of Edit.{INSERTION,DELETION,\n", 1002 | " TRANSPOSITION,SUBSTITUTION}.\n", 1003 | " c1 (str): First (in original) char involved in the edit.\n", 1004 | " c2 (str): Second (in original) char involved in the edit.\n", 1005 | " \"\"\"\n", 1006 | " self.edit_type = edit_type\n", 1007 | " self.c1 = c1\n", 1008 | " self.c2 = c2\n", 1009 | "\n", 1010 | "\n", 1011 | "class EmpiricalEditProbabilityModel(BaseEditProbabilityModel):\n", 1012 | "\n", 1013 | " START_CHAR = '' # Used to indicate start-of-query\n", 1014 | " NO_EDIT_PROB = 0.92 # Hyperparameter for probability assigned to no-edit\n", 1015 | "\n", 1016 | " def __init__(self, training_set_path='pa2-data/training_set/edit1s.txt'):\n", 1017 | " \"\"\"Builds the necessary data structures to compute log-probabilities of\n", 1018 | " distance-1 edits in constant time. 
In particular, counts the unigrams\n", 1019 | " (single characters), bigrams (of 2 characters), alphabet size, and\n", 1020 | " edit count for insertions, deletions, substitutions, and transpositions.\n", 1021 | "\n", 1022 | " Hint: Use the `Edit` class above. It may be easier to write the `get_edit`\n", 1023 | " function first, since you should call that function here.\n", 1024 | "\n", 1025 | " Note: We suggest using tqdm with the size of the training set (819722) to track\n", 1026 | " the initializers progress when parsing the training set file.\n", 1027 | "\n", 1028 | " Args:\n", 1029 | " training_set_path (str): Path to training set of empirical error data.\n", 1030 | " \"\"\"\n", 1031 | " # Your code needs to initialize all four of these data structures\n", 1032 | " self.unigram_counts = Counter() # Maps chars c1 -> count(c1)\n", 1033 | " self.bigram_counts = Counter() # Maps tuples (c1, c2) -> count((c1, c2))\n", 1034 | " self.alphabet_size = 0 # Counts all possible characters\n", 1035 | "\n", 1036 | " # Maps edit-types -> dict mapping tuples (c1, c2) -> count(edit[c1, c2])\n", 1037 | " # Example usage: \n", 1038 | " # > e = Edit(Edit.SUBSTITUTION, 'a', 'b')\n", 1039 | " # > edit_count = self.edit_counts[e.edit_type][(e.c1, e.c2)]\n", 1040 | " self.edit_counts = {edit_type: Counter()\n", 1041 | " for edit_type in (Edit.INSERTION, Edit.DELETION,\n", 1042 | " Edit.SUBSTITUTION, Edit.TRANSPOSITION)}\n", 1043 | "\n", 1044 | " with open(training_set_path, 'r') as training_set:\n", 1045 | " for example in tqdm(training_set, total=819722):\n", 1046 | " edited, original = example.strip().split('\\t')\n", 1047 | "\n", 1048 | " ### Begin your code\n", 1049 | " original_c = [EmpiricalEditProbabilityModel.START_CHAR] + list(original)\n", 1050 | " self.unigram_counts.update(original_c)\n", 1051 | " self.bigram_counts.update(list(zip(original_c, original_c[1:])))\n", 1052 | " \n", 1053 | " edit = self.get_edit(edited, original)\n", 1054 | " if edit:\n", 1055 | " self.edit_counts[edit.edit_type].update([(edit.c1, edit.c2)])\n", 1056 | "\n", 1057 | " self.alphabet_size = len(self.unigram_counts)\n", 1058 | " ### End your code\n", 1059 | "\n", 1060 | " def get_edit(self, edited, original):\n", 1061 | " \"\"\"Gets an `Edit` object describing the type of edit performed on `original`\n", 1062 | " to produce `edited`.\n", 1063 | "\n", 1064 | " Note: Only edits with an edit distance of at most 1 are valid inputs.\n", 1065 | "\n", 1066 | " Args:\n", 1067 | " edited (str): Raw query, which contains exactly one edit from `original`.\n", 1068 | " original (str): True query. 
Want to find the edit which turns this into `edited`.\n", 1069 | "\n", 1070 | " Returns:\n", 1071 | " edit (Edit): `Edit` object representing the edit to apply to `original` to get `edited`.\n", 1072 | " If `edited == original`, returns None.\n", 1073 | " \"\"\"\n", 1074 | " ### Begin your code\n", 1075 | " if edited == original:\n", 1076 | " return None\n", 1077 | " if len(edited) == len(original):\n", 1078 | " e_cnt = Counter(list(edited))\n", 1079 | " o_cnt = Counter(list(original))\n", 1080 | " for c1, c2 in zip(edited, original):\n", 1081 | " if c1 != c2:\n", 1082 | " if e_cnt == o_cnt:\n", 1083 | " # transposition\n", 1084 | " return Edit(Edit.TRANSPOSITION, c1, c2)\n", 1085 | " else:\n", 1086 | " # substitution\n", 1087 | " return Edit(Edit.SUBSTITUTION, c1, c2)\n", 1088 | " else:\n", 1089 | " for i in range(min(len(edited), len(original))):\n", 1090 | " e, o = edited[i], original[i]\n", 1091 | " if e != o:\n", 1092 | " if len(edited) > len(original):\n", 1093 | " # insertion\n", 1094 | " return Edit(Edit.INSERTION, e, original[i-1] if i > 0 else EmpiricalEditProbabilityModel.START_CHAR)\n", 1095 | " else:\n", 1096 | " # deletion\n", 1097 | " return Edit(Edit.DELETION, o, original[i-1] if i > 0 else EmpiricalEditProbabilityModel.START_CHAR)\n", 1098 | " if len(edited) > len(original):\n", 1099 | " # insertion at the end\n", 1100 | " return Edit(Edit.INSERTION, edited[-1], original[-1])\n", 1101 | " else:\n", 1102 | " # deletion at the end\n", 1103 | " return Edit(Edit.DELETION, original[-1], original[-2] if len(original) > 1 else EmpiricalEditProbabilityModel.START_CHAR)\n", 1104 | " ### End your code\n", 1105 | "\n", 1106 | " def get_edit_logp(self, edited, original):\n", 1107 | " \"\"\"Gets the log-probability of editing `original` to arrive at `edited`.\n", 1108 | " The `original` and `edited` arguments are both single terms that are at\n", 1109 | " most one edit apart.\n", 1110 | " \n", 1111 | " Note: The order of the arguments is chosen so that it reads like an\n", 1112 | " assignment expression:\n", 1113 | " > edited := EDIT_FUNCTION(original)\n", 1114 | " or, alternatively, you can think of it as a (unnormalized) conditional probability:\n", 1115 | " > log P(edited | original)\n", 1116 | "\n", 1117 | " Args:\n", 1118 | " edited (str): Edited term.\n", 1119 | " original (str): Original term.\n", 1120 | "\n", 1121 | " Returns:\n", 1122 | " logp (float): Log-probability of `edited` given `original`\n", 1123 | " under this `EditProbabilityModel`.\n", 1124 | " \"\"\"\n", 1125 | " ### Begin your code\n", 1126 | " edit = self.get_edit(edited, original)\n", 1127 | " if edit is None:\n", 1128 | " return math.log(EmpiricalEditProbabilityModel.NO_EDIT_PROB)\n", 1129 | " else:\n", 1130 | " c1, c2 = edit.c1, edit.c2\n", 1131 | " m = self.edit_counts[edit.edit_type].get((c1,c2), 0)\n", 1132 | " \n", 1133 | " if edit.edit_type == Edit.INSERTION or edit.edit_type == Edit.SUBSTITUTION:\n", 1134 | " n = self.unigram_counts.get(c1, 0) + self.alphabet_size\n", 1135 | " elif edit.edit_type == Edit.DELETION or edit.edit_type == Edit.TRANSPOSITION:\n", 1136 | " n = self.bigram_counts.get((c1,c2), 0) + self.alphabet_size * self.alphabet_size\n", 1137 | " \n", 1138 | " return math.log(m + 1) - math.log(n)\n", 1139 | " ### End your code" 1140 | ] 1141 | }, 1142 | { 1143 | "cell_type": "markdown", 1144 | "metadata": {}, 1145 | "source": [ 1146 | "Run the following cells to evaluate your spelling corrector on the dev set using your empirical edit probability model. 
We will also evaluate your model on a private test set after submission. For full credit, your spelling corrector with the empirical edit probability model should achieve accuracy within 1% of the staff implementation *on the test set.* **We do not provide test set queries, but as a guideline for performance, the staff implementation gets 87.91% accuracy on the dev set.**" 1147 | ] 1148 | }, 1149 | { 1150 | "cell_type": "code", 1151 | "execution_count": 12, 1152 | "metadata": {}, 1153 | "outputs": [ 1154 | { 1155 | "output_type": "stream", 1156 | "name": "stderr", 1157 | "text": "100%|██████████| 819722/819722 [00:11<00:00, 72191.44it/s]\n" 1158 | } 1159 | ], 1160 | "source": [ 1161 | "# Build spelling corrector for evaluation on the dev set\n", 1162 | "# For reference, our initialization times are 25 sec for lm, and 1 min, 40 sec for epm\n", 1163 | "lm = LanguageModel()\n", 1164 | "epm = EmpiricalEditProbabilityModel()\n", 1165 | "cg = CandidateGenerator(lm, epm)\n", 1166 | "cs = CandidateScorer(lm, cg, mu=1.0)" 1167 | ] 1168 | }, 1169 | { 1170 | "cell_type": "code", 1171 | "execution_count": 14, 1172 | "metadata": {}, 1173 | "outputs": [ 1174 | { 1175 | "output_type": "stream", 1176 | "name": "stderr", 1177 | "text": "100%|██████████| 455/455 [05:36<00:00, 1.35 queries/s, google=83.08%, yours=79.12%]\n" 1178 | } 1179 | ], 1180 | "source": [ 1181 | "# Set verbose=True for debugging output\n", 1182 | "# For reference, our implementation takes ~2 min, 30 sec to run and gets 87.91% accuracy\n", 1183 | "dev_eval(cs, verbose=False)" 1184 | ] 1185 | }, 1186 | { 1187 | "cell_type": "markdown", 1188 | "metadata": {}, 1189 | "source": [ 1190 | "\n", 1191 | "## VI. Written Report (20%)\n", 1192 | "\n", 1193 | "Be sure to document any design decisions you made, and give some brief rationale for them. Please keep your report concise." 1194 | ] 1195 | }, 1196 | { 1197 | "cell_type": "markdown", 1198 | "metadata": {}, 1199 | "source": [ 1200 | "#### VI.1. Overall System Design (5%)\n", 1201 | "\n", 1202 | "Provide a concise (at most 5 sentences) description of the overall system design." 1203 | ] 1204 | }, 1205 | { 1206 | "cell_type": "markdown", 1207 | "metadata": {}, 1208 | "source": [ 1209 | " > For each term, generate possible corrections (including the original form) and calculate the probability of the resulting queries \n", 1210 | " Choose the query with the highest probability \n", 1211 | " In the candidate generation stage, keep only the top 5 candidates per term when taking the Cartesian product to create the final query" 1212 | ] 1213 | }, 1214 | { 1215 | "cell_type": "markdown", 1216 | "metadata": {}, 1217 | "source": [ 1218 | "#### VI.2. Smoothing and Related Techniques (5%)\n", 1219 | "\n", 1220 | "Give a short analysis of the smoothing techniques used in this assignment. For example, you might produce a plot comparing different values for $\lambda$ in unigram-bigram interpolation." 1221 | ] 1222 | }, 1223 | { 1224 | "cell_type": "markdown", 1225 | "metadata": {}, 1226 | "source": [ 1227 | " > Your Answer Here" 1228 | ] 1229 | }, 1230 | { 1231 | "cell_type": "markdown", 1232 | "metadata": {}, 1233 | "source": [ 1234 | "#### VI.3. Optimizations for Candidate Generation (5%)\n", 1235 | "\n", 1236 | "Provide a brief description of the techniques you used for optimizing candidate generation. Be sure to include an analysis of the amount by which each optimization sped up the overall spelling correction system, as well as any changes in accuracy you were able to measure."
1237 | ] 1238 | }, 1239 | { 1240 | "cell_type": "markdown", 1241 | "metadata": {}, 1242 | "source": [ 1243 | " > Your Answer Here" 1244 | ] 1245 | }, 1246 | { 1247 | "cell_type": "markdown", 1248 | "metadata": {}, 1249 | "source": [ 1250 | "#### VI.4. Tuning Parameters (5%)\n", 1251 | "Provide at least two plots showing how accuracy varies as you change parameter values (*e.g.,* $\\mu$ and $\\lambda$). Comment briefly (1-2 sentences) on each plot." 1252 | ] 1253 | }, 1254 | { 1255 | "cell_type": "markdown", 1256 | "metadata": {}, 1257 | "source": [ 1258 | " > Your Answer Here" 1259 | ] 1260 | }, 1261 | { 1262 | "cell_type": "markdown", 1263 | "metadata": {}, 1264 | "source": [ 1265 | "\n", 1266 | "## VII. Extra Credit (Optional, up to 10%)\n" 1267 | ] 1268 | }, 1269 | { 1270 | "cell_type": "markdown", 1271 | "metadata": {}, 1272 | "source": [ 1273 | "We have listed a few ideas here, but really any extensions that go above and beyond the scope of tasks 1 and 2 will be considered.\n", 1274 | "\n", 1275 | "1. **Expanded edit model.** We saw (or will see) in lecture that there are sometimes spelling errors that may not be within a \"naive\" edit distance 2 of the correct phrase, but that may have a conceptual basis that makes them very common and understandable. (Substituting 'ph' for 'f', or vice versa, is one such example.) Can you incorporate these types of errors into the edit probabilities of your edit probability model?\n", 1276 | "2. **Empirical edit costs using Wikipedia.** In task 2, you used the dataset of queries 1 edit distance apart to learn edit probabilities. If you look at the queries in this dataset, you will observe that most of these queries are related to the Stanford corpus, the same corpus used to build the language model. It would be interesting to explore what happens if the channel model and language model are learned from different datasets (and hence different distributions of the underlying data). To this end, you can use a dataset of spelling errors collected from Wikipedia and available on Peter Norvig’s website (http://norvig.com/ngrams/spell-errors.txt).\n", 1277 | "3. **Alternate Smoothing.** Try other smoothing algorithms (such as Kneser-Ney smoothing) to better capture probabilities in the training corpus.\n", 1278 | "4. **K-gram index.** To deal with unseen words, it is possible to develop a measure for the probability of that word being spelled correctly by developing a character k-gram index over your corpus. For example, a q not followed by a u should lead to a low probability. This index can also help you generate candidate corrections much more efficiently.\n", 1279 | "5. **Levenshtein Automata.** You can do even faster candidate generation using a Levenshtein transducer (http://en.wikipedia.org/wiki/Levenshtein_transducer), which uses a finite state automata for fuzzy matching of words. There is an experimental implementation in Python at https://gist.github.com/491973, but it needs to be generalized to perform the transposition operation too. This tutorial might be helpful: http://blog.notdot.net/2010/07/Damn-Cool-Algorithms-Levenshtein-Automata.\n", 1280 | "\n", 1281 | "Finally, we will give a small amount of extra credit to the best spell correction systems, measured in terms of both accuracy and running time (as computed on our hidden test data). 
The top 5 systems according to either metric will receive 5% each, while the next 15 systems will receive 2.5% each.\n", 1282 | "\n", 1283 | "**If you decide to tackle an extra credit option, give a brief description of your approach and results below.**" 1284 | ] 1285 | }, 1286 | { 1287 | "cell_type": "markdown", 1288 | "metadata": {}, 1289 | "source": [ 1290 | " > Your Answer Here" 1291 | ] 1292 | } 1293 | ], 1294 | "metadata": { 1295 | "kernelspec": { 1296 | "display_name": "Python 3.7.3 64-bit ('cs276-pa2': conda)", 1297 | "language": "python", 1298 | "name": "python37364bitcs276pa2condaeb20c37ab4874652871607aa8ce7f253" 1299 | }, 1300 | "language_info": { 1301 | "codemirror_mode": { 1302 | "name": "ipython", 1303 | "version": 3 1304 | }, 1305 | "file_extension": ".py", 1306 | "mimetype": "text/x-python", 1307 | "name": "python", 1308 | "nbconvert_exporter": "python", 1309 | "pygments_lexer": "ipython3", 1310 | "version": "3.7.3-final" 1311 | } 1312 | }, 1313 | "nbformat": 4, 1314 | "nbformat_minor": 2 1315 | } -------------------------------------------------------------------------------- /pa3/pa3-ranking.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 1 PA3 - Ranking functions (45% of total PA3 grade)\n", 8 | "\n", 9 | "In the first part of PA3, you will devise ranking functions to rank results given some queries and corresponding search results. For each query-document pair, you are provided with several features that will help you rank the documents. You are also provided with a training set consisting of query-document pairs along with their relevance values. We will be implementing **three** different ranking functions and will use the **NDCG** metric for evaluating the effectiveness of the ranking function. The estimation of parameters for the ranking functions will be done manually (i.e., no machine learning). \n", 10 | "\n", 11 | "More specifically, it involves the following tasks:\n", 12 | "\n", 13 | "\n", 14 | "1. [Cosine Similarity (5%)](#V-Task1:-Cosine-Similarity-(5%)) To implement a variant of cosine similarity (with the L1-Norm) as the ranking function\n", 15 | "\n", 16 | "2. [BM25F (15%)](#VI-Task2:-BM25F-(15%)) To implement the BM25F ranking algorithm.\n", 17 | "\n", 18 | "3. [Smallest Window (10%)](#VII-Task3:-Smallest-Window-(10%)) Incorporate window sizes into the ranking algorithm from Task 1 (or Task 2 if you prefer). \n", 19 | "\n", 20 | "4. [Report (15%)](#Report-(15%)) describing your program and answer a set of questions.\n", 21 | "\n", 22 | "\n", 23 | "__Grading for Tasks 1, 2 and 3__\n", 24 | "- Half of your grade will be based on your model's performance on an autograder test set. Your scores will be visible to you when you submit on Gradescope, but the test set will not. \n", 25 | "- The other half of your grade will be based on your model's performance on a hidden test set. Your scores will only be visible to you when grades for this assignment are released\n", 26 | "- You will get full credit for solutions that receive NDCG scores within reasonable range of the NDCG scores received by the teaching staff.\n", 27 | "\n", 28 | "In the next part of PA3 (Learning to rank), you will explore different approaches to learn the parameters for ranking functions using machine learning. 
" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "\n", 36 | "## Submission instructions\n", 37 | "\n", 38 | "1\\. The assignment is due before class at 4:00 pm on the due date (30th May 2019)\n", 39 | "\n", 40 | "2\\. The notebook will automatically generate **python files** in submission folder. You'll have to upload them to the PA3-code assignment on gradescope. Note that you need to upload all the individual files in the submission folder without zipping it. \n", 41 | "\n", 42 | "3\\. While solving the assignment, do **NOT** change class and method names, autograder tests will fail otherwise. \n", 43 | "\n", 44 | "4\\. You'll also have to upload a **PDF version** of the notebook (which would be primarily used to grade your report section of the notebook) to PA3-PDF assignment on gradescope. Note that directly converting the PDF truncates code cells. To get a usable PDF version, first click on File > Print Preview, which will open in a new tab, then print to PDF using your browser's print functionality. \n", 45 | "\n", 46 | "5\\. Since there are two notebooks, we have included a script to help you merge them together before upload. Run\n", 47 | "```\n", 48 | "python pdfcat pa3-ranking.pdf pa3-learning-to-rank.pdf > pa3-solution.pdf\n", 49 | "``` \n", 50 | "to generate a single concatenated pdf file and upload `pa3-solution.pdf` to gradescope.\n", 51 | "\n", 52 | "6\\. After uploading the PDF make sure you **tag all the relevant pages to each question**. We will penalize for mistagged submissions. \n", 53 | "\n", 54 | "7\\. If you are solving the assignment in a team of two, add the other student as a group member after submitting the assignment. Do **NOT** submit the same assignment twice. " 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "## Setup" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 1, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "#Load the tee magic which saves a copy of the cell when executed\n", 71 | "%reload_ext autograding_magics\n", 72 | "%load_ext autoreload\n", 73 | "%autoreload 2" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "The `submission` folder will contain all the files to be submitted, and `base_classes` contains other class definitions which you will not submit." 
81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 2, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "import os\n", 90 | "try: \n", 91 | " os.mkdir('submission')\n", 92 | "except FileExistsError:\n", 93 | " pass\n", 94 | "try:\n", 95 | " open('submission/__init__.py', 'x')\n", 96 | "except FileExistsError:\n", 97 | " pass\n", 98 | "try: \n", 99 | " os.mkdir('base_classes')\n", 100 | "except FileExistsError:\n", 101 | " pass\n", 102 | "try:\n", 103 | " open('base_classes/__init__.py', 'x')\n", 104 | "except FileExistsError:\n", 105 | " pass\n", 106 | "try: \n", 107 | " os.mkdir('output')\n", 108 | "except FileExistsError:\n", 109 | " pass" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 3, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "output_type": "stream", 119 | "name": "stdout", 120 | "text": "Writing submission/imports.py\n" 121 | } 122 | ], 123 | "source": [ 124 | "%%tee submission/imports.py\n", 125 | "\n", 126 | "# You can add additional imports here\n", 127 | "import sys\n", 128 | "import pickle as pkl\n", 129 | "import array\n", 130 | "import os\n", 131 | "import timeit\n", 132 | "import contextlib\n", 133 | "from collections import OrderedDict, Counter\n", 134 | "import math\n", 135 | "\n", 136 | "import sys\n", 137 | "from base_classes.load_train_data import load_train_data\n", 138 | "from base_classes.id_map import IdMap\n", 139 | "from base_classes.ndcg import NDCG\n", 140 | "from base_classes.query import Query\n", 141 | "from base_classes.document import Document\n", 142 | "import numpy as np" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "# II. Data " 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "The data for this assignment is available as a .zip file at: http://web.stanford.edu/class/cs276/pa/pa3-data.zip. The following code puts the data folder under the current directory. We have partitioned the data into two sets for you: \n", 157 | "1. Training set of 731 queries (pa3.(signal|rel).train)\n", 158 | "2. Development set of 124 queries (pa3.(signal|rel).dev)\n", 159 | "\n", 160 | "The idea is that while tuning and maximizing performance on the training set, you should also verify how well the tuned parameters are doing on the development set to ensure you are not overfitting your model. There is a hidden test set of 124 queries which we have reserved to grade your final model. 
For each set, there are two types of files:" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 4, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "output_type": "stream", 170 | "name": "stdout", 171 | "text": "Data downloaded and unzipped to pa3-data...\n\nDirectory Structure:\npa3-data/\n - pa3.rel.train\n - BSBI.dict\n - docs.dict\n - pa3.rel.dev\n - pa3.signal.dev\n - terms.dict\n - pa3.signal.train\n" 172 | } 173 | ], 174 | "source": [ 175 | "import urllib.request\n", 176 | "import zipfile\n", 177 | "\n", 178 | "# Download dataset\n", 179 | "data_dir = 'pa3-data'\n", 180 | "data_url = 'http://web.stanford.edu/class/cs276/pa/{}.zip'.format(data_dir)\n", 181 | "urllib.request.urlretrieve(data_url, '{}.zip'.format(data_dir))\n", 182 | "\n", 183 | "# Unzip dataset\n", 184 | "with zipfile.ZipFile('{}.zip'.format(data_dir), 'r') as zip_fh:\n", 185 | " zip_fh.extractall()\n", 186 | "print('Data downloaded and unzipped to {}...\\n'.format(data_dir))\n", 187 | "\n", 188 | "# Print the directory structure\n", 189 | "print('Directory Structure:')\n", 190 | "print(data_dir + os.path.sep)\n", 191 | "for sub_dir in os.listdir(data_dir):\n", 192 | " if not sub_dir.startswith('.'):\n", 193 | " print(' - ' + sub_dir)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "### Signal File \n", 201 | "**– pa3.signal.(train|dev):** lists queries along with documents returned by a widely used search engine for each individual query (the list of documents is shuffled and is not in the same order as returned by the search engine). Each query has 10 or less documents. For example, the format for a pair of query/document (qd) is as follows." 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 64, 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "output_type": "stream", 211 | "name": "stdout", 212 | "text": "query: stanford aoerc pool hours\n url: http://events.stanford.edu/2014/February/18/\n title: events at stanford tuesday february 18 2014\n header: stanford university event calendar\n header: teaching sex at stanford\n header: rodin the complete stanford collection\n header: stanford rec trx suspension training\n header: memorial church open visiting hours\n header: alternative transportation counseling tm 3 hour stanford univ shc employees retirees family members\n body_hits: stanford 239 271 318 457 615 642 663 960 966 971\n body_hits: aoerc 349 401 432 530 549 578 596\n body_hits: pool 521\n body_length: 981\n pagerank: 1\n url: http://events.stanford.edu/2014/February/6/\n title: events at stanford thursday february 6 2014\n header: stanford university event calendar\n header: stanford woods environmental forum featuring roz naylor\n header: stanford school of earth sciences alumni reception at nape\n header: an evening with stanford alumnus and p\n...\n" 213 | } 214 | ], 215 | "source": [ 216 | "filename = os.path.join(data_dir, \"pa3.signal.train\")\n", 217 | "with open(filename, 'r', encoding = 'utf8') as f:\n", 218 | " print(f.read()[0:1000])\n", 219 | "print(\"...\")" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "This pattern repeats for the next url until all of the urls for this query are done and then the overall pattern repeats for the next query. 
There is only one title, pagerank, and body length for each url but there can be multiple header, body hits and anchor text (and corresponding stanford anchor count) lines.\n", 227 | "\n", 228 | "* The body hits line specifies the term followed by the positional postings list of that term in the document (sorted in increasing order).\n", 229 | "* The body length line states how many terms are present in the body of the document.\n", 230 | "* The stanford anchor count, specified immediately after the anchor text line, states how many anchors there are on the stanford.edu domain with that anchor text. For example, if the anchor text is “stanford math department” and the count is 9, that means there are nine links to the current page (from other pages) where the anchor text is “stanford math department”.\n", 231 | "* The pagerank is an integer from 0 to 9 that signifies a query-independent quality of the page (the higher the PageRank, the better the quality of the page)." 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "### Relevance File\n", 239 | "**– pa3.rel.(train|dev)**: lists the relevance judgments for each of the query-document pairs in the corresponding signal file. The collected relevance data was an integer ranging from −1 to 3 with a higher value indicating that the document is more relevant to that query. We have averaged relevance scores for each query-url pair with −1 ignored. For example, the format of this document is as follows:" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 6, 245 | "metadata": {}, 246 | "outputs": [ 247 | { 248 | "output_type": "stream", 249 | "name": "stdout", 250 | "text": "query: stanford aoerc pool hours\n url: http://events.stanford.edu/2014/February/18/ 0.0\n url: http://events.stanford.edu/2014/February/6/ 0.0\n url: http://events.stanford.edu/2014/March/13/ 0.0\n \n...\n" 251 | } 252 | ], 253 | "source": [ 254 | "filename = os.path.join(data_dir, \"pa3.rel.train\")\n", 255 | "with open(filename, 'r', encoding = 'utf8') as f:\n", 256 | " print(f.read()[0:199])\n", 257 | "print(\"...\")" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "This pattern repeats for the next query until all of the queries in the file are done. The url line can be broken into the document url and the relevance judgment for the query-document pair." 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "The ranking functions also require certain collection-wide statistics (such as inverse document frequency) and we cannot infer this information just from the training set itself. We provide **docs.dict, terms.dict and BSBI.dict** what you generated from PA1, and leave you to calculate idf below." 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "# III. Normalized Discounted Cumulative Gain (NDCG)" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [ 285 | "The evaluation metric used is Normalized Discounted Cumulative Gain (NDCG) since we are using a non-binary relevance metric. Since each query has at most 10 results returned, we use NDCG for the first 10 search results.\n", 286 | "Then, for a particular query q,\n", 287 | "$$NDCG(q) = \\frac{1}{Z} \\sum_{m=1}^{p}\\frac{2^{R(q,m)}-1}{log_{2}(1+m)}$$\n", 288 | "Here, $R(q, m)$ is the relevance judgment given to document $m$ for query $q$. 
$Z$ is a normalization factor. It is the ideal NDCG value. The ideal NDCG (iNDCG) value is calculated by ordering the documents in decreasing order of relevance and calculating the NDCG value with $Z=1$. If iNDCG is zero, $NDCG(q) = 1$. Finally, $p$ is the number of documents that are possible matches for that query.\n", 289 | "\n", 290 | "We can compute the NDCG for a set of queries $Q = \\{q_1,...,q_m\\}$ by taking the average of the NDCGs for each of the individual queries. \n", 291 | "\n", 292 | "The starter code [Section NDCG](#VII.1-NDCG-implementation) contains a Python implementation of NDCG which you can use directly to evaluate your ranking function on the training data. We will be using the same method to evaluate your ranking on grading data." 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": {}, 298 | "source": [ 299 | "# IV. Ranking" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "## IV.1 Term Score\n", 307 | "In the signal files of the training data, each query-document pair provides term information from five different fields: url, title, headers, body and anchors. Additionally each pair provides pagerank but we won’t be using it in cosine similarity. Even for BM25F, we will consider it separately as explained in [BM25F](#VI-Task2:-BM25F-(15%)). Each of the required ranking functions will construct a term score ($tf$) vector for each query-document pair from hits in these different fields. All of our ranking functions only care about terms that occur in the query.\n", 308 | "\n", 309 | "The raw term score vector, $rs$, counts how many times a query term occurs in a field. For the anchor field, we assume that there is one big document that contains all of the anchors with the anchor text multiplied by the anchor count. A similar approach can be followed for the header field as well. Thus, in the $qd$ example whose term-vector is $[\\text{stanford aoerc pool hours}]^T$ (which is shown in above printing of signal file), the $rs$\n", 310 | "vector for the body field will be $[{10 \\ 7 \\ 1 \\ 0}]^T$ as there are 10 hits for the term “stanford” in the body field and 7 hits for the term “aoerc” as well as 1 hits for the term “pool”. Similarly, the $rs$ vector for anchor field will be $[\\text{0 0 0 0}]^T$ as there is no anchor for this document. For another example \n", 311 | "```python\n", 312 | " url: https://cardinalrec.stanford.edu/facilities/aoerc/\n", 313 | " ...\n", 314 | " anchor_text: gyms aoerc\n", 315 | " stanford_anchor_count: 3\n", 316 | " anchor_text: aoerc\n", 317 | " stanford_anchor_count: 13\n", 318 | " anchor_text: http cardinalrec stanford edu facilities aoerc\n", 319 | " stanford_anchor_count: 4\n", 320 | " anchor_text: arrillaga outdoor education and recreation center aoerc link is external\n", 321 | " stanford_anchor_count: 1\n", 322 | " anchor_text: the arrillaga outdoor education and research center aoerc\n", 323 | " stanford_anchor_count: 2\n", 324 | " anchor_text: aoerc will shutdown for maintenance\n", 325 | " stanford_anchor_count: 2\n", 326 | "```\n", 327 | "\n", 328 | "The anchor will be $[\\text{4 25 0 0}]^T$ as there is 4 stanford_anchor_count for term “stanford” and 25 stanford_anchor_count for term “aoerc”.\n", 329 | "\n", 330 | "Finally, the $rs$ vector
\n", 331 | "for the title field is $[\\text{1 0 0 0}]^T$,
\n", 332 | "for the url field is$[\\text{1 0 0 0}]^T$,
\n", 333 | "for the header field is $[\\text{5 0 0 1}]^T$ . \n", 334 | "\n", 335 | "Note that in order to extract url hits, you will have to tokenize the url on non-alphanumeric characters. We've provided the parser code for you.\n", 336 | "\n", 337 | "While calculating the raw term scores, we convert everything to lowercase and then calculate the counts. The body_hits field given in the data do not perform any stemming. However, for the other fields, you are free to experiment with different techniques like stemming etc. You may find [nltk](https://www.nltk.org/) could be useful " 338 | ] 339 | }, 340 | { 341 | "cell_type": "markdown", 342 | "metadata": {}, 343 | "source": [ 344 | "## IV.2 Output Requirements\n", 345 | "In all three tasks, the goal is to derive specific types of ranking functions based on the training data and relevance values. Once the ranking function $rf$ has been crafted, we will then pass in the test data set and your application must use $rf$ to rank the query-document pairs and output the list of documents for each query in decreasing rank order. The NDCG evaluation metric will then be applied on these lists against the evaluation provided by you in the search ratings task earlier in the course. The higher the value, the better your ranking algorithm works." 346 | ] 347 | }, 348 | { 349 | "cell_type": "markdown", 350 | "metadata": {}, 351 | "source": [ 352 | "We predefine Query and Document class for you. You can load training data and construct a query dictionary by load_train_data" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 7, 358 | "metadata": {}, 359 | "outputs": [], 360 | "source": [ 361 | "file_name = os.path.join(data_dir, \"pa3.signal.train\")\n", 362 | "query_dict = load_train_data(file_name)" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": {}, 368 | "source": [ 369 | "```python\n", 370 | "# Mapping of Query-url-Document. 
Query -> (url -> Document)\n", 371 | "query_dict[Query(\"stanford aoerc pool hours\")] # Access a query \n", 372 | "query_dict[Query(\"stanford aoerc pool hours\")]['an url'] # Access a document \n", 373 | "query_dict[Query(\"stanford aoerc pool hours\")]['an url'].body_hits # Access a field of document \n", 374 | "```" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": 8, 380 | "metadata": {}, 381 | "outputs": [ 382 | { 383 | "output_type": "stream", 384 | "name": "stdout", 385 | "text": "document: title: events at stanford tuesday february 18 2014\n headers: ['stanford university event calendar', 'teaching sex at stanford', 'rodin the complete stanford collection', 'stanford rec trx suspension training', 'memorial church open visiting hours', 'alternative transportation counseling tm 3 hour stanford univ shc employees retirees family members']\n body_hits: {'stanford': [239, 271, 318, 457, 615, 642, 663, 960, 966, 971], 'aoerc': [349, 401, 432, 530, 549, 578, 596], 'pool': [521]}\n body_length: 981\n pagerank: 1\n\nurl http://events.stanford.edu/2014/February/18/\nheaders: ['stanford university event calendar', 'teaching sex at stanford', 'rodin the complete stanford collection', 'stanford rec trx suspension training', 'memorial church open visiting hours', 'alternative transportation counseling tm 3 hour stanford univ shc employees retirees family members']\nbody_hits: {'stanford': [239, 271, 318, 457, 615, 642, 663, 960, 966, 971], 'aoerc': [349, 401, 432, 530, 549, 578, 596], 'pool': [521]}\n" 386 | } 387 | ], 388 | "source": [ 389 | "sample_doc = query_dict[Query(\"stanford aoerc pool hours\")]['http://events.stanford.edu/2014/February/18/']\n", 390 | "print(\"document:\", sample_doc)\n", 391 | "print(\"url\", sample_doc.url)\n", 392 | "print(\"headers:\", sample_doc.headers)\n", 393 | "print(\"body_hits:\",sample_doc.body_hits)" 394 | ] 395 | }, 396 | { 397 | "cell_type": "markdown", 398 | "metadata": {}, 399 | "source": [ 400 | "## IV.4 Build Idf Dictionary" 401 | ] 402 | }, 403 | { 404 | "cell_type": "markdown", 405 | "metadata": {}, 406 | "source": [ 407 | "In this section, you will need to build an idf dictionary contain idf of a term, which will be used later." 
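As a quick orientation before the `Idf` class below (which builds this dictionary from the PA1 `postings_dict`), here is a minimal, illustrative sketch of the intended computation; the document-frequency numbers are made up for demonstration and are not part of the assignment data:

```python
import math

# Illustrative sketch only: idf with a guard for out-of-vocabulary terms.
total_docs = 98998                       # size of the PA1 corpus (printed when the class is built below)
df = {"stanford": 25000, "aoerc": 12}    # hypothetical document frequencies

def idf(term):
    # Seen term: log10(N / df_t). Unseen term: fall back to df_t = 1,
    # which gives the largest possible idf, log10(N / 1).
    return math.log10(total_docs / df.get(term, 1))

print(idf("stanford"))   # common term  -> small idf
print(idf("aoerc"))      # rare term    -> large idf
print(idf("gibberish"))  # unseen term  -> log10(total_docs)
```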
408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 21, 413 | "metadata": {}, 414 | "outputs": [ 415 | { 416 | "output_type": "stream", 417 | "name": "stdout", 418 | "text": "Overwriting submission/build_idf.py\n" 419 | } 420 | ], 421 | "source": [ 422 | "%%tee submission/build_idf.py\n", 423 | "import pickle as pkl\n", 424 | "import math\n", 425 | "class Idf:\n", 426 | " \"\"\"Build idf dictionary and return idf of a term, whether in or not in built dictionary.\n", 427 | " Recall from PA1 that postings_dict maps termID to a 3 tuple of \n", 428 | " (start_position_in_index_file, number_of_postings_in_list, length_in_bytes_of_postings_list)\n", 429 | " \n", 430 | " Remember that it's possible for a term to not appear in the collection corpus.\n", 431 | " Thus to guard against such a case, we will apply Laplace add-one smoothing.\n", 432 | " \n", 433 | " Note: We expect you to store the idf as {term: idf} and handle term which is not in posting_list\n", 434 | "\n", 435 | " Hint: For term not in built dictionary, we should return math.log10(total_doc_num / 1.0).\n", 436 | " \"\"\"\n", 437 | " def __init__(self):\n", 438 | " \"\"\"Build an idf dictionary\"\"\"\n", 439 | " try:\n", 440 | " # We provide docs.dict, terms.dict and BSBI.dict what you generated from PA1\n", 441 | " with open(\"pa3-data/docs.dict\", 'rb') as f:\n", 442 | " docs = pkl.load(f)\n", 443 | " self.total_doc_num = len(docs)\n", 444 | " print(\"Total Number of Docs is\", self.total_doc_num)\n", 445 | "\n", 446 | " with open(\"pa3-data/terms.dict\", 'rb') as f:\n", 447 | " terms = pkl.load(f)\n", 448 | " self.total_term_num = len(terms)\n", 449 | " print(\"Total Number of Terms is\", self.total_term_num)\n", 450 | "\n", 451 | " with open('pa3-data/BSBI.dict', 'rb') as f:\n", 452 | " postings_dict, termsID = pkl.load(f)\n", 453 | "\n", 454 | " self.idf = {}\n", 455 | " ### Begin your code\n", 456 | " for t_id, (_, df, _) in postings_dict.items():\n", 457 | " t = terms[t_id]\n", 458 | " idf = math.log10(self.total_doc_num) - math.log10(df)\n", 459 | " self.idf[t] = idf\n", 460 | " ### End your code\n", 461 | " except FileNotFoundError:\n", 462 | " print(\"doc_dict_file / term_dict_file Not Found!\")\n", 463 | "\n", 464 | " def get_idf(self, term = None):\n", 465 | " \"\"\"Return idf of return idf of a term, whether in or not in built dictionary.\n", 466 | " Args:\n", 467 | " term(str) : term to return its idf\n", 468 | " Return(float): \n", 469 | " idf of the term\n", 470 | " \"\"\"\n", 471 | " ### Begin your code\n", 472 | " oov = math.log10(self.total_doc_num) - math.log10(1)\n", 473 | " return self.idf.get(term, oov)\n", 474 | " ### End your code" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": 22, 480 | "metadata": { 481 | "tags": [ 482 | "outputPrepend" 483 | ] 484 | }, 485 | "outputs": [ 486 | { 487 | "output_type": "stream", 488 | "name": "stdout", 489 | "text": "Total Number of Docs is 98998\nTotal Number of Terms is 347071\n" 490 | } 491 | ], 492 | "source": [ 493 | "my_idf = Idf()\n", 494 | "my_idf.get_idf(\"data\")\n", 495 | "assert len(my_idf.idf) == 347071, 'Not matching with expected length of idf.' 
\n", 496 | "assert my_idf.get_idf(\"bilibalabulu\") > 4.9, \"Not handle unseen term or give wrong value\"\n", 497 | "assert my_idf.get_idf(\"data\") < my_idf.get_idf(\"radiology\"), 'idf of rarer terms should be larger than common terms.'\n", 498 | "assert my_idf.get_idf(\"to\") < my_idf.get_idf(\"design\"), 'idf of rarer terms should be larger than common terms.'" 499 | ] 500 | }, 501 | { 502 | "cell_type": "markdown", 503 | "metadata": {}, 504 | "source": [ 505 | "# V Task1: Cosine Similarity (5%)" 506 | ] 507 | }, 508 | { 509 | "cell_type": "markdown", 510 | "metadata": {}, 511 | "source": [ 512 | "The first task is to implement a variant of cosine similarity (with the L1-Norm) as the ranking function. This essentially involves constructing the document vector and the query vector and then taking their dot product. Recall from Figure 6.15 in the textbook that in order to construct the vectors, we need to decide on how we compute a term frequency, a document frequency weighting, and a normalization strategy. Let’s discuss these for both the vectors separately.\n", 513 | "\n", 514 | "Figure is from Pg.128 http://nlp.stanford.edu/IR-book/pdf/06vect.pdf\n", 515 | "\n", 516 | "Note: We will only grade Task 1 on default parameter to check the correctness of your implementation. But it could be helpful to do parameter tuning on it and have a sense of the importance of each field. You will need that in Task 2." 517 | ] 518 | }, 519 | { 520 | "cell_type": "markdown", 521 | "metadata": {}, 522 | "source": [ 523 | "## V.1 Query vector \n", 524 | "\n", 525 | "* Term frequency
\n", 526 | "The raw term frequencies can be computed using the query (should be 1 for most queries but not necessarily true). Again, you can use either the raw frequencies or sublinearly scale them.\n", 527 | "\n", 528 | "\n", 529 | "* Document frequency
\n", 530 | "Each of the terms in qv should be weighted using the idf value for each of the terms in the query. Computing the idf above from the corpus from PA1 to determine how many documents contain the query terms. One issue is that it is possible for a query term t to not appear in the collection corpus and it is not possible to evaluate ${\\text{idf}_t}$. In such a case, we will apply the Laplace add-one smoothing technique learned earlier in the course 3. (This essentially assumes the existence of a hypothetical dummy document that contains all possible terms, and therefore, adds 1 to each numerator and denominator with the idft formula.)\n", 531 | "\n", 532 | "\n", 533 | "* Normalization
\n", 534 | "No normalization is needed for query length because any query length normalization applies to all docs and so is not relevant to ranking.\n", 535 | "\n", 536 | "**Note**: We ask you to implement the b-t-n (boolean-idf-none) scheme for query vector and check the correctness of your scorer based on this default setting. You could select other reasonable scheme to increase the performance.\n" 537 | ] 538 | }, 539 | { 540 | "cell_type": "markdown", 541 | "metadata": {}, 542 | "source": [ 543 | "## V.2 Document vector\n", 544 | "* Term frequency
\n", 545 | "We compute the raw term frequencies for each query term in the different fields using the method described in [Section IV.1](#IV.1-Term-Score) . For each of the fields, we can compute the tf vector, either using the raw scores themselves or by applying sublinear scaling on the raw scores. In sublinear scaling, we have $tf_i = 1 + log(rs_i)$ if $rs_i > 0$ and $0$ otherwise. Thus, the tf vector for the body field for qd will be $[\\text{1+log(10) 1+log(7) 1+log(1) 0}]^T$ . \n", 546 | "More information about sublinear tf scaling is described in Page 126 Section 6.4.1 of the textbook.\n", 547 | "\n", 548 | "\n", 549 | "* Document frequency
\n", 550 | "We will not use any document frequency in the document vector. Instead, it is incorporated in the query vector as described below.\n", 551 | "\n", 552 | "\n", 553 | "* Normalization
\n", 554 | "We cannot use cosine normalization as we do not have access to the contents of the document and, thus, do not know what other terms (and counts of those terms) occur in the body field. As a result, we use length normalization instead. Moreover, since there can be huge discrepancies between the lengths of the different fields, we divide all fields by the same normalization factor, the body length.
Note that some documents have a body length of 0, so you will have to smooth them somehow. A good strategy is to add a value, say 500, to the body length of each document. You can experiment with this value or with other smoothing strategies and report them.\n", 555 | "\n", 556 | "Note: We ask you to implement the n-n-n* (natural-no- some normalization*) scheme for document vector for and check the correctness of your scorer based on this default setting. You could select other reasonable scheme to increase the performance.\n", 557 | "\n", 558 | "**Hint:** The normalizaton of document vector of task 1 and task 2 are different but the tasks could share same term frequency and document frequency" 559 | ] 560 | }, 561 | { 562 | "cell_type": "markdown", 563 | "metadata": {}, 564 | "source": [ 565 | "Note that to fully test the correctness of your scorer, we provide the defaut weight scheme for query vector and doc vector. You should **implement the defaut ones and any other variants that you believe will increase the performance**." 566 | ] 567 | }, 568 | { 569 | "cell_type": "markdown", 570 | "metadata": {}, 571 | "source": [ 572 | "## V.3 Abstract Scorer and Baseline Scorer" 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": 23, 578 | "metadata": {}, 579 | "outputs": [ 580 | { 581 | "output_type": "stream", 582 | "name": "stdout", 583 | "text": "Query q: stanford aoerc pool hours\nDocument d: title: events at stanford tuesday february 18 2014\n headers: ['stanford university event calendar', 'teaching sex at stanford', 'rodin the complete stanford collection', 'stanford rec trx suspension training', 'memorial church open visiting hours', 'alternative transportation counseling tm 3 hour stanford univ shc employees retirees family members']\n body_hits: {'stanford': [239, 271, 318, 457, 615, 642, 663, 960, 966, 971], 'aoerc': [349, 401, 432, 530, 549, 578, 596], 'pool': [521]}\n body_length: 981\n pagerank: 1\n\n" 584 | } 585 | ], 586 | "source": [ 587 | "# We use a sample q and d to help/assert your score implementation \n", 588 | "q = Query(\"stanford aoerc pool hours\")\n", 589 | "d = query_dict[q]['http://events.stanford.edu/2014/February/18/'] # example that has body_hits\n", 590 | "# d = query_dict[q]['https://cardinalrec.stanford.edu/facilities/aoerc/'] # example that has anchors\n", 591 | "print(\"Query q: \", q)\n", 592 | "print(\"Document d: \", d)" 593 | ] 594 | }, 595 | { 596 | "cell_type": "code", 597 | "execution_count": 43, 598 | "metadata": {}, 599 | "outputs": [ 600 | { 601 | "output_type": "stream", 602 | "name": "stdout", 603 | "text": "Overwriting submission/ascore.py\n" 604 | } 605 | ], 606 | "source": [ 607 | "%%tee submission/ascore.py\n", 608 | "import math\n", 609 | "from collections import Counter\n", 610 | "class AScorer:\n", 611 | " \"\"\" An abstract class for a scorer. 
\n", 612 | " Implement query vector and doc vector.\n", 613 | " Needs to be extended by each specific implementation of scorers.\n", 614 | " \"\"\"\n", 615 | " def __init__(self, idf, query_weight_scheme=None, doc_weight_scheme=None): #Modified\n", 616 | " self.idf = idf\n", 617 | " self.TFTYPES = [\"url\",\"title\",\"body_hits\",\"header\",\"anchor\"]\n", 618 | " \n", 619 | " self.default_query_weight_scheme = {\"tf\": 'b', \"df\": 't', \"norm\": None} # boolean, idf, none\n", 620 | " self.default_doc_weight_scheme = {\"tf\": 'n', \"df\": 'n', \"norm\": None} # natural, none\n", 621 | " \n", 622 | " self.query_weight_scheme = query_weight_scheme if query_weight_scheme is not None \\\n", 623 | " else self.default_query_weight_scheme #Modified (added)\n", 624 | " self.doc_weight_scheme = doc_weight_scheme if doc_weight_scheme is not None \\\n", 625 | " else self.default_doc_weight_scheme #Modified (added)\n", 626 | "\n", 627 | " def get_sim_score(self, q, d):\n", 628 | " \"\"\" Score each document for each query.\n", 629 | " Args:\n", 630 | " q (Query): the Query\n", 631 | " d (Document) :the Document\n", 632 | "\n", 633 | " Returns:\n", 634 | " pass now, will be implement in task 1, 2 and 3\n", 635 | " \"\"\" \n", 636 | " raise NotImplementedError\n", 637 | "\n", 638 | " # Include any initialization and/or parsing methods that \n", 639 | " # you may want to perform on the Document fields prior to accumulating counts.\n", 640 | " # See the Document class to see how the various fields are represented\n", 641 | " # We have provided a few parser functions for you. Feel free to change them, and add more if you find its useful\n", 642 | "\n", 643 | " ### Begin your code\n", 644 | " def parse_query(self, query):\n", 645 | " return Counter(query)\n", 646 | " ### End your code\n", 647 | " def parse_url(self, url, token=False):\n", 648 | " \"\"\"Parse document's url. Return Counter of url's tokens\"\"\"\n", 649 | " # token indicate whether we want the raw token or Counter of it\n", 650 | " if url:\n", 651 | " url_token_in_term = url.replace(\"http:\",\".\").replace('/','.').replace('?','.') \\\n", 652 | " .replace('=','.').replace(\"%20\",\".\").replace(\"...\",\".\").replace(\"..\",\".\")\\\n", 653 | " .lower();\n", 654 | " url_token = url_token_in_term.split('.')[1:]\n", 655 | " if token:\n", 656 | " return url_token \n", 657 | " else:\n", 658 | " return Counter(url_token)\n", 659 | " return Counter([])\n", 660 | "\n", 661 | " def parse_title(self, title, token=False):\n", 662 | " \"\"\"Parse document's title. Return Counter of title's tokens\"\"\"\n", 663 | " if title:\n", 664 | " if token:\n", 665 | " return title.split(\" \") \n", 666 | " else:\n", 667 | " return Counter(title.split(\" \"))\n", 668 | " else:\n", 669 | " return Counter([])\n", 670 | "\n", 671 | " def parse_headers(self, headers):\n", 672 | " \"\"\"Parse document's headers. Return Counter of headers' tokens\"\"\"\n", 673 | " headers_token = []\n", 674 | " if headers is not None:\n", 675 | " for header in headers:\n", 676 | " header_token = header.split(\" \")\n", 677 | " headers_token.extend(header_token)\n", 678 | " return Counter(headers_token)\n", 679 | "\n", 680 | " def parse_anchors(self, anchors):\n", 681 | " \"\"\"Parse document's anchors. 
Return Counter of anchors' tokens\"\"\"\n", 682 | " anchor_count_map = Counter({})\n", 683 | " if anchors is not None:\n", 684 | " for anchor in anchors:\n", 685 | " count = anchors[anchor]\n", 686 | " anchor_tokens = anchor.split(\" \")\n", 687 | " for anchor_token in anchor_tokens:\n", 688 | " if(anchor_token in anchor_count_map.keys()):\n", 689 | " anchor_count_map[anchor_token] += count\n", 690 | " else:\n", 691 | " anchor_count_map[anchor_token] = count \n", 692 | " return anchor_count_map\n", 693 | " \n", 694 | " def parse_body_hits(self, body_hits):\n", 695 | " \"\"\"Parse document's body_hits. Return Counter of body_hits' tokens\"\"\"\n", 696 | " body_hits_count_map = Counter({})\n", 697 | " if body_hits is not None:\n", 698 | " for body_hit in body_hits:\n", 699 | " body_hits_count_map[body_hit] = len(body_hits[body_hit])\n", 700 | " return body_hits_count_map\n", 701 | " \n", 702 | " \n", 703 | " def get_query_vector(self, q, query_weight_scheme=None):\n", 704 | "\n", 705 | " \"\"\" Handle the query vector. \n", 706 | " 1. get term freq 2. get doc freq 3. normalization\n", 707 | " Refer to above SMART notificaton and figure\n", 708 | " \n", 709 | " Compute the raw term (and/or sublinearly scaled) frequencies\n", 710 | " Additionally weight each of the terms using the idf value of the term in the query \n", 711 | " (we use the PA1 corpus to determine how many documents contain the query terms \n", 712 | " which is calculated above and stored in self.idf).\n", 713 | " \n", 714 | " Note that no normalization is needed for query length \n", 715 | " because any query length normalization applies to all docs and so is not relevant to ranking.\n", 716 | " \n", 717 | " Args:\n", 718 | " q (Query): Query(\"some query\")\n", 719 | " \n", 720 | " Returns:\n", 721 | " query_vec (dict): the query vector\n", 722 | " \"\"\" \n", 723 | " \n", 724 | " if query_weight_scheme is None:\n", 725 | " query_weight_scheme = self.query_weight_scheme #modified\n", 726 | " \n", 727 | " query_vec = {}\n", 728 | " ### Begin your code\n", 729 | " for k, v in self.parse_query(q).items():\n", 730 | " idf = self.idf.get_idf(k)\n", 731 | " query_vec[k] = (1+math.log10(v))*idf\n", 732 | " ### End your code\n", 733 | " return query_vec\n", 734 | " \n", 735 | " def get_doc_vector(self, q, d, doc_weight_scheme=None):\n", 736 | " \n", 737 | " \"\"\"get term freqs for documents\n", 738 | " You will need to \n", 739 | " 1. Initialize tfs for tf types (as in self.TFTYPES)\n", 740 | " 2. Initialize tfs for query_words\n", 741 | " 3. Tokenize url, title, and headers, anchors, body_hits if exits\n", 742 | " 4. (we've already provided parse functions above)\n", 743 | " 5. 
Loop through query terms increasing relevant tfs\n", 744 | " \n", 745 | " Args:\n", 746 | " q (Query) : Query(\"some query\")\n", 747 | " d (Document) : Query(\"some query\")[\"an url\"]\n", 748 | " \n", 749 | " Returns:\n", 750 | " doc_vec (dict) :A dictionary of doc term frequency:\n", 751 | " tf type -> query_word -> score\n", 752 | " For example: the output of document d\n", 753 | " Should be look like \"{'url': {'stanford': 1, 'aoerc': 0, 'pool': 0, 'hours': 0},\n", 754 | " 'title': {'stanford': 1, 'aoerc': 0, 'pool': 0, 'hours': 0},...\"\"\n", 755 | " \"\"\"\n", 756 | " if doc_weight_scheme is None:\n", 757 | " doc_weight_scheme = self.doc_weight_scheme #modified\n", 758 | " \n", 759 | " doc_vec = {} \n", 760 | " \n", 761 | " ### Begin your code\n", 762 | " zones = {'url': self.parse_url, 'title': self.parse_title, 'headers': self.parse_headers, 'anchors': self.parse_anchors, 'body_hits': self.parse_body_hits}\n", 763 | " for zone in zones.keys():\n", 764 | " if eval('d.{}'.format(zone)):\n", 765 | " q_dict = dict()\n", 766 | " cnt = zones[zone](eval('d.{}'.format(zone)))\n", 767 | " for term in q:\n", 768 | " q_dict[term] = cnt.get(term, 0)\n", 769 | " doc_vec[zone] = q_dict\n", 770 | " ### End your code\n", 771 | " \n", 772 | " # Normalization\n", 773 | " if doc_weight_scheme['norm']:\n", 774 | " norm_func = doc_weight_scheme[\"norm\"]\n", 775 | " doc_vec = norm_func(q, d, doc_vec)\n", 776 | " return doc_vec\n", 777 | " \n", 778 | " \n", 779 | " def normalize_doc_vec(self, q, d, doc_vec):\n", 780 | " \"\"\" Normalize the doc vector\n", 781 | " Task 1 and 2 will use different normlization. You can also try other different normalization methods.\n", 782 | " Args: \n", 783 | " doc_vec (dict) : the doc vector\n", 784 | " q (Query) : the query\n", 785 | " d (Document) : the document\n", 786 | " \"\"\"\n", 787 | " raise NotImplementedError\n", 788 | " \n", 789 | " # For the learning-to-rank ipython notebook, you may choose to define additional function(s)\n", 790 | " # below for various possible kinds of normalization. \n", 791 | " # You will not need to fill this section out for the \"ranking\" notebook. \n", 792 | "\n", 793 | " ### Begin your code\n", 794 | "\n", 795 | " ### End your code \n", 796 | " \n", 797 | " def get_net_score(self, q, query_vec, d, doc_vec):\n", 798 | " \"\"\" calculate net score\n", 799 | " Args:\n", 800 | " q (Query) : the query\n", 801 | " query_vec (dict) : the query vector\n", 802 | " d (Document) : the document\n", 803 | " doc_vec (dict) : the document vector\n", 804 | " Return:\n", 805 | " score (float) : the net score\n", 806 | " \"\"\"\n", 807 | " raise NotImplementedError" 808 | ] 809 | }, 810 | { 811 | "cell_type": "markdown", 812 | "metadata": {}, 813 | "source": [ 814 | "Free free to compare your get_doc_vector result with above instruction and your own understanding. \n", 815 | "We did not use other techniques such as stemming. 
But free to do that yourself to increase the performance" 816 | ] 817 | }, 818 | { 819 | "cell_type": "code", 820 | "execution_count": 44, 821 | "metadata": {}, 822 | "outputs": [ 823 | { 824 | "output_type": "stream", 825 | "name": "stdout", 826 | "text": "{'stanford': 0.14313422813430865, 'aoerc': 4.995626420883029, 'pool': 2.447851715495207, 'hours': 1.2883117872943215}\n" 827 | }, 828 | { 829 | "output_type": "execute_result", 830 | "data": { 831 | "text/plain": "{'url': {'stanford': 1, 'aoerc': 0, 'pool': 0, 'hours': 0},\n 'title': {'stanford': 1, 'aoerc': 0, 'pool': 0, 'hours': 0},\n 'headers': {'stanford': 5, 'aoerc': 0, 'pool': 0, 'hours': 1},\n 'body_hits': {'stanford': 10, 'aoerc': 7, 'pool': 1, 'hours': 0}}" 832 | }, 833 | "metadata": {}, 834 | "execution_count": 44 835 | } 836 | ], 837 | "source": [ 838 | "a_scorer = AScorer(my_idf)\n", 839 | "query_vec = a_scorer.get_query_vector(q, None) \n", 840 | "print(query_vec)\n", 841 | "doc_vec = a_scorer.get_doc_vector(q, d, None) \n", 842 | "doc_vec" 843 | ] 844 | }, 845 | { 846 | "cell_type": "markdown", 847 | "metadata": {}, 848 | "source": [ 849 | "## IV.4 Baseline Score\n", 850 | "\n", 851 | "Here we provide the a baseline score to partially test your implementation of get_query_vector and get_doc_vector." 852 | ] 853 | }, 854 | { 855 | "cell_type": "code", 856 | "execution_count": 40, 857 | "metadata": {}, 858 | "outputs": [ 859 | { 860 | "output_type": "stream", 861 | "name": "stdout", 862 | "text": "Overwriting base_classes/baseline_score.py\n" 863 | } 864 | ], 865 | "source": [ 866 | "%%tee base_classes/baseline_score.py\n", 867 | "class BaselineScorer(AScorer):\n", 868 | " def __init__(self, idf):\n", 869 | " super().__init__(idf)\n", 870 | " \n", 871 | " def get_sim_score(self, q, d):\n", 872 | " q_vec = self.get_query_vector(q)\n", 873 | " d_vec = self.get_doc_vector(q, d)\n", 874 | " score = 0\n", 875 | " if 'body_hits' in d_vec.keys():\n", 876 | " for term in d_vec['body_hits'].keys():\n", 877 | " score += d_vec['body_hits'][term]\n", 878 | " return score" 879 | ] 880 | }, 881 | { 882 | "cell_type": "code", 883 | "execution_count": 41, 884 | "metadata": {}, 885 | "outputs": [ 886 | { 887 | "output_type": "stream", 888 | "name": "stdout", 889 | "text": "query vector: {'stanford': 0.016127938186961028, 'aoerc': 0.562892294675663, 'pool': 0.2758166350071672, 'hours': 0.14516313213020882}\ndoc vector {'url': {'stanford': 1, 'aoerc': 0, 'pool': 0, 'hours': 0}, 'title': {'stanford': 1, 'aoerc': 0, 'pool': 0, 'hours': 0}, 'headers': {'stanford': 5, 'aoerc': 0, 'pool': 0, 'hours': 1}, 'body_hits': {'stanford': 10, 'aoerc': 7, 'pool': 1, 'hours': 0}}\n" 890 | } 891 | ], 892 | "source": [ 893 | "baseline_scorer = BaselineScorer(my_idf)\n", 894 | "print('query vector: ', baseline_scorer.get_query_vector(q))\n", 895 | "print('doc vector', baseline_scorer.get_doc_vector(q, d))\n", 896 | "assert baseline_scorer.get_sim_score(q, d) == 18, \"Similarity scorer using default weight scheme for q and d \\\n", 897 | " does not match with our results\"" 898 | ] 899 | }, 900 | { 901 | "cell_type": "markdown", 902 | "metadata": {}, 903 | "source": [ 904 | "For a document $d$ and query $q$, if $qv_q$ is the query vector and $tf_{d,u}$, $tf_{d,t}$, $tf_{d,b}$, $tf_{d,h}$ and $tf_{d,a}$ are the term score vector for the url, title, body, header and anchor fields, respectively, then the net score is\n", 905 | "$$qv_q \\cdot (c_u \\cdot tf_{d,u} + c_t \\cdot tf_{d,t} + c_b \\cdot tf_{d,b} + c_h \\cdot tf_{d,h} + c_a \\cdot tf_{d,a})$$\n", 906 | 
"\n", 907 | "Here, $c_u$, $c_t$, $c_b$, $c_h$ and $c_a$ are the weights given to url, title, body, header and anchor fields, respectively.\n", 908 | "\n", 909 | "The goal is to determine the weights for all 5 fields (and, thus, the ranking function using cosine similarity) so that the NDCG function is of an optimal value when run on the test set. You will use the training set given to derive the above parameters.\n", 910 | "\n", 911 | "**Hint**: Note that the absolute values of weights won’t matter as they will be the same for all documents, only the relative weights for different fields is important; i.e. you can multiply each weight by a constant and the ranking will remain the same. In order to estimate the relative weights, try to reason the relative importance of the different fields." 912 | ] 913 | }, 914 | { 915 | "cell_type": "code", 916 | "execution_count": 42, 917 | "metadata": {}, 918 | "outputs": [], 919 | "source": [ 920 | "default_params_cosine = {\n", 921 | " \"url_weight\" : 10,\n", 922 | " \"title_weight\": 0.1,\n", 923 | " \"body_hits_weight\" : 0.1,\n", 924 | " \"header_weight\" : 0.1,\n", 925 | " \"anchor_weight\" : 0.1,\n", 926 | " \"smoothing_body_length\" : 800,\n", 927 | "}" 928 | ] 929 | }, 930 | { 931 | "cell_type": "code", 932 | "execution_count": 133, 933 | "metadata": {}, 934 | "outputs": [ 935 | { 936 | "output_type": "stream", 937 | "name": "stdout", 938 | "text": "Overwriting submission/cosine_score.py\n" 939 | } 940 | ], 941 | "source": [ 942 | "%%tee submission/cosine_score.py\n", 943 | "from submission.ascore import * #modified\n", 944 | "class CosineSimilarityScorer(AScorer):\n", 945 | "\n", 946 | " def __init__(self, idf, query_dict, params, query_weight_scheme=None, doc_weight_scheme=None): #Modified\n", 947 | " # query_dict is unnecessary for CosineSimilarityScorer,\n", 948 | " # but it's useful for child class SmallestWindowScorer\n", 949 | " super().__init__(idf, query_weight_scheme=query_weight_scheme, doc_weight_scheme=doc_weight_scheme) #Modified\n", 950 | " self.url_weight = params[\"url_weight\"]\n", 951 | " self.title_weight = params[\"title_weight\"]\n", 952 | " self.body_hits_weight = params[\"body_hits_weight\"]\n", 953 | " self.header_weight = params[\"header_weight\"]\n", 954 | " self.anchor_weight = params[\"anchor_weight\"]\n", 955 | " self.smoothing_body_length = params[\"smoothing_body_length\"]\n", 956 | " \n", 957 | " def get_net_score(self, q, query_vec, d, doc_vec):\n", 958 | " \"\"\" calculate net score\n", 959 | " Args:\n", 960 | " q (Query) : the query\n", 961 | " query_vec (dict) : the query vector\n", 962 | " d (Document) : the document\n", 963 | " doc_vec (dict) : the document vector\n", 964 | " Return:\n", 965 | " score (float) : the net score\n", 966 | " \"\"\"\n", 967 | " ### Begin your code\n", 968 | " zone_param = {'url': self.url_weight, 'title': self.title_weight, 'body_hits': self.body_hits_weight, 'headers': self.header_weight, 'anchors': self.anchor_weight}\n", 969 | " d_vec = dict()\n", 970 | " for zone, cnt in doc_vec.items():\n", 971 | " for k, v in cnt.items():\n", 972 | " d_vec[k] = d_vec.get(k, 0) + zone_param[zone]*v\n", 973 | " score = 0\n", 974 | " for term in q:\n", 975 | " score += query_vec[term] * d_vec[term]\n", 976 | " ### End your code\n", 977 | " return score\n", 978 | " \n", 979 | " \n", 980 | " ## Normalization\n", 981 | " def L1_normalize_doc_vec(self, q, d, doc_vec): \n", 982 | " \"\"\" Normalize the doc vector\n", 983 | " Note that we should give uniform normalization to all fields\n", 984 | " 
as discussed in Session V.2 Document vector - Normalization.\n", 985 | " Args: \n", 986 | " q (Query) : the query\n", 987 | " d (Document) : the document\n", 988 | " doc_vec (dict) : the doc vector\n", 989 | " Return:\n", 990 | " doc_vec (dict) : the doc vector after normalization\n", 991 | " \"\"\"\n", 992 | " ### Begin your code\n", 993 | " for zone in doc_vec.keys():\n", 994 | " for term in doc_vec[zone].keys():\n", 995 | " tf = doc_vec[zone][term]\n", 996 | " doc_vec[zone][term] = (1 + math.log10(tf)) / (d.body_length + self.smoothing_body_length) if tf > 0 else 0\n", 997 | " return doc_vec\n", 998 | " ### End your code \n", 999 | " \n", 1000 | " \n", 1001 | " def get_sim_score(self, q, d):\n", 1002 | " \"\"\" Get the similarity score between a document and a query.\n", 1003 | " Args:\n", 1004 | " q (Query) : the query\n", 1005 | " d (Document) : the document\n", 1006 | " \n", 1007 | " Return: the similarity score of q and d\n", 1008 | " \"\"\"\n", 1009 | " query_vec = self.get_query_vector(q) \n", 1010 | " # Define normalizattion functon here or directly pass in normalize_func as shown in below cell\n", 1011 | " self.doc_weight_scheme['norm'] = self.L1_normalize_doc_vec #modified\n", 1012 | " # Normalization\n", 1013 | " norm_doc_vec = self.get_doc_vector(q, d, self.doc_weight_scheme) #modified\n", 1014 | " return self.get_net_score(q, query_vec, d, norm_doc_vec)" 1015 | ] 1016 | }, 1017 | { 1018 | "cell_type": "code", 1019 | "execution_count": 134, 1020 | "metadata": {}, 1021 | "outputs": [ 1022 | { 1023 | "output_type": "stream", 1024 | "name": "stdout", 1025 | "text": "QUERY Vector: {'stanford': 0.14313422813430865, 'aoerc': 4.995626420883029, 'pool': 2.447851715495207, 'hours': 1.2883117872943215} \n\nunnormalized doc vector {'url': {'stanford': 1, 'aoerc': 0, 'pool': 0, 'hours': 0}, 'title': {'stanford': 1, 'aoerc': 0, 'pool': 0, 'hours': 0}, 'headers': {'stanford': 5, 'aoerc': 0, 'pool': 0, 'hours': 1}, 'body_hits': {'stanford': 10, 'aoerc': 7, 'pool': 1, 'hours': 0}} \n\nscore after normalize doc vector 0.001568758578249973 \n\n" 1026 | } 1027 | ], 1028 | "source": [ 1029 | "cs = CosineSimilarityScorer(my_idf, query_dict, default_params_cosine)\n", 1030 | "\n", 1031 | "query_weight_scheme = {\"tf\": 'b', \"df\": 't', \"norm\": None} \n", 1032 | "doc_weight_scheme = {\"tf\": 'n', \"df\": 'n', \"norm\": cs.L1_normalize_doc_vec}\n", 1033 | "\n", 1034 | "print('QUERY Vector: ', cs.get_query_vector(q, query_weight_scheme), '\\n')\n", 1035 | "\n", 1036 | "print('unnormalized doc vector', cs.get_doc_vector(q, d, None), '\\n')\n", 1037 | "print('score after normalize doc vector', cs.get_sim_score(q, d), '\\n')" 1038 | ] 1039 | }, 1040 | { 1041 | "cell_type": "markdown", 1042 | "metadata": {}, 1043 | "source": [ 1044 | "# VI Task2: BM25F (15%) " 1045 | ] 1046 | }, 1047 | { 1048 | "cell_type": "markdown", 1049 | "metadata": {}, 1050 | "source": [ 1051 | "The second task is to implement the BM25F ranking algorithm. The algorithm is decribed in detail in the lecture slides. Specifically, you should have a look at BM25F related slides of [04/25 lecture](http://web.stanford.edu/class/cs276/19handouts/lecture7-probir-1per.pdf) before reading further. Here, instead of using the term scores from [Section IV.1](##IV.1-Term-Score), we use field-dependent normalized term frequency ($ftf$). 
Thus, for a given term $t$ and field $f \\in \\{url, header, body, title, anchor\\}$ in document $d$, \n", 1052 | "\n", 1053 | "\\begin{equation} \n", 1054 | "ftf_{d,f,t} = \\frac{tf_{d,f,t}}{1 + B_f((\\text{len}_{d,f} / \\text{avlen}_f) - 1)} \n", 1055 | "\\tag{1}\n", 1056 | "\\end{equation}\n", 1057 | "where $tf_{d,f,t}$ is the raw term frequency of $t$ in field $f$ in document $d$, $len_{d,f}$ is the length of $f$ in $d$ and $avlen_f$ is the average field length for $f$. The variables $avlen_{body}$, $avlen_{url}$, $avlen_{title}$, $avlen_{header}$ and $avlen_{anchor}$ can be computed using the training set. $B_f$ is a field-dependent parameter and must be tuned for this task. If $avlen_f$ is zero (should not happen in this dataset), then $ftf_{d,f,t} = 0$.\n", 1058 | "\n", 1059 | "Then, the overall weight for the term $t$ in document $d$ among all fields is \n", 1060 | " \n", 1061 | "\\begin{equation}\n", 1062 | "w_{d,t} = \\sum_{f}W_f \\cdot ftf_{d,f,t}\n", 1063 | "\\tag{2}\n", 1064 | "\\end{equation}\n", 1065 | "Here, $W_f$ is also a field-dependent parameter that determines the relative weights given to each field. This value is similar in theory to the tuning parameters for Task 1. \n", 1066 | "\n", 1067 | "\n", 1068 | "Since, we also have a non-textual feature, in the form of pagerank, we incorporate it into our ranking function using the method described in the BM25 lecture regarding ranking with non-textual features.\n", 1069 | "\n", 1070 | "\n", 1071 | "Therefore, the overall score of document $d$ for query $q$ is then: \n", 1072 | " \n", 1073 | "\\begin{equation}\n", 1074 | "\\sum_{t \\in q} \\frac{w_{d,t}}{K_1 + w_{d,t}}idf_t + \\lambda V_{j}(f)\n", 1075 | "\\tag{3}\n", 1076 | "\\end{equation}\n", 1077 | "where $K_1$ is also a free parameter and $V_{j}$ can be a log/saturation/sigmoid function as mentioned in the slides (you will need to experiment with the other parameter $\\lambda^\\prime$ used by the $V_{j}$ function).\n", 1078 | "\n", 1079 | "Thus, for this task, there are a minimum of 13 parameters to optimize, namely $B_{url}, B_{title}, B_{header}$, $B_{body}, B_{anchor}$, $W_{url}, W_{title}, W_{header}$, $W_{body}, W_{anchor}$, $\\lambda, \\lambda^\\prime$ and $K_1$. Additionaly, you also have to select the $V_{j}$ function appropriately.\n", 1080 | "\n", 1081 | "While in theory, BM25F should give a better NDCG value as it incorporates a lot of more information, this need not necessarily be the case. \n", 1082 | "\n", 1083 | "**Hint**: The weight values obtained in Task1 may be a good starting point for this task. Again note that the weights will depend on the \"importance\" of the fields. 
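To make equations (1)-(3) concrete, the sketch below computes the contribution of a single query term; every field weight, $B_f$, and $K_1$ value in it is a placeholder to be tuned, not a reference setting:

```python
import math

# Illustrative sketch only: BM25F contribution of one query term t in document d.
def ftf(raw_tf, field_len, avg_field_len, B_f):
    # Equation (1): field-dependent length normalization of the raw term frequency.
    if avg_field_len == 0:
        return 0.0
    return raw_tf / (1 + B_f * (field_len / avg_field_len - 1))

def bm25f_term_score(raw_tfs, field_lens, avg_lens, W, B, idf_t, K1):
    # Equation (2): field-weighted term weight w_{d,t}.
    w_dt = sum(W[f] * ftf(raw_tfs[f], field_lens[f], avg_lens[f], B[f])
               for f in raw_tfs)
    # Textual part of equation (3): saturate w_{d,t} and weight by idf.
    return (w_dt / (K1 + w_dt)) * idf_t if (K1 + w_dt) > 0 else 0.0

# Equation (3) then adds the non-textual term lambda * V_j(pagerank),
# where V_j could be, for example, log(1 + pagerank).
```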
Moreover, as mentioned in the slides, log(pagerank) works well in practice but you should try other functions as well and see how they work.\n" 1084 | ] 1085 | }, 1086 | { 1087 | "cell_type": "code", 1088 | "execution_count": 65, 1089 | "metadata": {}, 1090 | "outputs": [], 1091 | "source": [ 1092 | "default_params_bm25f = {\n", 1093 | " \"url_weight\" : 0.1,\n", 1094 | " \"title_weight\": 0.1,\n", 1095 | " \"body_hits_weight\" : 0.1,\n", 1096 | " \"header_weight\" : 0.1,\n", 1097 | " \"anchor_weight\" : 0.1,\n", 1098 | " \"b_url\" : 0.1,\n", 1099 | " \"b_title\" : 0.1,\n", 1100 | " \"b_header\" : 0.1,\n", 1101 | " \"b_body_hits\" : 0.1,\n", 1102 | " \"b_anchor\" : 0.1,\n", 1103 | " \"k1\": 0.1,\n", 1104 | " \"pagerank_lambda\" : 0.1,\n", 1105 | " \"pagerank_lambda_prime\" : 0.1, \n", 1106 | "}" 1107 | ] 1108 | }, 1109 | { 1110 | "cell_type": "code", 1111 | "execution_count": null, 1112 | "metadata": {}, 1113 | "outputs": [], 1114 | "source": [ 1115 | "%%tee submission/params_bm25f.py\n", 1116 | "### Begin your code\n", 1117 | "params_bm25f = {\n", 1118 | " \n", 1119 | "}\n", 1120 | "### End your code" 1121 | ] 1122 | }, 1123 | { 1124 | "cell_type": "code", 1125 | "execution_count": 123, 1126 | "metadata": {}, 1127 | "outputs": [ 1128 | { 1129 | "output_type": "stream", 1130 | "name": "stdout", 1131 | "text": "Overwriting submission/bm25f_score.py\n" 1132 | } 1133 | ], 1134 | "source": [ 1135 | "%%tee submission/bm25f_score.py\n", 1136 | "from submission.ascore import * #modified\n", 1137 | "class BM25FScorer(AScorer):\n", 1138 | "\n", 1139 | " def __init__(self, idf, query_dict, params, query_weight_scheme=None, doc_weight_scheme=None): #modified\n", 1140 | " super().__init__(idf, query_weight_scheme=query_weight_scheme, doc_weight_scheme=doc_weight_scheme) #modified\n", 1141 | " self.query_dict = query_dict\n", 1142 | " \n", 1143 | " self.url_weight = params['url_weight']\n", 1144 | " self.title_weight = params['title_weight']\n", 1145 | " self.body_hits_weight = params['body_hits_weight']\n", 1146 | " self.header_weight = params['header_weight']\n", 1147 | " self.anchor_weight = params['anchor_weight']\n", 1148 | " # bm25 specific weights\n", 1149 | " self.b_url = params['b_url']\n", 1150 | " self.b_title = params['b_title']\n", 1151 | " self.b_header = params['b_header']\n", 1152 | " self.b_body_hits = params['b_body_hits']\n", 1153 | " self.b_anchor = params['b_anchor']\n", 1154 | " self.k1 = params['k1']\n", 1155 | " self.pagerank_lambda = params['pagerank_lambda']\n", 1156 | " self.pagerank_lambda_prime = params['pagerank_lambda_prime']\n", 1157 | "\n", 1158 | " # BM25F data structures feel free to modify these\n", 1159 | " # Document -> field -> length\n", 1160 | " self.length = {}\n", 1161 | " self.avg_length = {}\n", 1162 | " self.pagerank_scores = {}\n", 1163 | " \n", 1164 | " self.calc_avg_length()\n", 1165 | " \n", 1166 | " def calc_avg_length(self):\n", 1167 | " \"\"\" Set up average lengths for BM25F, also handling PageRank. \n", 1168 | " You need to \n", 1169 | " Initialize any data structures needed.\n", 1170 | " Perform any preprocessing you would like to do on the fields.\n", 1171 | " Handle pagerank\n", 1172 | " Accumulate lengths of fields in documents. 
\n", 1173 | " Hint: You could use query_dict\n", 1174 | " \"\"\"\n", 1175 | " ### Begin your code\n", 1176 | " zones = {'url': self.parse_url, 'title': self.parse_title, 'headers': self.parse_headers, 'anchors': self.parse_anchors, 'body_hits': self.parse_body_hits}\n", 1177 | " num_doc = 0\n", 1178 | " for url in self.query_dict.values():\n", 1179 | " for doc in url.values():\n", 1180 | " if doc.url not in self.length:\n", 1181 | " self.length[doc.url] = dict()\n", 1182 | " for zone in zones.keys():\n", 1183 | " if eval('doc.{}'.format(zone)):\n", 1184 | " length = sum( zones[zone](eval('doc.{}'.format(zone))).values() )\n", 1185 | " self.length[doc.url][zone] = length\n", 1186 | " num_doc += 1\n", 1187 | " for z, l in self.length[doc.url].items():\n", 1188 | " self.avg_length[z] = self.avg_length.get(z, 0) + l\n", 1189 | " else:\n", 1190 | " self.length[doc.url][zone] = 0\n", 1191 | " for field in self.avg_length.keys():\n", 1192 | " self.avg_length[field] /= num_doc\n", 1193 | " ### End your code\n", 1194 | " \n", 1195 | " def get_net_score(self, q, query_vec, d, doc_vec):\n", 1196 | " \"\"\" Compute the overall score using above equation\n", 1197 | " Args:\n", 1198 | " q (Query) : the query\n", 1199 | " query_vec (dict) : the query vector\n", 1200 | " d (Document) : the document\n", 1201 | " doc_vec (dict) : the doc vector\n", 1202 | " Return:\n", 1203 | " score (float) : the net score\n", 1204 | " \"\"\"\n", 1205 | " ### Begin your code\n", 1206 | " term = set(q.query_words)\n", 1207 | " zone_param = {'url': self.url_weight, 'title': self.title_weight, 'body_hits': self.body_hits_weight, 'headers': self.header_weight, 'anchor': self.anchor_weight}\n", 1208 | " w_dt = dict()\n", 1209 | " for t in term:\n", 1210 | " w_dt[t] = sum(zone_param[z]*doc_vec[z][t] for z in zone_param.keys() if z in doc_vec)\n", 1211 | "\n", 1212 | " score = sum([w_dt[t] / (self.k1 + w_dt[t]) * query_vec[t] for t in term])\n", 1213 | " pr = d.pagerank + 1 if d.pagerank else 0\n", 1214 | " score += math.log(pr + self.pagerank_lambda_prime) * self.pagerank_lambda\n", 1215 | " ### End your code\n", 1216 | " return score\n", 1217 | " \n", 1218 | " \n", 1219 | " def bm25f_normalize_doc_vec(self, q, d, doc_vec):\n", 1220 | " \"\"\" Normalize the raw term frequencies in fields in document d \n", 1221 | " using above equation (1).\n", 1222 | " Args:\n", 1223 | " q (Query) : the query \n", 1224 | " d (Document) : the document\n", 1225 | " doc_vec (dict) : the doc vector\n", 1226 | " Return: \n", 1227 | " doc_vec (dict) : the doc vector after normalization\n", 1228 | " \"\"\"\n", 1229 | " ### Begin your code\n", 1230 | " zone_param = {'url': self.b_url, 'title': self.b_title, 'body_hits': self.b_body_hits, 'headers': self.b_header, 'anchor': self.b_anchor}\n", 1231 | "\n", 1232 | " for z in zone_param.keys():\n", 1233 | " if z in doc_vec:\n", 1234 | " for t in doc_vec[z].keys():\n", 1235 | " doc_vec[z][t] /= 1+zone_param[z]*(self.length[d.url][z]/self.avg_length[z]-1)\n", 1236 | " return doc_vec\n", 1237 | " ### End your code \n", 1238 | " \n", 1239 | " def get_sim_score(self, q, d):\n", 1240 | " \"\"\" Get the similarity score between a document and a query.\n", 1241 | " Args:\n", 1242 | " d (Document) : the document\n", 1243 | " q (Query) : the query\n", 1244 | " \n", 1245 | " Return:\n", 1246 | " the similarity score\n", 1247 | " \"\"\"\n", 1248 | " query_vec = self.get_query_vector(q)\n", 1249 | " # Define normalizattion functon here or directly pass in normalize_func as shown in below cell\n", 1250 | " 
self.doc_weight_scheme['norm'] = self.bm25f_normalize_doc_vec #modified\n",
1251 | "        norm_doc_vec = self.get_doc_vector(q, d, self.doc_weight_scheme) #modified\n",
1252 | "        # Normalization\n",
1253 | "        return self.get_net_score(q, query_vec, d, norm_doc_vec)"
1254 | ]
1255 | },
1256 | {
1257 | "cell_type": "code",
1258 | "execution_count": 124,
1259 | "metadata": {},
1260 | "outputs": [
1261 | {
1262 | "output_type": "stream",
1263 | "name": "stdout",
1264 | "text": "QUERY Vector: {'stanford': 0.14313422813430865, 'aoerc': 4.995626420883029, 'pool': 2.447851715495207, 'hours': 1.2883117872943215} \n\nunnormalized doc vector {'url': {'stanford': 1, 'aoerc': 0, 'pool': 0, 'hours': 0}, 'title': {'stanford': 1, 'aoerc': 0, 'pool': 0, 'hours': 0}, 'headers': {'stanford': 5, 'aoerc': 0, 'pool': 0, 'hours': 1}, 'body_hits': {'stanford': 10, 'aoerc': 7, 'pool': 1, 'hours': 0}} \n\nscore after normalize doc vector 6.2443860936984255 \n\n"
1265 | }
1266 | ],
1267 | "source": [
1268 | "bm25f_scorer = BM25FScorer(my_idf, query_dict, default_params_bm25f)\n",
1269 | "\n",
1270 | "# You can directly pass in normalize_func here, or define the normalization function as shown in the cell above\n",
1271 | "query_weight_scheme = {\"tf\": 'b', \"df\": 't', \"norm\": None} \n",
1272 | "doc_weight_scheme = {\"tf\": 'n', \"df\": 'n', \"norm\": bm25f_scorer.bm25f_normalize_doc_vec}\n",
1273 | "\n",
1274 | "\n",
1275 | "print('QUERY Vector: ', bm25f_scorer.get_query_vector(q, query_weight_scheme), '\\n')\n",
1276 | "print('unnormalized doc vector', bm25f_scorer.get_doc_vector(q, d, None), '\\n')\n",
1277 | "print('score after normalize doc vector', bm25f_scorer.get_sim_score(q, d), '\\n')"
1278 | ]
1279 | },
1280 | {
1281 | "cell_type": "markdown",
1282 | "metadata": {},
1283 | "source": [
1284 | "# VII Task3: Smallest Window (10%)"
1285 | ]
1286 | },
1287 | {
1288 | "cell_type": "markdown",
1289 | "metadata": {},
1290 | "source": [
1291 | "The final task is to incorporate window sizes into the ranking algorithm from Task 2 (or Task 1 if you prefer). For a given query, the smallest window $w_{q,d}$ is defined to be the smallest sequence of tokens in document $d$ such that all of the terms in the query $q$ are present in that sequence. A window can only be specific to a particular field, and for anchor fields, all of the terms in $q$ must be present within a particular anchor text (i.e., if one term occurs in one anchor text and another term in a different anchor text, then it cannot be considered for a window). If $d$ does not contain any of the query terms or a window cannot be found, then $w_{q,d} = \\infty$. Intuitively, the smaller $w_{q,d}$ is, the more relevant the document should be to the query. Thus, we can also multiply the document score (from Task 1 or Task 2) by a boost based on $w$ such that:"
1292 | ]
1293 | },
1294 | {
1295 | "cell_type": "markdown",
1296 | "metadata": {},
1297 | "source": [
1298 | "* If $w_{q,d} = \\infty$, then the boost is 1. \n",
1299 | "* If $w_{q,d} = |Q|$ where $Q$ are the unique terms in $q$, then we multiply the score by some factor $B$. \n",
1300 | "* For values of $w_{q,d}$ between the query length and infinity, we provide a boost between $B$ and 1. The boost should decrease rapidly with the size of $w_{q,d}$ and can decrease exponentially or as $\\frac{1}{x}$. "
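,
"\n",
"For instance, one boost with these properties (an illustrative 1/x-style decay; the exact form and the value of $B$ are yours to choose and tune) could look like:\n",
"\n",
"```python\n",
"def window_boost(window, num_query_terms, B):\n",
"    \"\"\"Equals B for the tightest possible window, decays toward 1 as the\n",
"    window grows, and is exactly 1 when no window exists (window == inf).\"\"\"\n",
"    if window == float('inf'):\n",
"        return 1.0\n",
"    return 1.0 + (B - 1.0) / (window - num_query_terms + 1)\n",
"```"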
" 1301 | ] 1302 | }, 1303 | { 1304 | "cell_type": "markdown", 1305 | "metadata": {}, 1306 | "source": [ 1307 | "Thus, for this task, there are either 7 or 15 parameters to optimize, depending on whether you decide to modify cosine similarity or BM25F. The choice of function to use when the window size is not the same as the query length is another factor to also consider. " 1308 | ] 1309 | }, 1310 | { 1311 | "cell_type": "code", 1312 | "execution_count": 98, 1313 | "metadata": {}, 1314 | "outputs": [], 1315 | "source": [ 1316 | "# params depends on the scorer you are using\n", 1317 | "default_params_window = {\n", 1318 | " 'B': 1.16,\n", 1319 | " 'url_weight': 0.1,\n", 1320 | " 'title_weight': 0.1,\n", 1321 | " 'body_hits_weight': 0.1,\n", 1322 | " 'header_weight': 0.1,\n", 1323 | " 'anchor_weight': 0.1,\n", 1324 | " 'b_url': 0.1,\n", 1325 | " 'b_title': 0.1,\n", 1326 | " 'b_header': 0.1,\n", 1327 | " 'b_body_hits': 0.1,\n", 1328 | " 'b_anchor': 0.1,\n", 1329 | " 'k1': 0.1,\n", 1330 | " 'pagerank_lambda': 0.1,\n", 1331 | " 'pagerank_lambda_prime': 0.1\n", 1332 | "}" 1333 | ] 1334 | }, 1335 | { 1336 | "cell_type": "code", 1337 | "execution_count": null, 1338 | "metadata": {}, 1339 | "outputs": [], 1340 | "source": [ 1341 | "%%tee submission/params_window.py\n", 1342 | "# params depends on scorer you are using\n", 1343 | "### Begin your code\n", 1344 | "params_window = {\n", 1345 | " \n", 1346 | "}\n", 1347 | "### End your code" 1348 | ] 1349 | }, 1350 | { 1351 | "cell_type": "code", 1352 | "execution_count": 118, 1353 | "metadata": {}, 1354 | "outputs": [ 1355 | { 1356 | "output_type": "stream", 1357 | "name": "stdout", 1358 | "text": "Overwriting submission/smallest_window_score.py\n" 1359 | } 1360 | ], 1361 | "source": [ 1362 | "%%tee submission/smallest_window_score.py\n", 1363 | "from submission.cosine_score import * #modified\n", 1364 | "from submission.bm25f_score import * #modified\n", 1365 | "class SmallestWindowScorer(BM25FScorer): \n", 1366 | " \"\"\"\n", 1367 | " A skeleton for implementing the Smallest Window scorer in Task 3.\n", 1368 | " Note: The class provided in the skeleton code extends BM25Scorer in Task 2. \n", 1369 | " However, you don't necessarily have to use Task 2. 
1370 | "    in which case, you'd probably like to extend CosineSimilarityScorer instead.)\n",
1371 | "    Also, feel free to modify or add helpers inside this class.\n",
1372 | "    \n",
1373 | "    Note: If you plan to use the cosine similarity scorer\n",
1374 | "        - change the parent class to CosineSimilarityScorer \n",
1375 | "        - change the normalization method in get_sim_score \n",
1376 | "    \"\"\"\n",
1377 | "    def __init__(self, idf, query_dict, params, query_weight_scheme=None, doc_weight_scheme=None): #modified\n",
1378 | "        super().__init__(idf, query_dict, params, query_weight_scheme=query_weight_scheme, doc_weight_scheme=doc_weight_scheme) #modified\n",
1379 | "        self.query_dict = query_dict\n",
1380 | "        \n",
1381 | "        # smallest window specific weights\n",
1382 | "        self.B = params[\"B\"]\n",
1383 | "        \n",
1384 | "    # Write helper functions here\n",
1385 | "    ### Begin your code\n",
1386 | "    def get_window_seq(self, term_set, seq):\n",
1387 | "        if not seq:\n",
1388 | "            return float('inf')\n",
1389 | "        i = j = 0\n",
1390 | "        cnt = dict()\n",
1391 | "        window = float('inf')\n",
1392 | "        while j < len(seq):\n",
1393 | "            if seq[j] in term_set:\n",
1394 | "                cnt[seq[j]] = cnt.get(seq[j], 0) + 1\n",
1395 | "            while i <= j and len(cnt) == len(term_set):\n",
1396 | "                if seq[i] in term_set:\n",
1397 | "                    window = min(window, j-i+1)\n",
1398 | "                    cnt[seq[i]] -= 1\n",
1399 | "                    if cnt[seq[i]] == 0: del cnt[seq[i]]\n",
1400 | "                i += 1\n",
1401 | "            j += 1\n",
1402 | "        return window\n",
1403 | "\n",
1404 | "    def get_window_body_hits(self, term_set, body_hits):\n",
1405 | "        window = float('inf')\n",
1406 | "        filtered = []\n",
1407 | "        for t in term_set:\n",
1408 | "            if t in body_hits:\n",
1409 | "                for pos in body_hits[t]:\n",
1410 | "                    filtered.append((pos, t))\n",
1411 | "            else:\n",
1412 | "                return float('inf')\n",
1413 | "        i = j = 0\n",
1414 | "        filtered = sorted(filtered)\n",
1415 | "        cnt = dict()\n",
1416 | "        while j < len(filtered):\n",
1417 | "            j_pos, j_t = filtered[j]\n",
1418 | "            cnt[j_t] = cnt.get(j_t, 0) + 1\n",
1419 | "            while i <= j and len(cnt) == len(term_set):\n",
1420 | "                i_pos, i_t = filtered[i]\n",
1421 | "                window = min(window, j_pos-i_pos+1)\n",
1422 | "                cnt[i_t] -= 1\n",
1423 | "                if cnt[i_t] == 0:\n",
1424 | "                    del cnt[i_t]\n",
1425 | "                i += 1\n",
1426 | "            j += 1\n",
1427 | "        return window\n",
1428 | "\n",
1429 | "    def get_window(self, term_set, d, z):\n",
1430 | "        if z == 'title':\n",
1431 | "            return self.get_window_seq(term_set, d.title)\n",
1432 | "        elif z == 'body_hits':\n",
1433 | "            return self.get_window_body_hits(term_set, d.body_hits) if d.body_hits else float('inf')\n",
1434 | "        elif z == 'headers':\n",
1435 | "            return min([self.get_window_seq(term_set, h) for h in (d.headers or [])], default=float('inf'))\n",
1436 | "        else:\n",
1437 | "            return float('inf')\n",
1438 | "\n",
1439 | "    ### End your code\n",
1440 | "    \n",
1441 | "    def get_boost_score(self, q, d):\n",
1442 | "        \"\"\" Calculate the boost score based on the smallest window size. \"\"\"\n",
1443 | "        ### Begin your code\n",
1444 | "        term_set = set(q.query_words)\n",
1445 | "        window = float('inf')\n",
1446 | "        zones = {'title', 'body_hits', 'headers'}  # fields searched for the smallest window\n",
1447 | "        for z in zones:\n",
1448 | "            window = min(window, self.get_window(term_set, d, z))\n",
1449 | "        return (self.B-1) / (window-len(term_set)+1) + 1\n",
1450 | "        ### End your code\n",
1451 | "    \n",
1452 | "\n",
1453 | "    def get_sim_score(self, q, d):\n",
1454 | "        \"\"\" Get the similarity score between a document and a query.\n",
1455 | "        Args:\n",
1456 | "            d (Document) : the document\n",
1457 | "            q (Query) : the query\n",
1458 | "        \n",
1459 | "        Return:\n",
1460 | "            the raw similarity score times boost\n",
1461 | "        \"\"\"\n",
1462 | "        boost = self.get_boost_score(q, d)\n",
1463 | "        query_vec = self.get_query_vector(q)\n",
1464 | "        # Define the normalization function here, or directly pass in normalize_func as shown in the cell below\n",
1465 | "        # Depends on which parent class you are using \n",
1466 | "        self.doc_weight_scheme['norm'] = self.bm25f_normalize_doc_vec #modified\n",
1467 | "        norm_doc_vec = self.get_doc_vector(q, d, self.doc_weight_scheme) #modified\n",
1468 | "        raw_score = self.get_net_score(q, query_vec, d, norm_doc_vec)\n",
1469 | "        \n",
1470 | "        return boost * raw_score"
1471 | ]
1472 | },
1473 | {
1474 | "cell_type": "code",
1475 | "execution_count": 119,
1476 | "metadata": {},
1477 | "outputs": [
1478 | {
1479 | "output_type": "stream",
1480 | "name": "stdout",
1481 | "text": "QUERY Vector: {'stanford': 0.14313422813430865, 'aoerc': 4.995626420883029, 'pool': 2.447851715495207, 'hours': 1.2883117872943215} \n\nunnormalized doc vector {'url': {'stanford': 1, 'aoerc': 0, 'pool': 0, 'hours': 0}, 'title': {'stanford': 1, 'aoerc': 0, 'pool': 0, 'hours': 0}, 'headers': {'stanford': 5, 'aoerc': 0, 'pool': 0, 'hours': 1}, 'body_hits': {'stanford': 10, 'aoerc': 7, 'pool': 1, 'hours': 0}} \n\nscore after normalize doc vector 6.2443860936984255 \n\n"
1482 | }
1483 | ],
1484 | "source": [
1485 | "smallest_window_scorer = SmallestWindowScorer(my_idf, query_dict, default_params_window)\n",
1486 | "\n",
1487 | "# You can directly define weight_scheme here \n",
1488 | "query_weight_scheme = {\"tf\": 'b', \"df\": 't', \"norm\": None} \n",
1489 | "doc_weight_scheme = {\"tf\": 'n', \"df\": 'n', \"norm\": smallest_window_scorer.bm25f_normalize_doc_vec}\n",
1490 | "\n",
1491 | "print('QUERY Vector: ', smallest_window_scorer.get_query_vector(q, query_weight_scheme), '\\n')\n",
1492 | "print('unnormalized doc vector', smallest_window_scorer.get_doc_vector(q, d, None), '\\n')\n",
1493 | "print('score after normalize doc vector', smallest_window_scorer.get_sim_score(q, d), '\\n')"
1494 | ]
1495 | },
1496 | {
1497 | "cell_type": "markdown",
1498 | "metadata": {},
1499 | "source": [
1500 | "# VIII Rank and Evaluate"
1501 | ]
1502 | },
1503 | {
1504 | "cell_type": "markdown",
1505 | "metadata": {},
1506 | "source": [
1507 | "In the Rank class, you need to construct the ranking results based on the different scores. "
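,
"\n",
"Concretely, for each query this just means sorting its candidate documents by the score your scorer assigns, highest first. A minimal sketch of the idea (the variable names and values are hypothetical):\n",
"\n",
"```python\n",
"# doc_and_scores: {document: score} computed for one query\n",
"doc_and_scores = {'doc_a': 2.7, 'doc_b': 6.1, 'doc_c': 0.4}\n",
"\n",
"# order documents by descending score to obtain the ranking\n",
"ranking = [doc for doc, _ in sorted(doc_and_scores.items(), key=lambda kv: -kv[1])]\n",
"# ranking == ['doc_b', 'doc_a', 'doc_c']\n",
"```"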
" 1508 | ] 1509 | }, 1510 | { 1511 | "cell_type": "code", 1512 | "execution_count": 130, 1513 | "metadata": {}, 1514 | "outputs": [ 1515 | { 1516 | "output_type": "stream", 1517 | "name": "stdout", 1518 | "text": "Overwriting submission/rank.py\n" 1519 | } 1520 | ], 1521 | "source": [ 1522 | "%%tee submission/rank.py\n", 1523 | "from collections import Counter\n", 1524 | "from collections import OrderedDict\n", 1525 | "\n", 1526 | "class Rank:\n", 1527 | " def score(self, query_dict, score_type, idf, params):\n", 1528 | " \n", 1529 | " \"\"\" Call this function to score and rank documents for some queries, \n", 1530 | " with a specified scoring function.\n", 1531 | " Args:\n", 1532 | " query_dict (dict) : Mapping of Query-url-Document.\n", 1533 | " score_type (str) : \"baseline\" \"cosine\" \"bm25f\" \"window\" \"extra\"\n", 1534 | " idf (dict) : term-idf dictionary\n", 1535 | " params(dict) : parames for scorer\n", 1536 | " Return \n", 1537 | " query_rankings (dict) : a mapping of queries to rankings\n", 1538 | " \"\"\"\n", 1539 | " if score_type == \"baseline\": scorer = BaselineScorer(idf)\n", 1540 | " elif score_type == \"cosine\": scorer = CosineSimilarityScorer(idf, query_dict, params)\n", 1541 | " elif score_type == \"bm25f\": scorer = BM25FScorer(idf, query_dict, params)\n", 1542 | " elif score_type == \"window\": scorer = SmallestWindowScorer(idf, query_dict, params)\n", 1543 | " elif score_type == \"extra\": scorer = ExtraCreditScorer(idf, query_dict, params) \n", 1544 | " else: print(\"Wrong score type!\")\n", 1545 | "\n", 1546 | " # loop through urls for query, getting scores\n", 1547 | " query_rankings = {}\n", 1548 | " for query in query_dict.keys():\n", 1549 | " doc_and_scores = {}\n", 1550 | " # rank the urls based on scores\n", 1551 | " ### Begin your code\n", 1552 | " for d in query_dict[query].values():\n", 1553 | " doc_and_scores[d] = scorer.get_sim_score(query, d)\n", 1554 | " query_rankings[query] = [k for k, _ in sorted(doc_and_scores.items(), key=lambda x: -x[1])]\n", 1555 | " ### End your code\n", 1556 | " \n", 1557 | " return query_rankings\n", 1558 | " \n", 1559 | " def rank_with_score(self, input_dict):\n", 1560 | " \n", 1561 | " \"\"\" Call this function to accept dictionary with an ordered ranking of queries. \n", 1562 | " You will need to implement this function for the learning-to-rank ipython notebook. 
\n", 1563 | " Note that this function will likely replicate code from the score function above.\n", 1564 | " Args:\n", 1565 | " input_dict (dict) : Mapping of Query-url-score.\n", 1566 | " Return \n", 1567 | " query_rankings (dict) : An ordered dictionary of Query->url->score (ordering done for each query)\n", 1568 | " \n", 1569 | " \"\"\"\n", 1570 | " # loop through urls for query, getting scores\n", 1571 | " query_rankings = {}\n", 1572 | " for query in input_dict.keys():\n", 1573 | " url_and_scores = {}\n", 1574 | " # sort the urls based on scores\n", 1575 | " ### Begin your code\n", 1576 | " query_rankings[query] = OrderedDict(sorted([(k,v) for k,v in input_dict[query].items()], key=lambda x : -x[1]))\n", 1577 | " ### End your code\n", 1578 | " return query_rankings\n", 1579 | " \n", 1580 | " def write_ranking_to_file(self, query_rankings, ranked_result_file):\n", 1581 | " with open(ranked_result_file, \"w\") as f:\n", 1582 | " for query in query_rankings.keys():\n", 1583 | " f.write(\"query: \"+ query.__str__() + \"\\n\")\n", 1584 | " for res in query_rankings[query]:\n", 1585 | " \n", 1586 | " url_string = \" url: \" + res.url + \"\\n\" + \\\n", 1587 | " \" title: \" + res.title + \"\\n\" +\\\n", 1588 | " \" debug: \" + \"\\n\" \n", 1589 | " \n", 1590 | " f.write(url_string)\n", 1591 | " \n", 1592 | " print(\"Write ranking result to \" + ranked_result_file + \" sucessfully!\") \n", 1593 | " " 1594 | ] 1595 | }, 1596 | { 1597 | "cell_type": "markdown", 1598 | "metadata": {}, 1599 | "source": [ 1600 | "Write your result to file to check your implementation" 1601 | ] 1602 | }, 1603 | { 1604 | "cell_type": "code", 1605 | "execution_count": 131, 1606 | "metadata": {}, 1607 | "outputs": [ 1608 | { 1609 | "output_type": "stream", 1610 | "name": "stdout", 1611 | "text": "Write ranking result to output/ranked_result_bm25f sucessfully!\n" 1612 | } 1613 | ], 1614 | "source": [ 1615 | "# This is an example of how to use ranking class \n", 1616 | "# Ranking \n", 1617 | "r = Rank()\n", 1618 | "query_rankings = r.score(query_dict, 'bm25f', my_idf, default_params_bm25f)\n", 1619 | "ranked_result_file = os.path.join(\"output\", \"ranked_result_bm25f\")\n", 1620 | "r.write_ranking_to_file(query_rankings, ranked_result_file)" 1621 | ] 1622 | }, 1623 | { 1624 | "cell_type": "markdown", 1625 | "metadata": {}, 1626 | "source": [ 1627 | "## VIII.1 NDCG implementation" 1628 | ] 1629 | }, 1630 | { 1631 | "cell_type": "markdown", 1632 | "metadata": {}, 1633 | "source": [ 1634 | "We provide the NDCG implementation for you in 'base_classes/ndcg.py'. You can use them to evaluate your results and do paramater tuning." 1635 | ] 1636 | }, 1637 | { 1638 | "cell_type": "markdown", 1639 | "metadata": {}, 1640 | "source": [ 1641 | "This is an example of how to use the Rank class and NDCG class to evaluate your scorer and ranking function." 
1642 | ]
1643 | },
1644 | {
1645 | "cell_type": "code",
1646 | "execution_count": 135,
1647 | "metadata": {},
1648 | "outputs": [
1649 | {
1650 | "output_type": "stream",
1651 | "name": "stdout",
1652 | "text": "Write ranking result to output/ranked_result_cosine successfully!\nWrite ndcg result to output/ndcg_result_cosine successfully!\n0.8190388127738062\n"
1653 | }
1654 | ],
1655 | "source": [
1656 | "# This is an example of how to use the Rank class and NDCG\n",
1657 | "\n",
1658 | "# Load data and generate query dict\n",
1659 | "signal_file_name = \"pa3.signal.train\"\n",
1660 | "query_dict = load_train_data(os.path.join(data_dir, signal_file_name))\n",
1661 | "\n",
1662 | "# Ranking \n",
1663 | "r = Rank()\n",
1664 | "query_rankings = r.score(query_dict, 'cosine', my_idf, default_params_cosine)\n",
1665 | "ranked_result_file = os.path.join(\"output\", \"ranked_result_cosine\")\n",
1666 | "r.write_ranking_to_file(query_rankings, ranked_result_file)\n",
1667 | "\n",
1668 | "# NDCG\n",
1669 | "ndcg = NDCG()\n",
1670 | "rel_filename = 'pa3.rel.train'\n",
1671 | "rel_file = os.path.join(data_dir, rel_filename)\n",
1672 | "\n",
1673 | "ndcg.get_rel_scores(rel_file)\n",
1674 | "ndcg.read_ranking_calc(ranked_result_file)\n",
1675 | "\n",
1676 | "# You can also write the ndcg result to file\n",
1677 | "ndcg_result_file = os.path.join(\"output\", \"ndcg_result_cosine\") \n",
1678 | "ndcg.write_ndcg_result(ndcg_result_file)\n",
1679 | "\n",
1680 | "# calculate average NDCG\n",
1681 | "avg_ndcg = ndcg.get_avg_ndcg()\n",
1682 | "print(avg_ndcg)"
1683 | ]
1684 | },
1685 | {
1686 | "cell_type": "markdown",
1687 | "metadata": {},
1688 | "source": [
1689 | "Make sure to use NDCG on **both training and development** sets for all tasks to do parameter tuning. \n",
1690 | "\n",
1691 | "\n",
1692 | "\n",
1693 | "Your solution will be evaluated on a hidden test set, and full credit will be given to models that are within 1% of the staff implementation's test-set NDCG. Typically, the higher the NDCG, the better. \n"
1694 | ]
1695 | },
1696 | {
1697 | "cell_type": "markdown",
1698 | "metadata": {},
1699 | "source": [
1700 | "# Report (15%)"
1701 | ]
1702 | },
1703 | {
1704 | "cell_type": "markdown",
1705 | "metadata": {},
1706 | "source": [
1707 | "**1. (1%) Report NDCG on both training and development sets for all tasks.**\n",
1708 | "> *Your answer here*"
1709 | ]
1710 | },
1711 | {
1712 | "cell_type": "markdown",
1713 | "metadata": {},
1714 | "source": [
1715 | "**2. (2%) For the three tasks, you should report all final model parameter values. Describe the intuition when tuning your models, and why those weights work in getting a good score. Were there any particular properties about the documents that allowed a higher weight to be given to one field as opposed to another?**\n",
1716 | "> *Your answer here*"
1717 | ]
1718 | },
1719 | {
1720 | "cell_type": "markdown",
1721 | "metadata": {},
1722 | "source": [
1723 | "**3. (3%) In BM25F, in addition to the weights given to the fields, there are 8 other parameters, $B_{url}$, $B_{title}$, $B_{header}$, $B_{body}$, $B_{anchor}$, λ, λ′ and K1. How do these parameters affect the ranking function?**\n",
1724 | "> *Your answer here*"
1725 | ]
1726 | },
1727 | {
1728 | "cell_type": "markdown",
1729 | "metadata": {},
1730 | "source": [
1731 | "**4. (3%) In Task 1, you may either use raw frequencies or sublinearly scale them to compute term frequency. Please report your choice and the reasons behind it. 
For BM25F, why did you select a particular $V_j$ function?**\n", 1732 | "> *Your answer here*" 1733 | ] 1734 | }, 1735 | { 1736 | "cell_type": "markdown", 1737 | "metadata": {}, 1738 | "source": [ 1739 | "**5.(3%) Briefly describe your design of smallest window. For a function that includes the smallest window as one component, how does varying B and the boost function change the performance of the ranking algorithm?**\n", 1740 | "> *Your answer here*" 1741 | ] 1742 | }, 1743 | { 1744 | "cell_type": "markdown", 1745 | "metadata": {}, 1746 | "source": [ 1747 | "**6.(3%) What other metrics, not used in this assignment, could be used to get a better scoring function from the document? The metrics could either be static (query-independent, e.g. document length) or dynamic (query-dependent, e.g. smallest window).**\n", 1748 | "> *Your answer here*" 1749 | ] 1750 | }, 1751 | { 1752 | "cell_type": "markdown", 1753 | "metadata": {}, 1754 | "source": [ 1755 | "### You are all done in the PA3 part 1. Now, it's time to start part 2 to explore different approaches to learn the parameters for ranking functions using machine learning. " 1756 | ] 1757 | } 1758 | ], 1759 | "metadata": { 1760 | "kernelspec": { 1761 | "display_name": "Python 3.7.3 64-bit ('cs276-pa3': conda)", 1762 | "language": "python", 1763 | "name": "python37364bitcs276pa3conda5d59678e0f834eb89f8c66ff589d5169" 1764 | }, 1765 | "language_info": { 1766 | "codemirror_mode": { 1767 | "name": "ipython", 1768 | "version": 3 1769 | }, 1770 | "file_extension": ".py", 1771 | "mimetype": "text/x-python", 1772 | "name": "python", 1773 | "nbconvert_exporter": "python", 1774 | "pygments_lexer": "ipython3", 1775 | "version": "3.7.3-final" 1776 | }, 1777 | "toc": { 1778 | "base_numbering": 1, 1779 | "nav_menu": { 1780 | "height": "66px", 1781 | "width": "252px" 1782 | }, 1783 | "number_sections": true, 1784 | "sideBar": true, 1785 | "skip_h1_title": false, 1786 | "title_cell": "Table of Contents", 1787 | "title_sidebar": "Contents", 1788 | "toc_cell": false, 1789 | "toc_position": {}, 1790 | "toc_section_display": "block", 1791 | "toc_window_display": false 1792 | } 1793 | }, 1794 | "nbformat": 4, 1795 | "nbformat_minor": 2 1796 | } --------------------------------------------------------------------------------