├── setup.cfg
├── requirements.txt
├── .gitignore
├── convert_msmarco_doc_to_anserini.py
├── convert_msmarco_doc_to_t5_format.py
├── convert_msmarco_passages_doc_to_anserini.py
├── msmarco-v1
    └── augment_corpus.py
├── msmarco-v2
    └── augment_corpus.py
├── convert_msmarco_passage_to_anserini.py
├── LICENSE
└── README.md


/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 120


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | sentencepiece==0.1.95
2 | spacy==2.1.6
3 | tensorflow==2.4.1
4 | transformers>=4.6.0
5 | datasets
6 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .DS_Store
 2 | anserini/
 3 | *~
 4 | 
 5 | # for MS MARCO passage
 6 | collection.tar.gz
 7 | passage-predictions/
 8 | qrels.dev.small.tsv
 9 | queries.dev.small.tsv
10 | collection.tsv
11 | 
12 | lucene-index-msmarco-passage-expanded/
13 | msmarco-passage-expanded/
14 | run.msmarco-passage-expanded.dev.small.txt
15 | 
16 | # for MS MARCO do
17 | doc-predictions/
18 | msmarco-docs.tsv.gz
19 | msmarco_doc_passage_ids.txt
20 | 
21 | msmarco-doc-expanded/
22 | lucene-index-msmarco-doc-expanded/
23 | run.msmarco-doc-expanded.dev.small.txt
24 | 
25 | msmarco-doc-expanded-passage/
26 | lucene-index-msmarco-doc-expanded-passage/
27 | run.msmarco-doc-expanded-passage.dev.small.txt
28 | 


--------------------------------------------------------------------------------
/convert_msmarco_doc_to_anserini.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import gzip
 3 | import json
 4 | import os
 5 | 
 6 | from tqdm import tqdm
 7 | 
 8 | def generate_output_dict(doc, predicted_queries):
 9 |     doc_id, doc_url, doc_title, doc_text = doc[0], doc[1], doc[2], doc[3]
10 |     doc_text = doc_text.strip()
11 |     predicted_queries = ' '.join(predicted_queries)
12 |     expanded_text = f'{doc_url} {doc_title} {doc_text} {predicted_queries}'
13 |     output_dict = {'id': doc_id, 'contents': expanded_text}
14 |     return output_dict
15 | 
16 | parser = argparse.ArgumentParser(
17 |     description='Concatenate MS MARCO original docs with predicted queries')
18 | parser.add_argument('--original_docs_path', required=True, help='MS MARCO .tsv corpus file.')
19 | parser.add_argument('--doc_ids_path', required=True, help='File mapping segments to doc ids.')
20 | parser.add_argument('--predictions_path', required=True, help='File containing predicted queries.')
21 | parser.add_argument('--output_docs_path', required=True, help='Output file in the anserini jsonl format.')
22 | 
23 | args = parser.parse_args()
24 | 
25 | os.makedirs(os.path.dirname(args.output_docs_path), exist_ok=True)
26 | 
27 | f_corpus = gzip.open(args.original_docs_path, mode='rt')
28 | f_out = open(args.output_docs_path, 'w')
29 | 
30 | print('Appending predictions...')
31 | doc_id = None
32 | for doc_id_ref, predicted_queries_partial in tqdm(zip(open(args.doc_ids_path),
33 |                                                       open(args.predictions_path))):
34 |     doc_id_ref = doc_id_ref.strip()
35 |     if doc_id_ref != doc_id:
36 |         if doc_id is not None:
37 |             output_dict = generate_output_dict(doc, predicted_queries)
38 |             f_out.write(json.dumps(output_dict) + '\n')
39 | 
40 |         doc = next(f_corpus).split('\t')
41 |         doc_id = doc[0]
42 |         predicted_queries = []
43 | 
44 |     predicted_queries.append(predicted_queries_partial)
45 | 
46 | output_dict = generate_output_dict(doc, predicted_queries)
47 | f_out.write(json.dumps(output_dict) + '\n')
48 | f_corpus.close()
49 | f_out.close()
50 | print('Done!')
51 | 


--------------------------------------------------------------------------------
/convert_msmarco_doc_to_t5_format.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import re
 3 | import spacy
 4 | from tqdm import tqdm
 5 | 
 6 | 
 7 | def load_corpus(path):
 8 |     print('Loading corpus...')
 9 |     corpus = {}
10 |     for line in tqdm(open(path)):
11 |         doc_id, doc_url, doc_title, doc_text = line.split('\t')
12 |         doc_text = doc_text.strip()
13 |         corpus[doc_id] = (doc_title, doc_text)
14 |     return corpus
15 | 
16 | 
17 | parser = argparse.ArgumentParser(
18 |     description='Create T5-formatted tsv file from MS MARCO Document Ranking '
19 |                 'dataset.')
20 | parser.add_argument('--corpus_path', required=True, default='', help='')
21 | parser.add_argument('--output_passage_texts_path', required=True, default='',
22 |                     help='')
23 | parser.add_argument('--output_passage_doc_ids_path', required=True, default='',
24 |                     help='')
25 | parser.add_argument('--stride', default=5, help='')
26 | parser.add_argument('--max_length', default=10, help='')
27 | 
28 | args = parser.parse_args()
29 | 
30 | nlp = spacy.blank("en")
31 | nlp.add_pipe(nlp.create_pipe("sentencizer"))
32 | 
33 | corpus = load_corpus(path=args.corpus_path)
34 | 
35 | n_passages = 0
36 | n_no_passages = 0
37 | with open(args.output_passage_texts_path, 'w') as fout_passage_texts, \
38 |         open(args.output_passage_doc_ids_path, 'w') as fout_passage_doc_ids:
39 | 
40 |     for doc_id, (doc_title, doc_text) in tqdm(corpus.items(), total=len(corpus)):
41 |         doc = nlp(doc_text[:10000])
42 |         sentences = [sent.string.strip() for sent in doc.sents]
43 |         if not sentences:
44 |             n_no_passages += 1
45 |         for i in range(0, len(sentences), args.stride):
46 |             segment = ' '.join(sentences[i:i + args.max_length])
47 |             segment = doc_title + ' ' + segment
48 | 
49 |             # Remove starting #'s as T5 skips those lines by default.
50 |             segment = re.sub(r'^#*', '', segment)
51 | 
52 |             fout_passage_doc_ids.write(f'{doc_id}\n')
53 |             fout_passage_texts.write(f'{segment}\n')
54 |             n_passages += 1
55 |             if i + args.max_length >= len(sentences):
56 |                 break
57 | 
58 | print(f'Wrote {n_passages} passages from {len(corpus)} docs.')
59 | print(f'There were {n_no_passages} docs without passages.')
60 | 
61 | print('Done!')
62 | 


--------------------------------------------------------------------------------
/convert_msmarco_passages_doc_to_anserini.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Segment the documents and append their url, title, predicted queries to them.
 3 | Then, they are saved into json which can be used for indexing.
 4 | '''
 5 | 
 6 | import argparse
 7 | import gzip
 8 | import json
 9 | import os
10 | import spacy
11 | from tqdm import tqdm
12 | 
13 | 
14 | def create_segments(doc_text, max_length, stride):
15 |     doc_text = doc_text.strip()
16 |     doc = nlp(doc_text[:10000])
17 |     sentences = [sent.string.strip() for sent in doc.sents]
18 |     segments = []
19 |     for i in range(0, len(sentences), stride):
20 |         segment = " ".join(sentences[i:i+max_length])
21 |         segments.append(segment)
22 |         if i + max_length >= len(sentences):
23 |             break
24 |     return segments
25 | 
26 | 
27 | parser = argparse.ArgumentParser(
28 |     description='Concatenate MS MARCO original docs with predicted queries')
29 | parser.add_argument('--original_docs_path', required=True, help='MS MARCO .tsv corpus file.')
30 | parser.add_argument('--doc_ids_path', required=True, help='File mapping segments to doc ids.')
31 | parser.add_argument('--output_docs_path', required=True, help='Output file in the anserini jsonl format.')
32 | parser.add_argument('--predictions_path', default=None, help='File containing predicted queries.')
33 | parser.add_argument('--max_length', default=10)
34 | parser.add_argument('--stride', default=5)
35 | args = parser.parse_args()
36 | 
37 | os.makedirs(os.path.dirname(args.output_docs_path), exist_ok=True)
38 | 
39 | f_corpus = gzip.open(args.original_docs_path, mode='rt')
40 | f_out = open(args.output_docs_path, 'w')
41 | max_length = args.max_length
42 | stride = args.stride
43 | nlp = spacy.blank("en")
44 | nlp.add_pipe(nlp.create_pipe("sentencizer"))
45 | 
46 | print('Spliting documents...')
47 | doc_id_ref = None
48 | 
49 | if args.predictions_path is None:
50 |     doc_ids_queries = zip(open(args.doc_ids_path))
51 | else:
52 |     doc_ids_queries = zip(open(args.doc_ids_path), open(args.predictions_path))
53 | for doc_id_query in tqdm(doc_ids_queries):
54 |     doc_id = doc_id_query[0].strip()
55 |     if doc_id != doc_id_ref:
56 |         f_doc_id, doc_url, doc_title, doc_text = next(f_corpus).split('\t')
57 |         while f_doc_id != doc_id:
58 |             f_doc_id, doc_url, doc_title, doc_text = next(f_corpus).split('\t')
59 |         segments = create_segments(doc_text, args.max_length, args.stride)
60 |         seg_id = 0
61 |     else:
62 |         seg_id += 1
63 |     doc_seg = f'{doc_id}#{seg_id}'
64 |     if seg_id < len(segments):
65 |         segment = segments[seg_id]
66 |         if args.predictions_path is None:
67 |             expanded_text = f'{doc_url} {doc_title} {segment}'
68 |         else:
69 |             predicted_queries_partial = doc_id_query[1]
70 |             expanded_text = f'{doc_url} {doc_title} {segment} {predicted_queries_partial}'
71 |         output_dict = {'id': doc_seg, 'contents': expanded_text}
72 |         f_out.write(json.dumps(output_dict) + '\n')
73 |     doc_id_ref = doc_id
74 | f_corpus.close()
75 | f_out.close()
76 | print('Done!')
77 | 


--------------------------------------------------------------------------------
/msmarco-v1/augment_corpus.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Pyserini: Reproducible IR research with sparse and dense representations
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | # http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | import argparse
18 | from datasets import load_dataset
19 | import os
20 | import json
21 | from tqdm import tqdm
22 | from pyserini.search import SimpleSearcher
23 | 
24 | 
25 | def augment_corpus_with_doc2query_t5(dataset, searcher, f_out, num_queries, text_key="contents"):
26 |     print('Output docs...')
27 |     output = open(f_out, 'w')
28 |     counter = 0
29 |     set_d2q_ids = set()
30 |     for i in tqdm(range(len(dataset))):
31 |         docid = dataset[i]["id"]
32 |         set_d2q_ids.add(docid)
33 |         output_dict = json.loads(searcher.doc(docid).raw())
34 |         if num_queries == -1:
35 |             concatenated_queries = " ".join(dataset[i]["predicted_queries"])
36 |         else:
37 |             concatenated_queries = " ".join(dataset[i]["predicted_queries"][:num_queries])
38 |         output_dict[text_key] = f"{output_dict[text_key]}\n{concatenated_queries}"
39 |         counter += 1
40 |         output.write(json.dumps(output_dict) + '\n')
41 |     counter_no_exp = 0
42 |     for i in tqdm(range(searcher.num_docs)):
43 |         if searcher.doc(i).docid() not in set_d2q_ids:
44 |             output_dict = json.loads(searcher.doc(i).raw())
45 |             counter_no_exp += 1
46 |             output_dict[text_key] = f"{output_dict[text_key]}\n"
47 |             output.write(json.dumps(output_dict) + '\n')
48 |     output.close()
49 |     print(f'{counter + counter_no_exp} lines output. {counter_no_exp} lines with no expansions.')
50 | 
51 | 
52 | if __name__ == '__main__':
53 |     parser = argparse.ArgumentParser(
54 |         description='Concatenate MS MARCO V1 corpus with predicted queries')
55 |     parser.add_argument('--hgf_d2q_dataset', required=True,
56 |                         choices=['castorini/msmarco_v1_passage_doc2query-t5_expansions',
57 |                                  'castorini/msmarco_v1_doc_segmented_doc2query-t5_expansions',
58 |                                  'castorini/msmarco_v1_doc_doc2query-t5_expansions'])
59 |     parser.add_argument('--prebuilt_index', required=True, help='Prebuilt index name')
60 |     parser.add_argument('--output_psg_path', required=True, help='Output file for d2q-t5 augmented corpus.')
61 |     parser.add_argument('--num_queries', default=-1, type=int, help='Number of expansions used.')
62 |     parser.add_argument('--cache_dir', default=".", type=str, help='Path to cache the hgf dataset')
63 |     args = parser.parse_args()
64 | 
65 |     os.makedirs(args.output_psg_path, exist_ok=True)
66 | 
67 |     dataset = load_dataset(args.hgf_d2q_dataset, split="train", cache_dir=args.cache_dir)
68 |     if args.prebuilt_index in ['msmarco-v1-passage', 'msmarco-v1-doc-segmented', 'msmarco-v1-doc']:
69 |         searcher = SimpleSearcher.from_prebuilt_index(args.prebuilt_index)
70 |     else:
71 |         searcher = SimpleSearcher(args.prebuilt_index)
72 |     augment_corpus_with_doc2query_t5(dataset,
73 |                                      searcher,
74 |                                      os.path.join(args.output_psg_path, "docs.jsonl"),
75 |                                      args.num_queries)
76 |     print('Done!')
77 | 


--------------------------------------------------------------------------------
/msmarco-v2/augment_corpus.py:
--------------------------------------------------------------------------------
  1 | #
  2 | # Pyserini: Reproducible IR research with sparse and dense representations
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | # http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | #
 16 | 
 17 | import argparse
 18 | from datasets import load_dataset
 19 | import os
 20 | import json
 21 | from tqdm import tqdm
 22 | from multiprocessing import Pool, Manager
 23 | from pyserini.search import SimpleSearcher
 24 | 
 25 | 
 26 | def augment_corpus_with_doc2query_t5(dataset, expdocid_dict, f_out, start, end, num_queries, text_key="passage"):
 27 |     print('Output docs...')
 28 |     output = open(f_out, 'w')
 29 |     counter = 0
 30 |     for i in tqdm(range(start, end)):
 31 |         docid = dataset[i]["id"]
 32 |         output_dict = docid2doc[docid]
 33 |         expdocid_dict[docid] = 1
 34 |         if num_queries == -1:
 35 |             concatenated_queries = " ".join(dataset[i]["predicted_queries"])
 36 |         else:
 37 |             concatenated_queries = " ".join(dataset[i]["predicted_queries"][:num_queries])
 38 |         output_dict[text_key] = output_dict[text_key].replace("\n", " ")
 39 |         output_dict[text_key] = f"{output_dict[text_key]}\n{concatenated_queries}"
 40 |         counter += 1
 41 |         output.write(json.dumps(output_dict) + '\n')
 42 |     output.close()
 43 |     print(f'{counter} lines output. Done!')
 44 | 
 45 | 
 46 | if __name__ == '__main__':
 47 |     parser = argparse.ArgumentParser(
 48 |         description='Concatenate MS MARCO V2 corpus with predicted queries')
 49 |     parser.add_argument('--hgf_d2q_dataset', required=True,
 50 |                         choices=['castorini/msmarco_v2_passage_doc2query-t5_expansions',
 51 |                                  'castorini/msmarco_v2_doc_segmented_doc2query-t5_expansions',
 52 |                                  'castorini/msmarco_v2_doc_doc2query-t5_expansions'])
 53 |     parser.add_argument('--index_path', required=True, help='Input index path')
 54 |     parser.add_argument('--output_psg_path', required=True, help='Output file for d2q-t5 augmented corpus.')
 55 |     parser.add_argument('--num_workers', default=1, type=int, help='Number of workers used.')
 56 |     parser.add_argument('--num_queries', default=-1, type=int, help='Number of expansions used.')
 57 |     parser.add_argument('--cache_dir', default=".", type=str, help='Path to cache the hgf dataset')
 58 |     parser.add_argument('--task', default="passage", type=str, help='One of passage or document.')
 59 |     args = parser.parse_args()
 60 | 
 61 |     os.makedirs(args.output_psg_path, exist_ok=True)
 62 |     dataset = load_dataset(args.hgf_d2q_dataset, split="train", cache_dir=args.cache_dir)
 63 |     if args.index_path in ['msmarco-v2-passage', 'msmarco-v2-passage-augmented',
 64 |                            'msmarco-v2-doc-segmented', 'msmarco-v2-doc']:
 65 |         searcher = SimpleSearcher.from_prebuilt_index(args.index_path)
 66 |     else:
 67 |         searcher = SimpleSearcher(args.index_path)
 68 |     if searcher.num_docs != len(dataset):
 69 |         print("Total number of expanded queries: {}".format(len(dataset)))
 70 |     print('Total passages loaded: {}'.format(searcher.num_docs))
 71 |     manager = Manager()
 72 |     docid2doc = manager.dict()
 73 |     for i in tqdm(range(searcher.num_docs)):
 74 |         doc = searcher.doc(i)
 75 |         docid2doc[doc.docid()] = json.loads(doc.raw())
 76 |     pool = Pool(args.num_workers)
 77 |     expdocid_dict = manager.dict()
 78 |     for i in range(args.num_workers):
 79 |         f_out = os.path.join(args.output_psg_path, 'dt5q_aug_psg' + str(i) + '.json')
 80 |         print(f_out)
 81 |         start = i * (searcher.num_docs // args.num_workers)
 82 |         end = (i + 1) * (searcher.num_docs // args.num_workers)
 83 |         if i == args.num_workers - 1:
 84 |             end = searcher.num_docs
 85 |         pool.apply_async(augment_corpus_with_doc2query_t5,
 86 |                          args=(dataset, expdocid_dict, f_out, start, end, args.num_queries, args.task, ))
 87 |     pool.close()
 88 |     pool.join()
 89 | 
 90 |     if len(docid2doc) != len(expdocid_dict):
 91 |         f_out = os.path.join(args.output_psg_path, 'dt5q_aug_psg' + str(args.num_workers - 1) + '.json')
 92 |         with open(f_out, 'a') as output:
 93 |             for id in tqdm(docid2doc.keys()):
 94 |                 if id not in expdocid_dict:
 95 |                     print(f"doc {id} not expanded")
 96 |                     output.write(json.dumps(docid2doc[id]) + '\n')
 97 |                     expdocid_dict[id] = 1
 98 |         assert len(docid2doc) == len(expdocid_dict)
 99 |     print('Done!')
100 |     print(f'{searcher.num_docs} documents and {len(dataset)} expanded documents.')
101 | 


--------------------------------------------------------------------------------
/convert_msmarco_passage_to_anserini.py:
--------------------------------------------------------------------------------
  1 | '''Converts MSMARCO's tsv collection to Anserini jsonl files with field configurations.'''
  2 | import argparse
  3 | import json
  4 | import os
  5 | 
  6 | 
  7 | # NLTK English stopwords
  8 | stop_words = {'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there',
  9 |               'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they',
 10 |               'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such', 'into',
 11 |               'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', 'who',
 12 |               'as', 'from', 'him', 'each', 'the', 'themselves', 'until', 'below',
 13 |               'are', 'we', 'these', 'your', 'his', 'through', 'don', 'nor', 'me',
 14 |               'were', 'her', 'more', 'himself', 'this', 'down', 'should', 'our',
 15 |               'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'had', 'she',
 16 |               'all', 'no', 'when', 'at', 'any', 'before', 'them', 'same', 'and',
 17 |               'been', 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then',
 18 |               'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not',
 19 |               'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too',
 20 |               'only', 'myself', 'which', 'those', 'i', 'after', 'few', 'whom', 't',
 21 |               'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it',
 22 |               'how', 'further', 'was', 'here', 'than'}
 23 | 
 24 | 
 25 | # process text by tokenizing and removing stopwords
 26 | def process_text(text):
 27 |     processed = text.lower().replace('.', ' ').replace(',', ' ').replace('?', ' ')
 28 |     return [word for word in processed.split() if word not in stop_words]
 29 | 
 30 | 
 31 | # split new and repeated prediction words
 32 | def split_new_repeated(pred_text, doc_text):
 33 |     pred_repeated = []
 34 |     pred_new = []
 35 | 
 36 |     doc_text_set = set(process_text(doc_text))
 37 |     processed_pred_text = process_text(pred_text)
 38 |     for word in processed_pred_text:
 39 |         if word in doc_text_set:
 40 |             pred_repeated.append(word)
 41 |         else:
 42 |             pred_new.append(word)
 43 | 
 44 |     return pred_new, pred_repeated
 45 | 
 46 | 
 47 | if __name__ == '__main__':
 48 |     parser = argparse.ArgumentParser(
 49 |         description='Converts MSMARCO\'s tsv collection to Anserini jsonl '
 50 |                     'files.')
 51 |     parser.add_argument('--collection_path', required=True, help='MS MARCO .tsv collection file')
 52 |     parser.add_argument('--predictions', required=True, help='File containing predicted queries.')
 53 |     parser.add_argument('--output_folder', required=True, help='output folder')
 54 |     parser.add_argument('--max_docs_per_file', default=1000000, type=int,
 55 |                         help='maximum number of documents in each jsonl file.')
 56 | 
 57 |     # parameters to simulate BM25F via duplicated text
 58 |     parser.add_argument('--original_copies', default=1, type=int, help='number of original text duplicates.')
 59 |     parser.add_argument('--prediction_copies', default=1, type=int, help='number of predicted text duplicates.')
 60 | 
 61 |     # parameters to separate new and repeated prediction text
 62 |     parser.add_argument('--split_predictions', default=False, type=bool,
 63 |                         help='separate predicted text into repeated and new.')
 64 |     parser.add_argument('--repeated_prediction_copies', default=1, type=int,
 65 |                         help='number of repeated predicted text duplicates, must set split_predictions to true.')
 66 |     parser.add_argument('--new_prediction_copies', default=1, type=int,
 67 |                         help='number of new predicted text duplicates, must set split_predictions to true.')
 68 | 
 69 |     args = parser.parse_args()
 70 | 
 71 |     if not os.path.exists(args.output_folder):
 72 |         os.makedirs(args.output_folder)
 73 | 
 74 |     print('Converting collection...')
 75 | 
 76 |     file_index = 0
 77 |     new_words = 0
 78 |     total_words = 0
 79 | 
 80 |     with open(args.collection_path) as f_corpus, open(args.predictions) as f_pred:
 81 |         for i, (line_doc, line_pred) in enumerate(zip(f_corpus, f_pred)):
 82 |             # Write to a new file when the current one reaches maximum capacity.
 83 |             if i % args.max_docs_per_file == 0:
 84 |                 if i > 0:
 85 |                     output_jsonl_file.close()
 86 |                 output_path = os.path.join(args.output_folder, f'docs{file_index:02d}.json')
 87 |                 output_jsonl_file = open(output_path, 'w')
 88 |                 file_index += 1
 89 | 
 90 |             doc_id, doc_text = line_doc.rstrip().split('\t')
 91 |             pred_text = line_pred.rstrip()
 92 | 
 93 |             contents = ''
 94 |             if args.split_predictions:
 95 |                 pred_new, pred_repeated = split_new_repeated(pred_text, doc_text)
 96 |                 new_words += len(pred_new)
 97 |                 total_words += len(pred_new) + len(pred_repeated)
 98 | 
 99 |                 contents += (doc_text + ' ') * args.original_copies
100 |                 contents += (' '.join(pred_repeated) + ' ') * args.repeated_prediction_copies
101 |                 contents += (' '.join(pred_new) + ' ') * args.new_prediction_copies
102 |             else:
103 |                 contents += (doc_text + ' ') * args.original_copies
104 |                 contents += (pred_text + ' ') * args.prediction_copies
105 | 
106 |             output_dict = {'id': doc_id, 'contents': contents}
107 |             output_jsonl_file.write(json.dumps(output_dict) + '\n')
108 | 
109 |             if i % 100000 == 0:
110 |                 print('Converted {} docs in {} files'.format(i, file_index))
111 | 
112 |     if args.split_predictions:
113 |         print(f"Found {100 * new_words/total_words}% new predicted text")
114 | 
115 |     output_jsonl_file.close()
116 |     print('Done!')
117 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Document Expansion by Query Prediction
  2 | 
  3 | The repo describes experiments with docTTTTTquery (sometimes written as docT5query or doc2query-T5), the latest version of the doc2query family of document expansion models.
  4 | The basic idea is to train a model, that when given an input document, generates questions that the document might answer (or more broadly, queries for which the document might be relevant).
  5 | These predicted questions (or queries) are then appended to the original documents, which are then indexed as before.
  6 | The docTTTTTquery model gets its name from the use of T5 as the expansion model.
  7 | 
  8 | The primary advantage of this approach is that expensive neural inference is pushed to _indexing time_, which means that "bag of words" queries against an inverted index built on the augmented document collection are only slightly slower (due to longer documents) &mdash; but the retrieval results are _much_ better.
  9 | Of course, these documents can be further reranked by another neural model in a [multi-stage ranking architecture](https://arxiv.org/abs/1910.14424).
 10 | 
 11 | This technique was introduced in November 2019 on MS MARCO passage ranking task.
 12 | Results on the [leaderboard](https://microsoft.github.io/msmarco/) show that docTTTTTquery is much more effective than doc2query and (almost) as effective as the best non-BERT ranking model, while increasing query latency (time to retrieve 1000 docs per query) only slightly compared to vanilla BM25:
 13 | 
 14 | MS MARCO Passage Ranking Leaderboard (Nov 30th 2019) | Eval MRR@10 | Latency
 15 | :------------------------------------ | :------: | ------:
 16 | [BM25 + BERT](https://github.com/nyu-dl/dl4marco-bert) from [(Nogueira et al., 2019)](https://arxiv.org/abs/1904.08375) | 36.8 | 3500 ms
 17 | FastText + Conv-KNRM (Single) [(Hofstätter et al. SIGIR 2019)](https://github.com/sebastian-hofstaetter/sigir19-neural-ir) (best non-BERT) | 27.7 | -
 18 | docTTTTTquery (this code)             | 27.2 | 64 ms
 19 | DeepCT [(Dai and Callan, 2019)](https://github.com/AdeDZY/DeepCT)              | 23.9 | 55 ms
 20 | doc2query [(Nogueira et al., 2019)](https://github.com/nyu-dl/dl4ir-doc2query)              | 21.8 | 61 ms
 21 | [BM25](https://github.com/castorini/anserini/blob/master/docs/experiments-msmarco-passage.md)  | 18.6  | 55 ms
 22 | 
 23 | For more details, check out our paper:
 24 | 
 25 | + Rodrigo Nogueira and Jimmy Lin.  [From doc2query to docTTTTTquery.](https://cs.uwaterloo.ca/~jimmylin/publications/Nogueira_Lin_2019_docTTTTTquery-v2.pdf)
 26 | 
 27 | Why's the paper so short? Check out [our proposal for micropublications](https://github.com/lintool/guide/blob/master/micropublications.md)!
 28 | 
 29 | ## Quick Links
 30 | 
 31 | + [Data and Trained Models: MS MARCO Passage Ranking Dataset](#Data-and-Trained-Models-MS-MARCO-Passage-Ranking-Dataset)
 32 | + [Reproducing MS MARCO Passage Ranking Results with Anserini](#Reproducing-MS-MARCO-Passage-Ranking-Results-with-Anserini)
 33 | + [Predicting Queries from Passages: T5 Inference with PyTorch](#Predicting-Queries-from-Passages-T5-Inference-with-PyTorch)
 34 | + [Predicting Queries from Passages: T5 Inference with TensorFlow](#Predicting-Queries-from-Passages-T5-Inference-with-TensorFlow)
 35 | + [Learning a New Prediction Model: T5 Training with TensorFlow](#Learning-a-New-Prediction-Model-T5-Training-with-TensorFlow)
 36 | + [Reproducing MS MARCO Document Ranking Results with Anserini](#Reproducing-MS-MARCO-Document-Ranking-Results-with-Anserini)
 37 | + [Predicting Queries from Documents: T5 Inference with TensorFlow](#Predicting-Queries-from-Documents-T5-Inference-with-TensorFlow)
 38 | 
 39 | ## Data and Trained Models: MS MARCO Passage Ranking Dataset
 40 | 
 41 | The basic docTTTTTquery model is trained on the MS MARCO passage ranking dataset.
 42 | We make the following data and models available for download:
 43 | 
 44 | + `doc_query_pairs.train.tsv`: Approximately 500,000 passage-query pairs used to train the model.
 45 | + `queries.dev.small.tsv`: 6,980 queries from the MS MARCO dev set. In this tsv file, the first column is the query id and the second is the query text.
 46 | + `qrels.dev.small.tsv`: 7,437 pairs of query relevant passage ids from the MS MARCO dev set. In this tsv file, the first column is the query id and the third column is the passage id. The other two columns (second and fourth) are not used.
 47 | + `collection.tar.gz`: All passages (8,841,823) in the MS MARCO passage corpus. In this tsv file, the first column is the passage id and the second is the passage text.
 48 | + `predicted_queries_topk_sampling.zip`: 80 predicted queries for each MS MARCO passage, using T5-base and top-_k_ sampling.
 49 | + `run.dev.small.tsv`:  Approximately 6,980,000 pairs of dev set queries and retrieved passages using the passages expanded with docTTTTTquery + BM25. In this tsv file, the first column is the query id, the second column is the passage id, and the third column is the rank of the passage. There are 1000 passages per query in this file.
 50 | + `t5-base.zip`: trained T5 model used for generating the expansions.
 51 | + `t5-large.zip`: larger trained T5 model; we didn't find the output to be any better.
 52 | 
 53 | Download and verify the above files from the below table:
 54 | 
 55 | File | Size | MD5 | Download
 56 | :----|-----:|:----|:-----
 57 | `doc_query_pairs.train.tsv` | 197 MB | `aa673014f93d43837ca4525b9a33422c` | [[Dropbox](https://www.dropbox.com/s/5i64irveqvvegey/doc_query_pairs.train.tsv?dl=1)] [[GitLab](https://git.uwaterloo.ca/jimmylin/doc2query-data/raw/master/T5-passage/doc_query_pairs.train.tsv)]
 58 | `queries.dev.small.tsv` | 283 KB | `4621c583f1089d223db228a4f95a05d1` | [[Dropbox](https://www.dropbox.com/s/hq6xjhswiz60siu/queries.dev.small.tsv?dl=1)] [[GitLab](https://git.uwaterloo.ca/jimmylin/doc2query-data/raw/master/T5-passage/queries.dev.small.tsv)]
 59 | `qrels.dev.small.tsv` | 140 KB| `38a80559a561707ac2ec0f150ecd1e8a` | [[Dropbox](https://www.dropbox.com/s/khsplt2fhqwjs0v/qrels.dev.small.tsv?dl=1)] [[GitLab](https://git.uwaterloo.ca/jimmylin/doc2query-data/raw/master/T5-passage/qrels.dev.small.tsv)]
 60 | `collection.tar.gz` | 987 MB | `87dd01826da3e2ad45447ba5af577628` | [[Dropbox](https://www.dropbox.com/s/lvvpsx0cjk4vemv/collection.tar.gz?dl=1)] [[GitLab](https://git.uwaterloo.ca/jimmylin/doc2query-data/raw/master/T5-passage/collection.tar.gz)]
 61 | `predicted_queries_topk_sampling.zip` | 7.9 GB | `8bb33ac317e76385d5047322db9b9c34` | [[Dropbox](https://www.dropbox.com/s/uzkvv4gpj3a596a/predicted_queries_topk_sampling.zip?dl=1)] [[GitLab](https://git.uwaterloo.ca/jimmylin/doc2query-data/raw/master/T5-passage/predicted_queries_topk_sampling.zip)]
 62 | `run.dev.small.tsv` | 127 MB | `c7a2006ec92f1f25955a314acd9b81b0` | [[Dropbox](https://www.dropbox.com/s/nc1drdkjpxxsngg/run.dev.small.tsv?dl=1)] [[GitLab](https://git.uwaterloo.ca/jimmylin/doc2query-data/raw/master/T5-passage/run.dev.small.tsv)]
 63 | `t5-base.zip` | 357 MB | `881d3ca87c307b3eac05fae855c79014` | [[Dropbox](https://www.dropbox.com/s/q1nye6wfsvf5sen/t5-base.zip?dl=1)] [[GitLab](https://git.uwaterloo.ca/jimmylin/doc2query-data/raw/master/T5-passage/t5-base.zip)]
 64 | `t5-large.zip` | 1.2 GB | `21c7e625210b0ae872679bc36ed92d44` | [[Dropbox](https://www.dropbox.com/s/gzq8r68uk38bmum/t5-large.zip?dl=1)] [[GitLab](https://git.uwaterloo.ca/jimmylin/doc2query-data/raw/master/T5-passage/t5-large.zip)]
 65 | 
 66 | ## Reproducing MS MARCO Passage Ranking Results with Anserini
 67 | 
 68 | We provide instructions on how to reproduce our docTTTTTquery results for the MS MARCO passage ranking task with the [Anserini](https://github.com/castorini/anserini) IR toolkit, using the predicted queries provided above.
 69 | 
 70 | First, install Anserini (see [homepage](https://github.com/castorini/anserini) for more details):
 71 | 
 72 | ```bash
 73 | sudo apt-get install maven
 74 | git clone --recurse-submodules https://github.com/castorini/anserini.git
 75 | cd anserini
 76 | mvn clean package appassembler:assemble
 77 | cd tools/eval && tar xvfz trec_eval.9.0.4.tar.gz && cd trec_eval.9.0.4 && make && cd ../../..
 78 | cd tools/eval/ndeval && make && cd ../../..
 79 | ```
 80 | 
 81 | For the purposes of this of this guide, we'll assume that `anserini` is cloned as a sub-directory of this repo, i.e., `docTTTTTquery/anserini/`.
 82 | 
 83 | Next, download `queries.dev.small.tsv`, `qrels.dev.small.tsv`, `collection.tar.gz`, and `predicted_queries_topk_sampling.zip` using one of the options above.
 84 | The first three files can go into base directory of the repo `docTTTTTquery/`, but put the zip file in a separate sub-directory `docTTTTTquery/passage-predictions`.
 85 | The zip file contains a lot of individual files, so this will keep your directory structure manageable.
 86 | 
 87 | Before appending the predicted queries to the passages, we need to concatenate them.
 88 | The commands below create a file that contains 40 concatenated predictions per line and 8,841,823 lines, one for each passage in the corpus.
 89 | We concatenate only the first 40 predictions as there is only a tiny gain in MRR@10 when using all 80 predictions (nevertheless, we provide 80 predictions in case researchers want to use this data for other purposes).
 90 | 
 91 | ```bash
 92 | cd passage-predictions
 93 | 
 94 | unzip predicted_queries_topk_sampling.zip
 95 | 
 96 | for i in $(seq -f "%03g" 0 17); do
 97 |     echo "Processing chunk $i"
 98 |     paste -d" " predicted_queries_topk_sample0[0-3]?.txt${i}-1004000 \
 99 |     > predicted_queries_topk.txt${i}-1004000
100 | done
101 | 
102 | cat predicted_queries_topk.txt???-1004000 > predicted_queries_topk.txt-1004000
103 | ```
104 | 
105 | As a sanity check:
106 | 
107 | ```bash
108 | $ wc predicted_queries_topk.txt-1004000
109 |  8841823 2253863941 12517353325 predicted_queries_topk.txt-1004000
110 | ```
111 | 
112 | Go back to your repo base directory `docTTTTTquery/`.
113 | We can now append the predicted queries to the original MS MARCO passage collection:
114 | 
115 | ```bash
116 | tar xvf collection.tar.gz
117 | 
118 | python convert_msmarco_passage_to_anserini.py \
119 |   --collection_path=collection.tsv \
120 |   --predictions=passage-predictions/predicted_queries_topk.txt-1004000 \
121 |   --output_folder=msmarco-passage-expanded
122 | ```
123 | 
124 | Now, create an index using Anserini on the expanded passages (we're assuming Anserini is cloned as a sub-directory):
125 | 
126 | ```bash
127 | sh anserini/target/appassembler/bin/IndexCollection \
128 |   -collection JsonCollection -generator DefaultLuceneDocumentGenerator \
129 |   -threads 9 -input msmarco-passage-expanded -index lucene-index-msmarco-passage-expanded
130 | ```
131 | 
132 | Once the expanded passages are indexed, we can retrieve 1000 passages per query for the MS MARCO dev set:
133 | 
134 | ```bash
135 | sh anserini/target/appassembler/bin/SearchMsmarco \
136 |   -index lucene-index-msmarco-passage-expanded -queries queries.dev.small.tsv \
137 |   -output run.msmarco-passage-expanded.dev.small.txt -hits 1000 -threads 8
138 | ```
139 | 
140 | Finally, we evaluate the results using the MS MARCO eval script:
141 | 
142 | ```bash
143 | python anserini/tools/eval/msmarco_eval.py qrels.dev.small.tsv run.msmarco-passage-expanded.dev.small.txt
144 | ```
145 | 
146 | The results should be:
147 | 
148 | ```
149 | #####################
150 | MRR @10: 0.27680089370991834
151 | QueriesRanked: 6980
152 | #####################
153 | ```
154 | 
155 | Voilà!
156 | 
157 | ## Predicting Queries from Passages: T5 Inference with PyTorch
158 | 
159 | We will use the excellent [🤗 Transformers library](https://github.com/huggingface/transformers) by Hugging Face to sample queries from our T5 model.
160 | 
161 | First, install the library:
162 | 
163 | ```bash
164 | pip install transformers
165 | ```
166 | 
167 | Then load the model checkpoint:
168 | 
169 | ```python
170 | import torch
171 | from transformers import T5Tokenizer, T5ForConditionalGeneration
172 | 
173 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
174 | 
175 | tokenizer = T5Tokenizer.from_pretrained('castorini/doc2query-t5-base-msmarco')
176 | model = T5ForConditionalGeneration.from_pretrained('castorini/doc2query-t5-base-msmarco')
177 | model.to(device)
178 | ```
179 | 
180 | Sample 3 questions from a example document:
181 | ```python
182 | doc_text = 'The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.'
183 | 
184 | input_ids = tokenizer.encode(doc_text, return_tensors='pt').to(device)
185 | outputs = model.generate(
186 |     input_ids=input_ids,
187 |     max_length=64,
188 |     do_sample=True,
189 |     top_k=10,
190 |     num_return_sequences=3)
191 | 
192 | for i in range(3):
193 |     print(f'sample {i + 1}: {tokenizer.decode(outputs[i], skip_special_tokens=True)}')
194 | ```
195 | 
196 | The output should be similar to this:
197 | ```
198 | sample 1: why was the manhattan project successful
199 | sample 2: the manhattan project what it means
200 | sample 3: what was the most important aspect of the manhattan project
201 | ```
202 | 
203 | For more information on how to use T5 with HuggingFace's transformers library, [check their documentation](https://huggingface.co/transformers/model_doc/t5.html).
204 | 
205 | ## Predicting Queries from Passages: T5 Inference with TensorFlow
206 | 
207 | Next, we provide instructions on how to use our trained T5 models to predict queries for each of the 8.8M documents in the MS MARCO corpus.
208 | To speed up inference, we will use TPUs (and consequently Google Cloud machines), so this installation must be performed on a Google Cloud instance.
209 | 
210 | To begin, install T5 (check the [original T5 repository](https://github.com/google-research/text-to-text-transfer-transformer) for the latest installation instructions):
211 | 
212 | ```bash
213 | pip install t5[gcp]
214 | ```
215 | 
216 | We first need to prepare an input file that contains one passage text per line. We achieve this by extracting the second column of `collection.tsv`:
217 | 
218 | ```bash
219 | cut -f1 collection.tsv > input_docs.txt
220 | ```
221 | 
222 | We also need to split the file into smaller files (each with 1M lines) to avoid TensorFlow complaining that proto arrays can only be 2GB at the most:
223 | 
224 | ```bash
225 | split --suffix-length 2 --numeric-suffixes --lines 1000000 input_docs.txt input_docs.txt
226 | ```
227 | 
228 | We now upload the input docs to Google Cloud Storage:
229 | 
230 | ```bash
231 | gsutil cp input_docs.txt?? gs://your_bucket/data/
232 | ```
233 | 
234 | We also need to upload our trained t5-base model to GCS:
235 | 
236 | ```bash
237 | wget https://git.uwaterloo.ca/jimmylin/doc2query-data/raw/master/T5-passage/t5-base.zip
238 | unzip t5-base.zip
239 | gsutil cp model.ckpt-1004000* gs://your_bucket/models/
240 | ```
241 | 
242 | We are now ready to predict queries from passages. Remember to replace `your_tpu`, `your_tpu_zone`, `your_project_id` and `your_bucket` with your values. Note that the command below will only sample one query per passage. If you want multiple samples, you will need to repeat this process multiple times (remember to replace `output_filename` with a new filename for each sample).
243 | 
244 | ```bash
245 | for ITER in {00..08}; do
246 |     t5_mesh_transformer \
247 |       --tpu="your_tpu" \
248 |       --gcp_project="your_project_id" \
249 |       --tpu_zone="your_tpu_zone" \
250 |       --model_dir="gs://your_bucket/models/" \
251 |       --gin_file="gs://t5-data/pretrained_models/base/operative_config.gin" \
252 |       --gin_file="infer.gin" \
253 |       --gin_file="sample_decode.gin" \
254 |       --gin_param="infer_checkpoint_step = 1004000" \
255 |       --gin_param="utils.run.sequence_length = {'inputs': 512, 'targets': 64}" \
256 |       --gin_param="Bitransformer.decode.max_decode_length = 64" \
257 |       --gin_param="input_filename = 'gs://your_bucket/data/input_docs.txt$ITER'" \
258 |       --gin_param="output_filename = 'gs://your_bucket/data/predicted_queries_topk_sample.txt$ITER'" \
259 |       --gin_param="tokens_per_batch = 131072" \
260 |       --gin_param="Bitransformer.decode.temperature = 1.0" \
261 |       --gin_param="Unitransformer.sample_autoregressive.sampling_keep_top_k = 10"
262 | done
263 | ```
264 | It should take approximately 8 hours to sample one query for each of the 8.8M passages, costing ~$20 USD (8 hours at $2.40 USD/hour) on a preemptible TPU.
265 | 
266 | ## Learning a New Prediction Model: T5 Training with TensorFlow
267 | 
268 | Finally, we show how to learn a new prediction model.
269 | The following command will train a T5-base model for 4k iterations to predict queries from passages.
270 | We assume you put the tsv training file in `gs://your_bucket/data/doc_query_pairs.train.tsv` (download from above).
271 | Also, change `your_tpu_name`, `your_tpu_zone`, `your_project_id`, and `your_bucket` accordingly.
272 | 
273 | ```bash
274 | t5_mesh_transformer  \
275 |   --tpu="your_tpu_name" \
276 |   --gcp_project="your_project_id" \
277 |   --tpu_zone="your_tpu_zone" \
278 |   --model_dir="gs://your_bucket/models/" \
279 |   --gin_param="init_checkpoint = 'gs://t5-data/pretrained_models/base/model.ckpt-999900'" \
280 |   --gin_file="dataset.gin" \
281 |   --gin_file="models/bi_v1.gin" \
282 |   --gin_file="gs://t5-data/pretrained_models/base/operative_config.gin" \
283 |   --gin_param="utils.run.train_dataset_fn = @t5.models.mesh_transformer.tsv_dataset_fn" \
284 |   --gin_param="tsv_dataset_fn.filename = 'gs://your_bucket/data/doc_query_pairs.train.tsv'" \
285 |   --gin_file="learning_rate_schedules/constant_0_001.gin" \
286 |   --gin_param="run.train_steps = 1004000" \
287 |   --gin_param="tokens_per_batch = 131072" \
288 |   --gin_param="utils.tpu_mesh_shape.tpu_topology ='v3-8'
289 | ```
290 | 
291 | ## Reproducing MS MARCO Document Ranking Results with Anserini
292 | 
293 | Here we detail how to reproduce docTTTTTquery runs for the MS MARCO _document_ ranking task.
294 | The MS MARCO document ranking task is similar to the MS MARCO passage ranking task, but the corpus contains longer documents, which need to be split into shorter passages before being fed to docTTTTTquery.
295 | 
296 | Like the instructions for MS MARCO passage ranking task, we explain the process in reverse order (i.e., indexing, expansion, query prediction), since we believe there are more users interested in experimenting with the expanded index than expanding the document themselves.
297 | 
298 | Here are the relevant files to download:
299 | 
300 | File | Size | MD5 | Download
301 | :----|-----:|:----|:-----
302 | `msmarco-docs.tsv.gz` | 7.9 GB | `103b19e21ad324d8a5f1ab562425c0b4` | [[Dropbox](https://www.dropbox.com/s/t7r324wchnf98pm/msmarco-docs.tsv.gz?dl=1)] [[GitLab](https://git.uwaterloo.ca/jimmylin/doc2query-data/raw/master/T5-doc/msmarco-docs.tsv.gz)]
303 | `predicted_queries_doc.tar.gz` | 2.2 GB | `4967214dfffbd33722837533c838143d` | [[Dropbox](https://www.dropbox.com/s/s4vwuampddu7677/predicted_queries_doc.tar.gz?dl=1)] [[GitLab](https://git.uwaterloo.ca/jimmylin/doc2query-data/raw/master/T5-doc/predicted_queries_doc.tar.gz)]
304 | `msmarco_doc_passage_ids.txt` | 170 MB | `82c00bebab0d98c1dc07d78fac3d8b8d` | [[Dropbox](https://www.dropbox.com/s/wi6i2hzkcmbmusq/msmarco_doc_passage_ids.txt?dl=1)] [[GitLab](https://git.uwaterloo.ca/jimmylin/doc2query-data/raw/master/T5-doc/msmarco_doc_passage_ids.txt)]
305 | 
306 | ### Per-Document Expansion
307 | 
308 | The most straightforward way to use docTTTTTquery is to append the expanded queries to _each_ document.
309 | First, download the original corpus (`msmarco-docs.tsv.gz`), the predicted queries (`predicted_queries_doc.tar.gz`), and a file mapping passages to their document ids (`msmarco_doc_passage_ids.txt`), using one of the options above.
310 | Put `predicted_queries_doc.tar.gz` in a sub-directory `doc-predictions/`.
311 | 
312 | Merge the predicted queries into a single file; there are 10 predicted queries per document.
313 | This can be accomplished as follows:
314 | 
315 | ```bash
316 | cd doc-predictions/
317 | 
318 | tar xvfz predicted_queries_doc.tar.gz
319 | 
320 | for i in $(seq -f "%03g" 0 9); do
321 |     cat predicted_queries_doc_sample${i}.txt???-1004000 > predicted_queries_doc_sample${i}_all.txt
322 | done
323 | 
324 | paste -d" " \
325 |   predicted_queries_doc_sample000_all.txt \
326 |   predicted_queries_doc_sample001_all.txt \
327 |   predicted_queries_doc_sample002_all.txt \
328 |   predicted_queries_doc_sample003_all.txt \
329 |   predicted_queries_doc_sample004_all.txt \
330 |   predicted_queries_doc_sample005_all.txt \
331 |   predicted_queries_doc_sample006_all.txt \
332 |   predicted_queries_doc_sample007_all.txt \
333 |   predicted_queries_doc_sample008_all.txt \
334 |   predicted_queries_doc_sample009_all.txt \
335 |    > predicted_queries_doc_sample_all.txt
336 | ```
337 | 
338 | Sanity check:
339 | 
340 | ```bash
341 | $ md5sum predicted_queries_doc_sample_all.txt 
342 | b01b2fbbb8d382684a80fbf51efbca93  predicted_queries_doc_sample_all.txt
343 | $ wc predicted_queries_doc_sample_all.txt 
344 |   20545677 1379262573 7672087649 predicted_queries_doc_sample_all.txt
345 | ```
346 | 
347 | We now append the queries to the original documents (this step takes approximately 10 minutes, the counter needs to get to 20545677):
348 | 
349 | ```bash
350 | python convert_msmarco_doc_to_anserini.py \
351 |   --original_docs_path=msmarco-docs.tsv.gz \
352 |   --doc_ids_path=msmarco_doc_passage_ids.txt \
353 |   --predictions_path=doc-predictions/predicted_queries_doc_sample_all.txt \
354 |   --output_docs_path=msmarco-doc-expanded/docs.json
355 | ```
356 | 
357 | Once we have the expanded documents (about 29 GB in size), the next step is to build an index with Anserini.
358 | As above, we'll assume that Anserini is cloned as a sub-directory of this repo, i.e., `docTTTTTquery/anserini/`.
359 | This step takes approximately 40 minutes:
360 | 
361 | ```bash
362 | sh anserini/target/appassembler/bin/IndexCollection -collection JsonCollection \
363 |   -generator DefaultLuceneDocumentGenerator -threads 1 \
364 |   -input msmarco-doc-expanded -index lucene-index-msmarco-doc-expanded
365 | ```
366 | 
367 | We can then retrieve the documents using the dev queries (this step takes approximately 10 minutes):
368 | 
369 | ```
370 | sh anserini/target/appassembler/bin/SearchCollection \
371 |   -index lucene-index-msmarco-doc-expanded \
372 |   -topicreader TsvString -topics anserini/src/main/resources/topics-and-qrels/topics.msmarco-doc.dev.txt \
373 |   -output run.msmarco-doc-expanded.dev.small.txt -bm25
374 | ```
375 | 
376 | And evaluate using `trec_eval` tool:
377 | 
378 | ```bash
379 | anserini/tools/eval/trec_eval.9.0.4/trec_eval -m map -m recall.1000 \
380 |   anserini/src/main/resources/topics-and-qrels/qrels.msmarco-doc.dev.txt run.msmarco-doc-expanded.dev.small.txt
381 | ```
382 | 
383 | The output should be:
384 | 
385 | ```
386 | map                   	all	0.2886
387 | recall_1000           	all	0.9259
388 | ```
389 | 
390 | In comparison, indexing with the original documents gives:
391 | 
392 | ```
393 | map                     all     0.2310
394 | recall_1000             all     0.8856
395 | ```
396 | 
397 | ### Per-Passage Expansion
398 | 
399 | Although per-document expansion is the most straightforward way to use docTTTTTquery, we have found that _per passage_ expansion works even better.
400 | In this approach, we split the documents into passages and append the expanded queries to _each_ passage.
401 | We then index the passages of this expanded corpus.
402 | 
403 | We will reuse the file `predicted_queries_doc_sample_all.txt` that contains all the predicted queries from last section.
404 | To start, append the queries to the passages:
405 | 
406 | ```
407 | python convert_msmarco_passages_doc_to_anserini.py \
408 |   --original_docs_path=msmarco-docs.tsv.gz \
409 |   --doc_ids_path=msmarco_doc_passage_ids.txt \
410 |   --predictions_path=doc-predictions/predicted_queries_doc_sample_all.txt \
411 |   --output_docs_path=msmarco-doc-expanded-passage/docs.json
412 | ```
413 | 
414 | This step takes several hours (the counter needs to get to 20545677).
415 | Upon completion, index the passages with Anserini:
416 | 
417 | ```bash
418 | sh anserini/target/appassembler/bin/IndexCollection -collection JsonCollection \
419 |   -generator DefaultLuceneDocumentGenerator -threads 1 \
420 |   -input msmarco-doc-expanded-passage -index lucene-index-msmarco-doc-expanded-passage
421 | ```
422 | 
423 | Then, we can retrieve the top 1k passages with dev queries:
424 | 
425 | ```
426 | sh anserini/target/appassembler/bin/SearchCollection \
427 |   -index lucene-index-msmarco-doc-expanded-passage \
428 |   -topicreader TsvString -topics anserini/src/main/resources/topics-and-qrels/topics.msmarco-doc.dev.txt \
429 |   -output run.msmarco-doc-expanded-passage.dev.small.txt \
430 |   -bm25 -hits 10000 -selectMaxPassage -selectMaxPassage.delimiter "#" -selectMaxPassage.hits 1000
431 | ```
432 | 
433 | In a bit more detail, we retrieve the top 10k passages per query, but then use Anserini's `-selectMaxPassage` option to select only the best (highest-scoring) passage from each document, finally returning top 1k docid per query.
434 | 
435 | Evaluation:
436 | 
437 | ```
438 | anserini/tools/eval/trec_eval.9.0.4/trec_eval -m map -m recall.1000 \
439 |   anserini/src/main/resources/topics-and-qrels/qrels.msmarco-doc.dev.txt \
440 |   run.msmarco-doc-expanded-passage.dev.small.txt
441 | ```
442 | 
443 | The output should be:
444 | 
445 | ```
446 | map                   	all	0.3182
447 | recall_1000           	all	0.9490
448 | ```
449 | 
450 | In comparison with per-passage expansion, we will use per passage _without expansion_ as the baseline. In this method, we will not append the predicted queries to the passages.
451 | 
452 | We will first split the original documents into passages:
453 | ```
454 | python convert_msmarco_passages_doc_to_anserini.py \
455 |   --original_docs_path=msmarco-docs.tsv.gz \
456 |   --doc_ids_path=msmarco_doc_passage_ids.txt \
457 |   --output_docs_path=msmarco-doc-passage/docs.json \
458 | ```
459 | 
460 | It will also take several hours, and the generated file will be 27G. Same as what we did for per-passage expansion, we will use Anserini to index the file, retrieve the top1k passages from them for the dev queries and evaluate them.
461 | 
462 | ```
463 | sh anserini/target/appassembler/bin/IndexCollection -collection JsonCollection \
464 |   -generator DefaultLuceneDocumentGenerator -threads 1 \
465 |   -input msmarco-doc-passage -index lucene-index-msmarco-doc-passage
466 | 
467 | sh anserini/target/appassembler/bin/SearchCollection \
468 |   -index lucene-index-msmarco-doc-passage \
469 |   -topicreader TsvString -topics anserini/src/main/resources/topics-and-qrels/topics.msmarco-doc.dev.txt \
470 |   -output run.msmarco-doc-passage.dev.small.txt \
471 |   -bm25 -hits 10000 -selectMaxPassage -selectMaxPassage.delimiter "#" -selectMaxPassage.hits 1000
472 | 
473 | anserini/tools/eval/trec_eval.9.0.4/trec_eval -m map -m recall.1000 \
474 |   anserini/src/main/resources/topics-and-qrels/qrels.msmarco-doc.dev.txt \
475 |   run.msmarco-doc-passage.dev.small.txt
476 | ```
477 | 
478 | The result is:
479 | ```
480 | map                   	all	0.2688
481 | recall_1000           	all	0.9180
482 | ```
483 | 
484 | ## Predicting Queries from Documents: T5 Inference with TensorFlow
485 | 
486 | If you want to predict the queries yourself, please follow the instructions below.
487 | 
488 | We begin by downloading the corpus, which contains 3.2M documents.
489 | ```bash
490 | wget http://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docs.tsv.gz
491 | gunzip msmarco-docs.tsv.gz
492 | ```
493 | 
494 | We split the corpus into files of 100k documents, which later can be processed in parallel.
495 | ```bash
496 | split --suffix-length 2 --numeric-suffixes --lines 100000 msmarco-docs.tsv msmarco-docs.tsv
497 | ```
498 | 
499 | We now segment each document using a sliding window of 10 sentences and stride of 5 sentences:
500 | ```bash
501 | for ITER in {00..32}; do
502 |     python convert_msmarco_doc_to_t5_format.py \
503 |         --corpus_path=msmarco-docs.tsv$ITER \
504 |         --output_passage_texts_path=${OUTPUT_DIR}/passage_texts.txt$ITER \
505 |         --output_passage_doc_ids_path=${OUTPUT_DIR}/msmarco_doc_passage_ids.txt$ITER
506 | done
507 | ```
508 | 
509 | Note that we use spacy 2.1.6 to do so. Other versions generate different segments, which change retrieval results.
510 | 
511 | We are now ready to run inference. Since this is a costly step, we recommend using Google Cloud
512 | with TPUs to run it faster.
513 | 
514 | We will use the docTTTTTquery model trained on the MS MARCO passage ranking dataset, so you need to upload it to your Google Storage bucket.
515 | ```bash
516 | wget https://git.uwaterloo.ca/jimmylin/doc2query-data/raw/master/T5-passage/t5-base.zip
517 | unzip t5-base.zip
518 | gsutil cp model.ckpt-1004000* gs://your_bucket/models/
519 | ```
520 | 
521 | Run the command below to sample one question per passage (note that you will need to start a TPU).
522 | 
523 | ```bash
524 | for ITER in {00..32}; do
525 |     t5_mesh_transformer \
526 |       --tpu="your_tpu" \
527 |       --gcp_project="your_project_id" \
528 |       --tpu_zone="your_tpu_zone" \
529 |       --model_dir="gs://your_bucket/models/" \
530 |       --gin_file="gs://t5-data/pretrained_models/base/operative_config.gin" \
531 |       --gin_file="infer.gin" \
532 |       --gin_file="sample_decode.gin" \
533 |       --gin_param="infer_checkpoint_step = 1004000" \
534 |       --gin_param="utils.run.sequence_length = {'inputs': 512, 'targets': 64}" \
535 |       --gin_param="Bitransformer.decode.max_decode_length = 64" \
536 |       --gin_param="input_filename = './passage_texts.txt$ITER'" \
537 |       --gin_param="output_filename = './predicted_queries_topk_sample.txt$ITER'" \
538 |       --gin_param="tokens_per_batch = 131072" \
539 |       --gin_param="Bitransformer.decode.temperature = 1.0" \
540 |       --gin_param="Unitransformer.sample_autoregressive.sampling_keep_top_k = 10" \
541 |       --gin_param="utils.tpu_mesh_shape.tpu_topology ='v3-8'
542 | done
543 | ```
544 | 
545 | ## MS MARCO V2 Passage Expansion
546 | 
547 | Here we provide instructions on how to reproduce our docTTTTTquery results for the MS MARCO V2 passage ranking task with the Anserini IR toolkit, using predicted queries.
548 | We opensource the [predicted queries](https://huggingface.co/datasets/castorini/msmarco_v2_passage_doc2query-t5_expansions/viewer/default/train) using the [🤗 Datasets library](https://github.com/huggingface/datasets).
549 | Note that this is a very large dataset, so we ran the docTTTTTquery inference step across multiple TPUs.
550 | In fact, there is a signficant blow-up in the dataset size compared to MS MARCO v1, because of which we choose to only generate 20 queries per passage.
551 | Also, we use a different docTTTTTquery model trained on the MS MARCO v2 passage ranking dataset.
552 | 
553 | We use the [metadata-augmented passage corpus](https://github.com/castorini/anserini/blob/master/docs/experiments-msmarco-v2.md#passage-collection-augmented) which was shown to have better effectiveness.
554 | 
555 | First, we download the expanded queries dataset and expand this corpus using `NUM_QUERIES` queries per passage:
556 | ```bash
557 | export NUM_QUERIES=20
558 | python3 msmarco-v2/augment_corpus.py --hgf_d2q_dataset castorini/msmarco_v2_passage_doc2query-t5_expansions \
559 |         --original_psg_path collections/msmarco_v2_passage_augmented \
560 |         --output_psg_path collections/msmarco_v2_passage_augmented_d2q-t5_${NUM_QUERIES} \
561 |         --num_workers 70 \
562 |         --num_queries ${NUM_QUERIES} \
563 |         --task passage \
564 |         --cache_dir /path/to/cache/dir
565 | ```
566 | The dataset is downloaded and processed in the cache directory after which the corpus is expanded too.
567 | So make sure you have enough storage space (around 300 GB for this entire task).
568 | If the dataset is not already cached, this script would take about 18 hours.
569 | If it is, you can expect it to finish in about 10 hours.
570 | 
571 | Upon completion, index the expanded passages with Anserini:
572 | ```bash
573 | sh target/appassembler/bin/IndexCollection -collection MsMarcoV2PassageCollection \
574 |  -generator DefaultLuceneDocumentGenerator -threads 70 \
575 |  -input collections/msmarco_v2_passage_augmented_d2q-t5_${NUM_QUERIES} \
576 |  -index indexes/msmarco-v2-passage-augmented-d2q-t5-${NUM_QUERIES} \
577 |  -optimize
578 | ```
579 | Note that this index does not store any "extras" (positions, document vectors, raw documents, etc.) because we don't need any of these for BM25 retrieval.
580 | 
581 | 
582 | Finally, we can perform runs on the dev queries (both sets):
583 | 
584 | ```bash
585 | target/appassembler/bin/SearchCollection -index indexes/msmarco-v2-passage-augmented-d2q-t5-${NUM_QUERIES} \
586 |  -topicreader TsvInt -topics src/main/resources/topics-and-qrels/topics.msmarco-v2-passage.dev.txt \
587 |  -output runs/run.msmarco-v2-passage-augmented-d2q-t5-${NUM_QUERIES}.dev.txt -bm25 -hits 1000
588 | 
589 | target/appassembler/bin/SearchCollection -index indexes/msmarco-v2-passage-augmented-d2q-t5-${NUM_QUERIES} \
590 |  -topicreader TsvInt -topics src/main/resources/topics-and-qrels/topics.msmarco-v2-passage.dev2.txt \
591 |  -output runs/run.msmarco-v2-passage-augmented-d2q-t5-${NUM_QUERIES}.dev2.txt -bm25 -hits 1000
592 | ```
593 | 
594 | Evaluation:
595 | ```bash
596 | $ tools/eval/trec_eval.9.0.4/trec_eval -c -M 100 -m map -m recip_rank src/main/resources/topics-and-qrels/qrels.msmarco-v2-passage.dev.txt runs/run.msmarco-v2-passage-augmented-d2q-t5-${NUM_QUERIES}.dev.txt
597 | map                     all     0.1160
598 | recip_rank              all     0.1172
599 | 
600 | $ tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100,1000 src/main/resources/topics-and-qrels/qrels.msmarco-v2-passage.dev.txt runs/run.msmarco-v2-passage-augmented-d2q-t5-${NUM_QUERIES}.dev.txt
601 | recall_100              all     0.5039
602 | recall_1000             all     0.7647
603 | 
604 | $ tools/eval/trec_eval.9.0.4/trec_eval -c -M 100 -m map -m recip_rank src/main/resources/topics-and-qrels/qrels.msmarco-v2-passage.dev2.txt runs/run.msmarco-v2-passage-augmented-d2q-t5-${NUM_QUERIES}.dev2.txt
605 | map                     all     0.1158
606 | recip_rank              all     0.1170
607 | 
608 | $ tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100,1000 src/main/resources/topics-and-qrels/qrels.msmarco-v2-passage.dev2.txt runs/run.msmarco-v2-passage-augmented-d2q-t5-${NUM_QUERIES}.dev2.txt
609 | recall_100              all     0.5158
610 | recall_1000             all     0.7659
611 | ```
612 | 
613 | ## MS MARCO V2 (Segmented) Document Expansion
614 | 
615 | This guide provide sinstructions on how to reproduce our docTTTTTquery results for the MS MARCO V2 document ranking task with the Anserini IR toolkit, using predicted queries.
616 | We opensource the [predicted queries](https://huggingface.co/datasets/castorini/msmarco_v2_doc_segmented_doc2query-t5_expansions/viewer/default/train) using the [🤗 Datasets library](https://github.com/huggingface/datasets).
617 | Note that this is a very large dataset, so we ran the docTTTTTquery inference step across multiple TPUs.
618 | Also, we use a different docTTTTTquery model trained on the MS MARCO v2 passage ranking dataset.
619 | 
620 | We use the [segmented document corpus](https://github.com/castorini/anserini/blob/master/docs/experiments-msmarco-v2.md#document-collection-segmented) which was shown to have better effectiveness.
621 | 
622 | First, we download the expanded queries dataset and expand this corpus using `NUM_QUERIES` queries per passage:
623 | 
624 | ```bash
625 | export NUM_QUERIES=10
626 | python3 msmarco-v2/augment_corpus.py --hgf_d2q_dataset castorini/msmarco_v2_doc_segmented_doc2query-t5_expansions \
627 |         --original_psg_path collections/msmarco_v2_doc_segmented \
628 |         --output_psg_path collections/msmarco_v2_doc_segmented_d2q-t5_${NUM_QUERIES} \
629 |         --num_workers 60 \
630 |         --num_queries ${NUM_QUERIES} \
631 |         --task segment \
632 |         --cache_dir /path/to/cache/dir
633 | ```
634 | The dataset is downloaded and processed in the cache directory after which the corpus is expanded too.
635 | So make sure you have enough storage space (around 300 GB for this entire task).
636 | If the dataset is not already cached, this script would take about 18 hours.
637 | If it is, you can expect it to finish in about 10 hours.
638 | 
639 | Upon completion, index the expanded document segments with Anserini:
640 | ```bash
641 | sh target/appassembler/bin/IndexCollection -collection MsMarcoV2DocCollection \
642 |  -generator DefaultLuceneDocumentGenerator -threads 60 \
643 |  -input collections/msmarco_v2_doc_segmented_d2q-t5_${NUM_QUERIES} \
644 |  -index indexes/msmarco-v2-doc-segmented-d2q-t5-${NUM_QUERIES} \
645 |  -optimize
646 | ```
647 | Note that this index does not store any "extras" (positions, document vectors, raw documents, etc.) because we don't need any of these for BM25 retrieval.
648 | 
649 | 
650 | Finally, we can perform runs on the dev queries (both sets):
651 | ```bash
652 | target/appassembler/bin/SearchCollection -index /store/scratch/rpradeep/msmarco-v2/indexes/msmarco-v2-doc-segmented-d2q-t5-${NUM_QUERIES} \
653 |   -topicreader TsvInt -topics src/main/resources/topics-and-qrels/topics.msmarco-v2-doc.dev.txt \
654 |   -output runs/run.msmarco-v2-doc-segmented-d2q-t5-${NUM_QUERIES}.dev.txt \
655 |   -bm25 -hits 10000 -selectMaxPassage -selectMaxPassage.delimiter "#" -selectMaxPassage.hits 1000
656 | 
657 | target/appassembler/bin/SearchCollection -index /store/scratch/rpradeep/msmarco-v2/indexes/msmarco-v2-doc-segmented-d2q-t5-${NUM_QUERIES} \
658 |   -topicreader TsvInt -topics src/main/resources/topics-and-qrels/topics.msmarco-v2-doc.dev2.txt \
659 |   -output runs/run.msmarco-v2-doc-segmented-d2q-t5-${NUM_QUERIES}.dev2.txt \
660 |   -bm25 -hits 10000 -selectMaxPassage -selectMaxPassage.delimiter "#" -selectMaxPassage.hits 1000
661 | ```
662 | 
663 | 
664 | Evaluation:
665 | ```bash
666 | $ tools/eval/trec_eval.9.0.4/trec_eval -c -M 100 -m map -m recip_rank src/main/resources/topics-and-qrels/qrels.msmarco-v2-doc.dev.txt runs/run.msmarco-v2-doc-segmented-d2q-t5-${NUM_QUERIES}.dev.txt
667 | map                     all     0.2203
668 | recip_rank              all     0.2226
669 | 
670 | $ tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100,1000 src/main/resources/topics-and-qrels/qrels.msmarco-v2-doc.dev.txt runs/run.msmarco-v2-doc-segmented-d2q-t5-${NUM_QUERIES}.dev.txt
671 | recall_100              all     0.7297
672 | recall_1000             all     0.8982
673 | 
674 | $ tools/eval/trec_eval.9.0.4/trec_eval -c -M 100 -m map -m recip_rank src/main/resources/topics-and-qrels/qrels.msmarco-v2-doc.dev2.txt runs/run.msmarco-v2-doc-segmented-d2q-t5-${NUM_QUERIES}.dev2.txt
675 | map                     all     0.2205
676 | recip_rank              all     0.2234
677 | 
678 | $ tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100,1000 src/main/resources/topics-and-qrels/qrels.msmarco-v2-doc.dev2.txt runs/run.msmarco-v2-doc-segmented-d2q-t5-${NUM_QUERIES}.dev2.txt
679 | recall_100              all     0.7316
680 | recall_1000             all     0.8952
681 | ```


--------------------------------------------------------------------------------