├── .gitignore ├── LICENSE ├── README.md ├── SECURITY.md ├── _config.yml ├── downloadData.sh ├── generateArtificialSessions.py ├── generateQueryEmbeddings.py ├── generateQueryEmbeddingsBERT.py ├── generateQuerysets.py └── querySetStats.txt /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.suo 8 | *.user 9 | *.userosscache 10 | *.sln.docstates 11 | 12 | # User-specific files (MonoDevelop/Xamarin Studio) 13 | *.userprefs 14 | 15 | # Build results 16 | [Dd]ebug/ 17 | [Dd]ebugPublic/ 18 | [Rr]elease/ 19 | [Rr]eleases/ 20 | x64/ 21 | x86/ 22 | bld/ 23 | [Bb]in/ 24 | [Oo]bj/ 25 | [Ll]og/ 26 | 27 | # Visual Studio 2015/2017 cache/options directory 28 | .vs/ 29 | # Uncomment if you have tasks that create the project's static files in wwwroot 30 | #wwwroot/ 31 | 32 | # Visual Studio 2017 auto generated files 33 | Generated\ Files/ 34 | 35 | # MSTest test Results 36 | [Tt]est[Rr]esult*/ 37 | [Bb]uild[Ll]og.* 38 | 39 | # NUNIT 40 | *.VisualState.xml 41 | TestResult.xml 42 | 43 | # Build Results of an ATL Project 44 | [Dd]ebugPS/ 45 | [Rr]eleasePS/ 46 | dlldata.c 47 | 48 | # Benchmark Results 49 | BenchmarkDotNet.Artifacts/ 50 | 51 | # .NET Core 52 | project.lock.json 53 | project.fragment.lock.json 54 | artifacts/ 55 | **/Properties/launchSettings.json 56 | 57 | # StyleCop 58 | StyleCopReport.xml 59 | 60 | # Files built by Visual Studio 61 | *_i.c 62 | *_p.c 63 | *_i.h 64 | *.ilk 65 | *.meta 66 | *.obj 67 | *.iobj 68 | *.pch 69 | *.pdb 70 | *.ipdb 71 | *.pgc 72 | *.pgd 73 | *.rsp 74 | *.sbr 75 | *.tlb 76 | *.tli 77 | *.tlh 78 | *.tmp 79 | *.tmp_proj 80 | *.log 81 | *.vspscc 82 | *.vssscc 83 | .builds 84 | *.pidb 85 | *.svclog 86 | *.scc 87 | 88 | # Chutzpah Test files 89 | _Chutzpah* 90 | 91 | # Visual C++ cache files 92 | ipch/ 93 | *.aps 94 | *.ncb 95 | *.opendb 96 | *.opensdf 97 | *.sdf 98 | *.cachefile 99 | *.VC.db 100 | *.VC.VC.opendb 101 | 102 | # Visual Studio profiler 103 | *.psess 104 | *.vsp 105 | *.vspx 106 | *.sap 107 | 108 | # Visual Studio Trace Files 109 | *.e2e 110 | 111 | # TFS 2012 Local Workspace 112 | $tf/ 113 | 114 | # Guidance Automation Toolkit 115 | *.gpState 116 | 117 | # ReSharper is a .NET coding add-in 118 | _ReSharper*/ 119 | *.[Rr]e[Ss]harper 120 | *.DotSettings.user 121 | 122 | # JustCode is a .NET coding add-in 123 | .JustCode 124 | 125 | # TeamCity is a build add-in 126 | _TeamCity* 127 | 128 | # DotCover is a Code Coverage Tool 129 | *.dotCover 130 | 131 | # AxoCover is a Code Coverage Tool 132 | .axoCover/* 133 | !.axoCover/settings.json 134 | 135 | # Visual Studio code coverage results 136 | *.coverage 137 | *.coveragexml 138 | 139 | # NCrunch 140 | _NCrunch_* 141 | .*crunch*.local.xml 142 | nCrunchTemp_* 143 | 144 | # MightyMoose 145 | *.mm.* 146 | AutoTest.Net/ 147 | 148 | # Web workbench (sass) 149 | .sass-cache/ 150 | 151 | # Installshield output folder 152 | [Ee]xpress/ 153 | 154 | # DocProject is a documentation generator add-in 155 | DocProject/buildhelp/ 156 | DocProject/Help/*.HxT 157 | DocProject/Help/*.HxC 158 | DocProject/Help/*.hhc 159 | DocProject/Help/*.hhk 160 | DocProject/Help/*.hhp 161 | DocProject/Help/Html2 162 | DocProject/Help/html 163 | 164 | # Click-Once directory 165 | publish/ 166 | 167 | # Publish Web Output 168 | *.[Pp]ublish.xml 169 | 
*.azurePubxml 170 | # Note: Comment the next line if you want to checkin your web deploy settings, 171 | # but database connection strings (with potential passwords) will be unencrypted 172 | *.pubxml 173 | *.publishproj 174 | 175 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 176 | # checkin your Azure Web App publish settings, but sensitive information contained 177 | # in these scripts will be unencrypted 178 | PublishScripts/ 179 | 180 | # NuGet Packages 181 | *.nupkg 182 | # The packages folder can be ignored because of Package Restore 183 | **/[Pp]ackages/* 184 | # except build/, which is used as an MSBuild target. 185 | !**/[Pp]ackages/build/ 186 | # Uncomment if necessary however generally it will be regenerated when needed 187 | #!**/[Pp]ackages/repositories.config 188 | # NuGet v3's project.json files produces more ignorable files 189 | *.nuget.props 190 | *.nuget.targets 191 | 192 | # Microsoft Azure Build Output 193 | csx/ 194 | *.build.csdef 195 | 196 | # Microsoft Azure Emulator 197 | ecf/ 198 | rcf/ 199 | 200 | # Windows Store app package directories and files 201 | AppPackages/ 202 | BundleArtifacts/ 203 | Package.StoreAssociation.xml 204 | _pkginfo.txt 205 | *.appx 206 | 207 | # Visual Studio cache files 208 | # files ending in .cache can be ignored 209 | *.[Cc]ache 210 | # but keep track of directories ending in .cache 211 | !*.[Cc]ache/ 212 | 213 | # Others 214 | ClientBin/ 215 | ~$* 216 | *~ 217 | *.dbmdl 218 | *.dbproj.schemaview 219 | *.jfm 220 | *.pfx 221 | *.publishsettings 222 | orleans.codegen.cs 223 | 224 | # Including strong name files can present a security risk 225 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 226 | #*.snk 227 | 228 | # Since there are multiple workflows, uncomment next line to ignore bower_components 229 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 230 | #bower_components/ 231 | 232 | # RIA/Silverlight projects 233 | Generated_Code/ 234 | 235 | # Backup & report files from converting an old project file 236 | # to a newer Visual Studio version. Backup files are not needed, 237 | # because we have git ;-) 238 | _UpgradeReport_Files/ 239 | Backup*/ 240 | UpgradeLog*.XML 241 | UpgradeLog*.htm 242 | ServiceFabricBackup/ 243 | *.rptproj.bak 244 | 245 | # SQL Server files 246 | *.mdf 247 | *.ldf 248 | *.ndf 249 | 250 | # Business Intelligence projects 251 | *.rdl.data 252 | *.bim.layout 253 | *.bim_*.settings 254 | *.rptproj.rsuser 255 | 256 | # Microsoft Fakes 257 | FakesAssemblies/ 258 | 259 | # GhostDoc plugin setting file 260 | *.GhostDoc.xml 261 | 262 | # Node.js Tools for Visual Studio 263 | .ntvs_analysis.dat 264 | node_modules/ 265 | 266 | # Visual Studio 6 build log 267 | *.plg 268 | 269 | # Visual Studio 6 workspace options file 270 | *.opt 271 | 272 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
273 | *.vbw
274 | 
275 | # Visual Studio LightSwitch build output
276 | **/*.HTMLClient/GeneratedArtifacts
277 | **/*.DesktopClient/GeneratedArtifacts
278 | **/*.DesktopClient/ModelManifest.xml
279 | **/*.Server/GeneratedArtifacts
280 | **/*.Server/ModelManifest.xml
281 | _Pvt_Extensions
282 | 
283 | # Paket dependency manager
284 | .paket/paket.exe
285 | paket-files/
286 | 
287 | # FAKE - F# Make
288 | .fake/
289 | 
290 | # JetBrains Rider
291 | .idea/
292 | *.sln.iml
293 | 
294 | # CodeRush
295 | .cr/
296 | 
297 | # Python Tools for Visual Studio (PTVS)
298 | __pycache__/
299 | *.pyc
300 | 
301 | # Cake - Uncomment if you are using it
302 | # tools/**
303 | # !tools/packages.config
304 | 
305 | # Tabs Studio
306 | *.tss
307 | 
308 | # Telerik's JustMock configuration file
309 | *.jmconfig
310 | 
311 | # BizTalk build output
312 | *.btp.cs
313 | *.btm.cs
314 | *.odx.cs
315 | *.xsd.cs
316 | 
317 | # OpenCover UI analysis results
318 | OpenCover/
319 | 
320 | # Azure Stream Analytics local run output
321 | ASALocalRun/
322 | 
323 | # MSBuild Binary and Structured Log
324 | *.binlog
325 | 
326 | # NVidia Nsight GPU debugger configuration file
327 | *.nvuser
328 | 
329 | # MFractors (Xamarin productivity tool) working folder
330 | .mfractor/
331 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 
1 | MIT License
2 | 
3 | Copyright (c) Microsoft Corporation. All rights reserved.
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 
1 | # MSMARCO
2 | A family of datasets built using technology and data from Microsoft's Bing.
3 | 
4 | > MS MARCO: A Human Generated MAchine Reading COmprehension Dataset
5 | > Paper URL : https://arxiv.org/abs/1611.09268
6 | 
7 | MS MARCO (Microsoft MAchine Reading COmprehension) is a large-scale dataset focused on machine reading comprehension, question answering, passage ranking, keyphrase extraction, and conversational search, as well as whatever related problems the community thinks would be useful.
8 | 
9 | First released at [NIPS 2016](https://arxiv.org/pdf/1611.09268.pdf), the current dataset has 1,010,916 unique real queries that were generated by sampling and anonymizing Bing usage logs.
The dataset started off focusing on QnA but has since evolved to focus on any problem related to search. For task specifics, please explore some of the tasks that have been built out of the dataset. If you think there is a relevant task we have missed, please open an issue explaining your idea.
10 | 
11 | For more information about [TREC 2019 Deep Learning](https://github.com/microsoft/TREC-2019-Deep-Learning)
12 | 
13 | For more information about [Q&A](https://github.com/microsoft/MSMARCO-Question-Answering)
14 | 
15 | For more information about [Ranking](https://github.com/microsoft/MSMARCO-Passage-Ranking)
16 | 
17 | For more information about [Keyphrase Extraction](https://github.com/microsoft/MSMARCO-OpenKP)
18 | 
19 | For more information about [Conversational Search](https://github.com/microsoft/MSMARCO-Conversational-Search)
20 | 
21 | For more information about [Polite Crawling](https://github.com/microsoft/MSMARCO-Optimal-Freshness-Crawl-Under-Politeness-Constraints)
22 | 
23 | 
24 | ## Dataset Generation, Data Format, And Statistics
25 | What is the difference between MSMARCO and other MRC datasets? We believe the advantages special to MSMARCO are:
26 | - Real questions: All questions have been sampled from real, anonymized Bing queries.
27 | - Real Documents: Most URLs that we sourced the passages from contain the full web documents. These can be used as extra contextual information to improve systems or to compete in our expert task.
28 | - Human Generated Answers: All questions have an answer written by a human. If there was no answer in the passages the judge read, they wrote 'No Answer Present.'
29 | - Human Generated Well-Formed Answers: Some questions include an additional round of human evaluation to create well-formed answers that could be used by intelligent agents like Cortana, Siri, Google Assistant, and Alexa.
30 | - Dataset Size: At over 1 million queries, the dataset is large enough to train the most complex systems while still allowing the data to be sampled for specific applications.
31 | 
32 | ## Download the Dataset
33 | To download the MSMARCO dataset, please navigate to [msmarco.org](http://www.msmarco.org/dataset.aspx) and agree to our Terms and Conditions. If there is some data you think we are missing that would be useful, please open an issue.
34 | 
35 | # Conversational Search
36 | Truly conversational search is the next logical step in the journey to generate intelligent and useful AI. To understand what this may mean, researchers have long voiced a desire to study how people currently converse with search engines.
37 | 
38 | 
39 | ## Introduction
40 | Traditionally, the desire to produce such a comprehensive dataset has gone unmet because those who have this data (search engines) have a responsibility to their users to maintain their privacy, and they cannot share the data publicly in a way that upholds the trust users place in them. Given these two competing forces, we believe we have a dataset and paradigm that meets both sets of needs: an artificial public dataset that approximates the true data, and the ability to evaluate model performance on real user behavior. In practice, this means we have released a public dataset generated by creating artificial sessions using embedding similarity, and we will test on the original data. To say this again: we are not releasing any private user data, but we are releasing what we believe to be a good representation of true user interactions.
41 | 
42 | ## Corpus Generation
43 | To generate our projection corpus, we took the 1,010,916 MSMARCO queries and generated a query vector for each unique query. Once we had these embedding spaces, we built an Approximate Nearest Neighbor (ANN) index using [ANNOY](https://github.com/spotify/annoy).
44 | Next, we sampled our Bing usage logs from 2018-06-01 to 2018-11-30 to find sessions that had more than 1 query, shared a query whose embedding was similar to that of a MSMARCO query, and were likely to be conversational in nature. We then removed all navigation, bot, junk, and adult sessions. Once we did this, we had 45,040,730 unique user sessions of 344,147 unique queries. The average session was 2.6 queries long and the longest session was 160 queries. Just like we did for our public queries, we generated an embedding for each unique query. Finally, in order to merge the two, for each query in each unique session we performed a nearest neighbor search with the real query's vector in the MSMARCO ANN index. This allows us to join the public queries to the private sessions, generating artificial user sessions grounded in true user behavior.
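For illustration, the sketch below shows this index-and-lookup step in miniature, in the spirit of generateArtificialSessions.py in this repository. The random vectors, the 100-dimensional size, and the item count are placeholders, and ANNOY's angular distance is used as a stand-in for cosine similarity.
```
import numpy as np
from annoy import AnnoyIndex

DIM = 100        # dimensionality of the query embeddings (the BERT vectors are 1024-d)
NUM_TREES = 1000 # more trees improve recall at the cost of a larger index

# Placeholder vectors; in practice these are read from the query embedding file.
public_query_vectors = np.random.rand(1000, DIM).astype('float32')

index = AnnoyIndex(DIM, 'angular')  # angular distance approximates cosine similarity
for item_id, vector in enumerate(public_query_vectors):
    index.add_item(item_id, vector)
index.build(NUM_TREES)
index.save('queryEmbeddings.ann')

# Map one real (private) query vector onto its 15 nearest public MSMARCO queries.
real_query_vector = np.random.rand(DIM).astype('float32')
nearest_public_ids = index.get_nns_by_vector(real_query_vector, 15)
```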
45 | 
46 | An example of these search sessions is below.
47 | ```
48 | marco-gen-dev-40 what is the australian flag what is the population of australia what hemisphere is north australia how big is sydney australia is australia a country
49 | marco-gen-dev-152 is elements on a periodic table what is the product of ch4 cost of solar system what constellation is cassiopeia in what is the human skeleton what is a isosceles triangle convert a fraction to a % calculator standard deviation difference calculator graph y = 1/2 x cubed how to what is the volume of a pyramid what is american sign language definition how to put word count on word
50 | marco-gen-dev-157 define colonialism what are migrant workers office 365 cost to add a user what does per capita means what are tariffs definition for tariffs define urbanization
51 | marco-gen-dev-218 icd 10 code rhinitis icd diagnosis code for cva icd code for psoriatic arthritis icd 10 code for facet arthritis of knee icd 10 code for copd icd codes for cad icd diagnosis code for cva icd 10 code for personal history pvd
52 | marco-gen-dev-385 circadian activity rhythms definition what is the synonym of insomnia insomnia definition define narcolepsy causes for night terrors types of meditation definition: meditation pros and cons for death penalty
53 | marco-gen-dev-397 cost of solar system what is the hottest planet? what is coldest planet what is solar system is what kind of galaxy is the milky way spanish iris is what in english
54 | marco-gen-dev-457 can we repeal donald trump what was barack obama is john mccain a republican who is chuck schumer's daughter was bill clinton a democrat is mike pence a veteran why did barack obama get a nobel peace prize was mahatma gandhi awarded a peace prize why did dalai lama win the nobel peace prize
55 | marco-gen-dev-485 definition of technology define scientific method definition of constant graphs definition definition of information technology define axis definition of trial legal definition of bias define what an inference is
56 | marco-gen-dev-496 illusory definition illusion vs allusion definition declaration + definition define: caste define rhetoric define race
57 | marco-gen-dev-572 stock price tesla fb stock price home depot stock price t amazon stock price nxp semiconductors stock price
58 | ```
59 | 
60 | We first released [BERT Based Sessions](https://msmarco.blob.core.windows.net/conversationalsearch/artificialSessionsBERT500k.tsv.gz) and [Query Embedding Based Sessions](https://msmarco.blob.core.windows.net/conversationalsearch/artificialSessionsQueryEncoding500kSample.tsv.gz), which we shared with a small group of researchers to get feedback on what worked better. Based on community feedback and data exploration, we are releasing our first full-scale dataset, described below.
61 | 
62 | 1. Split the 45,040,730 artificial sessions into train, dev, and test sets. To do so, any session that included a query from the QnA eval set was assigned to eval, and the remaining sessions were split 90%/10% between train and dev. These files are called full_marco_sessions_ann_split.* and have the following sizes:
63 | ```
64 | spacemanidol@spacemanidol:/mnt/c/Users/dacamp/Desktop$ wc -l full_marco_sessions_ann_split.*
65 | 3656706 full_marco_sessions_ann_split.dev.tsv
66 | 8480280 full_marco_sessions_ann_split.test.tsv
67 | 32903744 full_marco_sessions_ann_split.train.tsv
68 | 45040730 total
69 | ```
70 | 2. Use the query vectors to generate a similarity score for each edge (each adjacent query pair) and label every edge with one of four types (a sketch of steps 2-5 in code follows this list):
71 | * Topic Change: (cosine similarity) <= 0.4
72 | * Explore: (cosine similarity) in (0.4, 0.7]
73 | * Specify: (cosine similarity) in (0.7, 0.85]
74 | * Paraphrase: (cosine similarity) in (0.85, 1]
75 | 3. Filter by session coherence:
76 | * treat "topic change" edges as unrelated (i.e., no edge),
77 | * build the graph for each session,
78 | * pick the biggest sub-graph in the session,
79 | * throw away the remaining queries not in the biggest sub-graph.
80 | 4. Filter by session length, removing anything with fewer than 4 queries.
81 | 5. Filter out any session where the chain of queries is just paraphrases.
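To make steps 2-5 concrete, here is a sketch of how they could be applied to a single session. It assumes edges connect adjacent queries only, so the biggest sub-graph reduces to the longest run of queries unbroken by a topic change; the exact graph construction behind the released files may differ.
```
import numpy as np

def edge_type(u, v):
    """Label the edge between two adjacent query vectors by cosine similarity."""
    sim = float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))
    if sim <= 0.4:
        return 'topic_change'  # treated as no edge
    if sim <= 0.7:
        return 'explore'
    if sim <= 0.85:
        return 'specify'
    return 'paraphrase'

def filter_session(queries, vectors):
    """Apply steps 2-5 to one session; return the kept queries or None."""
    labels = [edge_type(vectors[i], vectors[i + 1]) for i in range(len(vectors) - 1)]
    # Step 3: split at topic changes and keep the biggest connected run.
    runs, current = [], [0]
    for i, label in enumerate(labels):
        if label == 'topic_change':
            runs.append(current)
            current = [i + 1]
        else:
            current.append(i + 1)
    runs.append(current)
    best = max(runs, key=len)
    # Step 4: drop sessions with fewer than 4 queries.
    if len(best) < 4:
        return None
    # Step 5: drop sessions whose remaining edges are all paraphrases.
    if all(labels[i] == 'paraphrase' for i in best[:-1]):
        return None
    return [queries[i] for i in best]
```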
".half_explore.jsonl": half (explore): >= 50% adjacent edges in (0.4, 0.7] 94 | 3. ".half_specify.jsonl": half (specifiy): >= 50% adjacent edges in (0.7, 0.85] 95 | 96 | Explicit size numbers below 97 | ``` 98 | spacemanidol@spacemanidol:/mnt/c/Users/dacamp/Desktop$ wc -l marco_ann_session.*.half* 99 | 53791 marco_ann_session.dev.half_explore.tsv 100 | 5646 marco_ann_session.dev.half_specify.tsv 101 | 63854 marco_ann_session.dev.half_trans.tsv 102 | 548101 marco_ann_session.test.half_explore.tsv 103 | 42032 marco_ann_session.test.half_specify.tsv 104 | 648868 marco_ann_session.test.half_trans.tsv 105 | 482021 marco_ann_session.train.half_explore.tsv 106 | 50980 marco_ann_session.train.half_specify.tsv 107 | 572791 marco_ann_session.train.half_trans.tsv 108 | 2468084 total 109 | ``` 110 | 111 | 112 | ## Corpus task 113 | We are currently assembling baselines and building an evaluation framework but the initial task will be as follows: Given a chain of queries q1, q2,...,q(n-1) predict q(n). Ideally systems will train and test on the public artificial and will be evaluated on both the private real data and public artificial eval data. 114 | 115 | ### Initial 500k Sample Process 116 | 1. Downloaded the data 117 | ~~~ 118 | ./downloadData.sh 119 | ~~~ 120 | 2. Generate the querySets 121 | ~~~ 122 | python generateQuerySets.py 123 | ~~~ 124 | 3. Get Query Embeddings 125 | ~~~ 126 | python generateQueryEmbeddings.py allQueries.tsv queryEmbeddings.tsv 127 | ~~~ 128 | 4. Generate BERT Query Embeddings 129 | To generate our Query Embeddings [we used BERT As A Service](https://github.com/hanxiao/bert-as-service) to generate a unique query embedding for each query in our set. If you want to go ahead and regenerate embeddings(or use it to generate another alternate query source for you model) you can follow what we did below. 130 | ~~~ 131 | cd Data/BERT 132 | pip install bert-serving-server # server 133 | pip install bert-serving-client # client, independent of `bert-serving-server` 134 | wget https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip 135 | unzip cased_L-24_H-1024_A-16 136 | bert-serving-start -model_dir ~/Data/BERT/cased_L-24_H-1024_A-16 -num_worker=4 -device_map 0 #depending on your computer play around with these settings 137 | ~~~ 138 | In a separate shell 139 | ~~~ 140 | python3 generateQueryEmbeddingsBERT.py allQueries.tsv BERTQueryEmbeddings.tsv 141 | ~~~ 142 | 5. Generate Sessions 143 | ~~~ 144 | python generateArtificialSessions.py realQueries queryEmbeddings.tsv BERTQueryEmbeddings.tsv queryEmbeddings.ann sessions.tsv 145 | ~~~ 146 | 147 | 148 | # Contributing 149 | 150 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 151 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 152 | the rights to use your contribution. For details, visit https://cla.microsoft.com. 153 | 154 | When you submit a pull request, a CLA-bot will automatically determine whether you need to provide 155 | a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions 156 | provided by the bot. You will only need to do this once across all repos using our CLA. 157 | 158 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
114 | 
115 | ### Initial 500k Sample Process
116 | 1. Download the data
117 | ~~~
118 | ./downloadData.sh
119 | ~~~
120 | 2. Generate the query sets (generateQuerysets.py expects the three MSMARCO query files, the Quora duplicate-questions file, the NQ directory, the unused-MSMARCO file, and the raw sessions file)
121 | ~~~
122 | python generateQuerysets.py <msmarcoTrainQueries> <msmarcoDevQueries> <msmarcoEvalQueries> <quoraFile> <nqDir> <unusedMSMARCOFile> <sessionsFile>
123 | ~~~
124 | 3. Get Query Embeddings (the embedding API endpoint is passed as the first argument)
125 | ~~~
126 | python generateQueryEmbeddings.py <APIEndpoint> allQueries.tsv queryEmbeddings.tsv
127 | ~~~
128 | 4. Generate BERT Query Embeddings
129 | To generate our query embeddings, [we used BERT As A Service](https://github.com/hanxiao/bert-as-service) to generate a unique query embedding for each query in our set. If you want to regenerate the embeddings (or use this setup to generate an alternate query source for your model), you can follow what we did below.
130 | ~~~
131 | cd Data/BERT
132 | pip install bert-serving-server # server
133 | pip install bert-serving-client # client, independent of `bert-serving-server`
134 | wget https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip
135 | unzip cased_L-24_H-1024_A-16.zip
136 | bert-serving-start -model_dir ~/Data/BERT/cased_L-24_H-1024_A-16 -num_worker=4 -device_map 0 # depending on your machine, adjust these settings
137 | ~~~
138 | In a separate shell:
139 | ~~~
140 | python3 generateQueryEmbeddingsBERT.py allQueries.tsv BERTQueryEmbeddings.tsv
141 | ~~~
142 | 5. Generate Sessions (the artificial query file produced by generateQuerysets.py is passed second)
143 | ~~~
144 | python generateArtificialSessions.py realQueries.tsv allQueries.tsv queryEmbeddings.tsv BERTQueryEmbeddings.tsv queryEmbeddings.ann sessions.tsv
145 | ~~~
146 | 
147 | 
148 | # Contributing
149 | 
150 | This project welcomes contributions and suggestions. Most contributions require you to agree to a
151 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
152 | the rights to use your contribution. For details, visit https://cla.microsoft.com.
153 | 
154 | When you submit a pull request, a CLA-bot will automatically determine whether you need to provide
155 | a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions
156 | provided by the bot. You will only need to do this once across all repos using our CLA.
157 | 
158 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
159 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
160 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
161 | 
-------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 
1 | 
2 | 
3 | ## Security
4 | 
5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
6 | 
7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below.
8 | 
9 | ## Reporting Security Issues
10 | 
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 | 
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report).
14 | 
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey).
16 | 
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc).
18 | 
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 | 
21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 | * Full paths of source file(s) related to the manifestation of the issue
23 | * The location of the affected source code (tag/branch/commit or direct URL)
24 | * Any special configuration required to reproduce the issue
25 | * Step-by-step instructions to reproduce the issue
26 | * Proof-of-concept or exploit code (if possible)
27 | * Impact of the issue, including how an attacker might exploit the issue
28 | 
29 | This information will help us triage your report more quickly.
30 | 
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs.
32 | 
33 | ## Preferred Languages
34 | 
35 | We prefer all communications to be in English.
36 | 
37 | ## Policy
38 | 
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd).
40 | 
41 | 
42 | 
-------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 
1 | theme: jekyll-theme-slate
-------------------------------------------------------------------------------- /downloadData.sh: -------------------------------------------------------------------------------- 
1 | export CLOUD_SDK_REPO="cloud-sdk-$(lsb_release -c -s)"
2 | echo "deb http://packages.cloud.google.com/apt $CLOUD_SDK_REPO main" | sudo tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
3 | curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -
4 | sudo apt-get update && sudo apt-get install google-cloud-sdk
5 | sudo apt-get install google-cloud-sdk-app-engine-java
6 | gcloud init
7 | mkdir data
8 | gsutil -m cp -R gs://natural_questions . # pull the Natural Questions corpus from its public GCS bucket
9 | mkdir nq
10 | mv natural_questions/v1.0/dev/* nq/
11 | mv natural_questions/v1.0/train/* nq/
12 | rm -rf natural_questions
13 | wget http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv
14 | wget https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz
15 | tar -xzvf collectionandqueries.tar.gz
16 | rm collectionandqueries.tar.gz collection.tsv qrels.* # keep only the query files
17 | cd nq
18 | gunzip *
19 | 
-------------------------------------------------------------------------------- /generateArtificialSessions.py: -------------------------------------------------------------------------------- 
1 | import json
2 | import math
3 | import numpy as np
4 | import sys
5 | from annoy import AnnoyIndex
6 | 
7 | TREESIZE = 1000
8 | 
9 | 
10 | def loadQueries(filename):
11 |     queries = set()
12 |     with open(filename,'r') as f:
13 |         for l in f:
14 |             queries.add(l.strip())
15 |     return queries
16 | 
17 | def loadSessions(filename):
18 |     sessions = []
19 |     with open(filename,'r') as f:
20 |         for l in f:
21 |             sessions.append(l.strip().split('\t'))
22 |     return sessions
23 | 
24 | 
25 | def loadVectors(filename, realQueries, artificialQueries):
26 |     i = 0
27 |     j = 0
28 |     artificial = [{},{},[]] #Query2Idx, idx2Query, idx2Vector
29 |     real = [{},{},[]] #Query2Idx, idx2Query, idx2Vector
30 |     with open(filename,'r') as f:
31 |         for l in f:
32 |             l = l.strip().split('\t')
33 |             query = l[0]
34 |             vectors = l[1].split(' ')
35 |             if query in artificialQueries:
36 |                 artificial[0][query] = i
37 |                 artificial[1][i] = query
38 |                 artificial[2].append(np.array(vectors,dtype=float))
39 |                 i += 1
40 |             if query in realQueries:
41 |                 real[0][query] = j
42 |                 real[1][j] = query
43 |                 real[2].append(np.array(vectors,dtype=float))
44 |                 j += 1
45 |     return real, artificial
46 | 
47 | def generateAnnoy(real, artificial, annoyFilename, dimensions):
48 |     idx2vec = np.array(artificial[2])
49 |     t = AnnoyIndex(dimensions, 'angular') # angular distance approximates cosine similarity
50 |     for j in range(len(artificial[2])):
51 |         t.add_item(j,idx2vec[j])
52 |     print('Done Adding items to AnnoyIndex')
53 |     t.build(TREESIZE)
54 |     print('Done Building AnnoyIndex')
55 |     t.save(annoyFilename)
56 |     return t
57 | 
58 | def generateArtificialSessions(realQueryVectors, artificialQueryVectors, sessions, annoyEmbedding, filename):
59 |     with open(filename,'w') as w:
60 |         for session in sessions:
61 |             queriesUsed = set()
62 |             output = ''
63 |             properArtificialSetGenerated = True
64 |             for query in session:
65 |                 if query not in realQueryVectors[0]: # vector lookup failed, so this session cannot be generated
66 |                     properArtificialSetGenerated = False; break
67 |                 artificialQueries = annoyEmbedding.get_nns_by_vector(realQueryVectors[2][realQueryVectors[0][query]], 15, search_k=-1, include_distances=False)
68 |                 replacementFound = False
69 |                 for i in range(len(artificialQueries)):
70 |                     artificialQuery = artificialQueryVectors[1][artificialQueries[i]]
71 |                     if artificialQuery not in queriesUsed: # ensure the session isn't just repeating queries
72 |                         queriesUsed.add(artificialQuery)
73 |                         replacementFound = True
74 |                         output += '{}\t'.format(artificialQuery)
75 |                         break
76 |                 if not replacementFound:
77 |                     properArtificialSetGenerated = False
78 |                     break
79 |             if properArtificialSetGenerated:
80 |                 w.write("{}\n".format(output[:-1]))
81 | if __name__ == "__main__":
82 |     if len(sys.argv) != 7:
83 |         print("Usage: generateArtificialSessions.py <realQueriesFile> <artificialQueriesFile> <queryVectorsFile> <BERTQueryVectorsFile> <annoyIndexFile> <sessionsFile>")
84 |         exit(-1)
85 |     else:
86 |         print("Loading Queries and Sessions")
87 |         realQueries = loadQueries(sys.argv[1])
88 |         artificialQueries = loadQueries(sys.argv[2])
89 |         print("Loading Sessions")
90 |         sessions = loadSessions(sys.argv[6])
91 |         #Run regular embeddings
92 |         print("Loading Query Vectors")
93 |         real, artificial = loadVectors(sys.argv[3], realQueries, artificialQueries)
94 |         print("Building Annoy Index for Query Embeddings")
95 |         annoyEmbedding = generateAnnoy(real, artificial, sys.argv[5], 100)
96 |         print("Generating Sessions from Query Embeddings")
97 |         generateArtificialSessions(real,artificial, sessions, annoyEmbedding, 'sessionsEmbedding.tsv')
98 |         #Run on BERT embeddings
99 |         print("Loading BERT Vectors")
100 |         real, artificial = loadVectors(sys.argv[4], realQueries, artificialQueries)
101 |         print("Building Annoy Index for BERT Embeddings")
102 |         annoyEmbedding = generateAnnoy(real, artificial, 'BERT' + sys.argv[5], 1024)
103 |         print("Generating Sessions from BERT Embeddings")
104 |         generateArtificialSessions(real, artificial, sessions, annoyEmbedding, 'sessionsEmbeddingBERT.tsv')
105 | 
-------------------------------------------------------------------------------- /generateQueryEmbeddings.py: -------------------------------------------------------------------------------- 
1 | import sys
2 | import json
3 | import requests
4 | def loadQueries(filename):
5 |     queries = []
6 |     with open(filename,'r') as f:
7 |         for l in f:
8 |             queries.append(l.strip())
9 |     return queries
10 | 
11 | def generateOutput(responses):
12 |     output = ''
13 |     for response in responses:
14 |         output += '{}\t'.format(response['query'])
15 |         for v in response['vector'][:-1]:
16 |             output += '{} '.format(v)
17 |         output += '{}\n'.format(response['vector'][-1])
18 |     return output
19 | 
20 | def getVectors(queries, key, filename):
21 |     chunks = []
22 |     # Batch the queries into chunks of 10 per API request.
23 |     for i in range(0, len(queries), 10):
24 |         chunks.append(queries[i:i+10])
25 |     with open(filename,'w') as w:
26 |         for i, chunk in enumerate(chunks):
27 |             if i % 100 == 0:
28 |                 print('{} vectors retrieved'.format(i*10))
29 |             # Retry up to three times; the vector API occasionally fails.
30 |             try:
31 |                 w.write(generateOutput(requests.get(url=key + str(chunk) + "}").json()))
32 |             except:
33 |                 try:
34 |                     w.write(generateOutput(requests.get(url=key + str(chunk) + "}").json()))
35 |                 except:
36 |                     try:
37 |                         w.write(generateOutput(requests.get(url=key + str(chunk) + "}").json()))
38 |                     except:
39 |                         #Its dirty but sometimes the API fails and this is the easiest fix
40 |                         continue
41 | if __name__ == "__main__":
42 |     if len(sys.argv) != 4:
43 |         print('Usage: generateQueryEmbeddings.py <APIEndpoint> <queryFile> <outputVectorsFile>')
44 |         exit(-1)
45 |     else:
46 |         queries = loadQueries(sys.argv[2])
47 |         getVectors(queries, sys.argv[1], sys.argv[3])
48 | 
-------------------------------------------------------------------------------- /generateQueryEmbeddingsBERT.py: -------------------------------------------------------------------------------- 
1 | import sys
2 | import json
3 | import time
4 | from bert_serving.client import BertClient
5 | def loadQueries(filename):
6 |     queries = []
7 |     with open(filename,'r') as f:
8 |         for l in f:
9 |             queries.append(l.strip())
10 |     return queries
11 | def process(queryPack, response):
12 |     output = ''
13 |     for i in range(len(queryPack)):
14 |         output += "{}\t".format(queryPack[i])
15 |         for j in range(len(response[i])):
16 |             output += "{} ".format(response[i][j])
17 |         output += "\n"
18 |     return output
19 | def getVectors(queries, filename):
20 |     i = 1
21 |     bc = BertClient()
22 |     query = 'who founded microsoft'
23 |     print("Testing bc\nTesting Query:{}\nVector:{}".format(query, bc.encode([query])[0]))
24 |     with open(filename,'w') as w:
25 |         for j in range(0,len(queries), 100):
26 |             if i % 100 == 0:
27 |                 print('{} vectors retrieved'.format(i*100))
28 |             queryPack = queries[j:j+100]
29 |             response = bc.encode(queryPack)
30 |             w.write(process(queryPack, response))
31 |             i += 1
32 | if __name__ == "__main__":
33 |     if len(sys.argv) != 3:
34 |         print('Usage: generateQueryEmbeddingsBERT.py <queryFile> <outputVectorsFile>')
35 |         exit(-1)
36 |     else:
37 |         queries = loadQueries(sys.argv[1])
38 |         print("{} queries loaded".format(len(queries)))
39 |         getVectors(queries, sys.argv[2])
40 | 
-------------------------------------------------------------------------------- /generateQuerysets.py: -------------------------------------------------------------------------------- 
1 | import json
2 | import sys
3 | import os
4 | def loadNQ(dirPath):
5 |     files = os.listdir(dirPath)
6 |     queries = set()
7 |     for file in files:
8 |         with open(os.path.join(dirPath, file),'r') as f:
9 |             for l in f:
10 |                 j = json.loads(l)
11 |                 query_text = j['question_text']
12 |                 queries.add(query_text)
13 |     return queries
14 | def loadQuora(filename):
15 |     #id qid1 qid2 question1 question2 is_duplicate
16 |     #0 1 2 What is the step by step guide to invest in share market in india? What is the step by step guide to invest in share market? 0
17 |     queries = set()
18 |     with open(filename) as f:
19 |         for l in f:
20 |             l = l.strip().split('\t')
21 |             if len(l) > 4:
22 |                 queries.add(l[3])
23 |                 queries.add(l[4])
24 |     return queries
25 | def loadMSMARCO(files):
26 |     #1048578 cost of endless pools/swim spa
27 |     queries = set()
28 |     for file in files:
29 |         with open(file,'r') as f:
30 |             for l in f:
31 |                 queries.add(l.strip().split('\t')[1])
32 |     return queries
33 | def loadUnusedMSMARCO(filename):
34 |     #AnswerIsInPassage\tJudgeID\tHitGroupDataInt\tHitDataInt\tHitState\tJudgmentState\tJudgmentDataInt\tJudgmentDataIntName\tJudgmentSubmitTime\tJudgmentTypeID\tTimeSpentOnJudgment\tHitGroupID\tHitID\tQueryId\tQueryText\tPassagesJson\tJudgmentID\tjudgment\tanswerTextArea\tnopassageSubmit\texplainNoAnswer\texplainCantJudge\tSL_locer\tSL_lng_from\tSL_lng_to\tJudgmentType\tGoldHitType\tGoldHitComments\t@RealTimeAuditComment\tConsensus
35 |     queries = set()
36 |     with open(filename, 'r') as f:
37 |         for l in f:
38 |             l = l.split('\t')
39 |             queries.add(l[14]) # QueryText is the 15th column
40 |     return queries
41 | def loadSessions(filename):
42 |     #sessionid\tquery
43 |     sessions, queries = {}, set()
44 |     with open(filename,'r') as f:
45 |         for l in f:
46 |             l = l.strip().split('\t')
47 |             sessionID = l[0]
48 |             query = l[1]
49 |             queries.add(query)
50 |             if sessionID not in sessions:
51 |                 sessions[sessionID] = []
52 |             sessions[sessionID].append(query)
53 |     return sessions, queries
54 | def writeData(msmarcoQueries, quoraQueries, nqQueries, unusedMSMARCOQueries, realQueries, sessions):
55 |     allQueries = realQueries.union(unusedMSMARCOQueries.union(msmarcoQueries.union(quoraQueries.union(nqQueries))))
56 |     with open('msmarcoQueries.tsv','w') as w:
57 |         for query in msmarcoQueries:
58 |             w.write('{}\n'.format(query))
59 |     with open('quoraQueries.tsv','w') as w:
60 |         for query in quoraQueries:
61 |             w.write('{}\n'.format(query))
62 |     with open('nqQueries.tsv','w') as w:
63 |         for query in nqQueries:
64 |             w.write('{}\n'.format(query))
65 |     with open('unusedMSMARCOQueries.tsv','w') as w:
66 |         for query in unusedMSMARCOQueries:
67 |             w.write('{}\n'.format(query))
68 |     with open('realQueries.tsv','w') as w:
69 |         for query in realQueries:
70 |             w.write('{}\n'.format(query))
71 |     with open('allQueries.tsv','w') as w:
72 |         for query in allQueries:
73 |             w.write('{}\n'.format(query))
74 |     with open('cleanedSessions.tsv','w') as w:
75 |         for session in sessions:
76 |             for query in sessions[session][:-1]:
77 |                 w.write('{}\t'.format(query))
78 |             w.write('{}\n'.format(sessions[session][-1]))
79 | if __name__ == "__main__":
80 |     if len(sys.argv) != 8:
81 |         print("Usage: generateQuerysets.py <msmarcoTrainQueries> <msmarcoDevQueries> <msmarcoEvalQueries> <quoraFile> <nqDir> <unusedMSMARCOFile> <sessionsFile>")
82 |         exit(-1)
83 |     else:
84 |         msmarcoQueries = loadMSMARCO([sys.argv[1],sys.argv[2],sys.argv[3]])
85 |         print("Done reading MSMARCO")
86 |         quoraQueries = loadQuora(sys.argv[4])
87 |         print("Done reading quora")
88 |         #nqQueries = loadNQ(sys.argv[5])
89 |         nqQueries = set() # NQ TOS do not allow us to use the dataset for this type of research
90 |         print("Done reading NQ")
91 |         #unusedMSMARCO = loadUnusedMSMARCO(sys.argv[6])
92 |         unusedMSMARCO = set() # must be a set so the union calls in writeData work
93 |         print("Done reading Unused")
94 |         sessions, realQueries = loadSessions(sys.argv[7])
95 |         print("Done reading Sessions")
96 |         writeData(msmarcoQueries, quoraQueries, nqQueries, unusedMSMARCO, realQueries, sessions)
97 | 
-------------------------------------------------------------------------------- /querySetStats.txt: -------------------------------------------------------------------------------- 
1 | Done reading MSMARCO
2 | Done reading quora
3 | Done reading NQ
4 | There are 1010916
unique MSMARCO Queries, 537359 unique Quora Duplicate Queries, 315203 unique NQ Queries for a total of 1862849 unique queries. 5 | 6 | --------------------------------------------------------------------------------