├── .gitignore ├── LICENSE ├── README.md ├── SECURITY.md ├── _config.yml ├── downloadData.sh ├── generateArtificialSessions.py ├── generateQueryEmbeddings.py ├── generateQueryEmbeddingsBERT.py ├── generateQuerysets.py └── querySetStats.txt /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.suo 8 | *.user 9 | *.userosscache 10 | *.sln.docstates 11 | 12 | # User-specific files (MonoDevelop/Xamarin Studio) 13 | *.userprefs 14 | 15 | # Build results 16 | [Dd]ebug/ 17 | [Dd]ebugPublic/ 18 | [Rr]elease/ 19 | [Rr]eleases/ 20 | x64/ 21 | x86/ 22 | bld/ 23 | [Bb]in/ 24 | [Oo]bj/ 25 | [Ll]og/ 26 | 27 | # Visual Studio 2015/2017 cache/options directory 28 | .vs/ 29 | # Uncomment if you have tasks that create the project's static files in wwwroot 30 | #wwwroot/ 31 | 32 | # Visual Studio 2017 auto generated files 33 | Generated\ Files/ 34 | 35 | # MSTest test Results 36 | [Tt]est[Rr]esult*/ 37 | [Bb]uild[Ll]og.* 38 | 39 | # NUNIT 40 | *.VisualState.xml 41 | TestResult.xml 42 | 43 | # Build Results of an ATL Project 44 | [Dd]ebugPS/ 45 | [Rr]eleasePS/ 46 | dlldata.c 47 | 48 | # Benchmark Results 49 | BenchmarkDotNet.Artifacts/ 50 | 51 | # .NET Core 52 | project.lock.json 53 | project.fragment.lock.json 54 | artifacts/ 55 | **/Properties/launchSettings.json 56 | 57 | # StyleCop 58 | StyleCopReport.xml 59 | 60 | # Files built by Visual Studio 61 | *_i.c 62 | *_p.c 63 | *_i.h 64 | *.ilk 65 | *.meta 66 | *.obj 67 | *.iobj 68 | *.pch 69 | *.pdb 70 | *.ipdb 71 | *.pgc 72 | *.pgd 73 | *.rsp 74 | *.sbr 75 | *.tlb 76 | *.tli 77 | *.tlh 78 | *.tmp 79 | *.tmp_proj 80 | *.log 81 | *.vspscc 82 | *.vssscc 83 | .builds 84 | *.pidb 85 | *.svclog 86 | *.scc 87 | 88 | # Chutzpah Test files 89 | _Chutzpah* 90 | 91 | # Visual C++ cache files 92 | ipch/ 93 | *.aps 94 | *.ncb 95 | *.opendb 96 | *.opensdf 97 | *.sdf 98 | *.cachefile 99 | *.VC.db 100 | *.VC.VC.opendb 101 | 102 | # Visual Studio profiler 103 | *.psess 104 | *.vsp 105 | *.vspx 106 | *.sap 107 | 108 | # Visual Studio Trace Files 109 | *.e2e 110 | 111 | # TFS 2012 Local Workspace 112 | $tf/ 113 | 114 | # Guidance Automation Toolkit 115 | *.gpState 116 | 117 | # ReSharper is a .NET coding add-in 118 | _ReSharper*/ 119 | *.[Rr]e[Ss]harper 120 | *.DotSettings.user 121 | 122 | # JustCode is a .NET coding add-in 123 | .JustCode 124 | 125 | # TeamCity is a build add-in 126 | _TeamCity* 127 | 128 | # DotCover is a Code Coverage Tool 129 | *.dotCover 130 | 131 | # AxoCover is a Code Coverage Tool 132 | .axoCover/* 133 | !.axoCover/settings.json 134 | 135 | # Visual Studio code coverage results 136 | *.coverage 137 | *.coveragexml 138 | 139 | # NCrunch 140 | _NCrunch_* 141 | .*crunch*.local.xml 142 | nCrunchTemp_* 143 | 144 | # MightyMoose 145 | *.mm.* 146 | AutoTest.Net/ 147 | 148 | # Web workbench (sass) 149 | .sass-cache/ 150 | 151 | # Installshield output folder 152 | [Ee]xpress/ 153 | 154 | # DocProject is a documentation generator add-in 155 | DocProject/buildhelp/ 156 | DocProject/Help/*.HxT 157 | DocProject/Help/*.HxC 158 | DocProject/Help/*.hhc 159 | DocProject/Help/*.hhk 160 | DocProject/Help/*.hhp 161 | DocProject/Help/Html2 162 | DocProject/Help/html 163 | 164 | # Click-Once directory 165 | publish/ 166 | 167 | # Publish Web Output 168 | *.[Pp]ublish.xml 169 | 
*.azurePubxml 170 | # Note: Comment the next line if you want to checkin your web deploy settings, 171 | # but database connection strings (with potential passwords) will be unencrypted 172 | *.pubxml 173 | *.publishproj 174 | 175 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 176 | # checkin your Azure Web App publish settings, but sensitive information contained 177 | # in these scripts will be unencrypted 178 | PublishScripts/ 179 | 180 | # NuGet Packages 181 | *.nupkg 182 | # The packages folder can be ignored because of Package Restore 183 | **/[Pp]ackages/* 184 | # except build/, which is used as an MSBuild target. 185 | !**/[Pp]ackages/build/ 186 | # Uncomment if necessary however generally it will be regenerated when needed 187 | #!**/[Pp]ackages/repositories.config 188 | # NuGet v3's project.json files produces more ignorable files 189 | *.nuget.props 190 | *.nuget.targets 191 | 192 | # Microsoft Azure Build Output 193 | csx/ 194 | *.build.csdef 195 | 196 | # Microsoft Azure Emulator 197 | ecf/ 198 | rcf/ 199 | 200 | # Windows Store app package directories and files 201 | AppPackages/ 202 | BundleArtifacts/ 203 | Package.StoreAssociation.xml 204 | _pkginfo.txt 205 | *.appx 206 | 207 | # Visual Studio cache files 208 | # files ending in .cache can be ignored 209 | *.[Cc]ache 210 | # but keep track of directories ending in .cache 211 | !*.[Cc]ache/ 212 | 213 | # Others 214 | ClientBin/ 215 | ~$* 216 | *~ 217 | *.dbmdl 218 | *.dbproj.schemaview 219 | *.jfm 220 | *.pfx 221 | *.publishsettings 222 | orleans.codegen.cs 223 | 224 | # Including strong name files can present a security risk 225 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 226 | #*.snk 227 | 228 | # Since there are multiple workflows, uncomment next line to ignore bower_components 229 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 230 | #bower_components/ 231 | 232 | # RIA/Silverlight projects 233 | Generated_Code/ 234 | 235 | # Backup & report files from converting an old project file 236 | # to a newer Visual Studio version. Backup files are not needed, 237 | # because we have git ;-) 238 | _UpgradeReport_Files/ 239 | Backup*/ 240 | UpgradeLog*.XML 241 | UpgradeLog*.htm 242 | ServiceFabricBackup/ 243 | *.rptproj.bak 244 | 245 | # SQL Server files 246 | *.mdf 247 | *.ldf 248 | *.ndf 249 | 250 | # Business Intelligence projects 251 | *.rdl.data 252 | *.bim.layout 253 | *.bim_*.settings 254 | *.rptproj.rsuser 255 | 256 | # Microsoft Fakes 257 | FakesAssemblies/ 258 | 259 | # GhostDoc plugin setting file 260 | *.GhostDoc.xml 261 | 262 | # Node.js Tools for Visual Studio 263 | .ntvs_analysis.dat 264 | node_modules/ 265 | 266 | # Visual Studio 6 build log 267 | *.plg 268 | 269 | # Visual Studio 6 workspace options file 270 | *.opt 271 | 272 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
273 | *.vbw
274 | 
275 | # Visual Studio LightSwitch build output
276 | **/*.HTMLClient/GeneratedArtifacts
277 | **/*.DesktopClient/GeneratedArtifacts
278 | **/*.DesktopClient/ModelManifest.xml
279 | **/*.Server/GeneratedArtifacts
280 | **/*.Server/ModelManifest.xml
281 | _Pvt_Extensions
282 | 
283 | # Paket dependency manager
284 | .paket/paket.exe
285 | paket-files/
286 | 
287 | # FAKE - F# Make
288 | .fake/
289 | 
290 | # JetBrains Rider
291 | .idea/
292 | *.sln.iml
293 | 
294 | # CodeRush
295 | .cr/
296 | 
297 | # Python Tools for Visual Studio (PTVS)
298 | __pycache__/
299 | *.pyc
300 | 
301 | # Cake - Uncomment if you are using it
302 | # tools/**
303 | # !tools/packages.config
304 | 
305 | # Tabs Studio
306 | *.tss
307 | 
308 | # Telerik's JustMock configuration file
309 | *.jmconfig
310 | 
311 | # BizTalk build output
312 | *.btp.cs
313 | *.btm.cs
314 | *.odx.cs
315 | *.xsd.cs
316 | 
317 | # OpenCover UI analysis results
318 | OpenCover/
319 | 
320 | # Azure Stream Analytics local run output
321 | ASALocalRun/
322 | 
323 | # MSBuild Binary and Structured Log
324 | *.binlog
325 | 
326 | # NVidia Nsight GPU debugger configuration file
327 | *.nvuser
328 | 
329 | # MFractors (Xamarin productivity tool) working folder
330 | .mfractor/
331 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 
1 | MIT License
2 | 
3 | Copyright (c) Microsoft Corporation. All rights reserved.
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 
1 | # MSMARCO
2 | A family of datasets built using technology and data from Microsoft's Bing.
3 | 
4 | > MS MARCO: A Human Generated MAchine Reading COmprehension Dataset
5 | > Paper URL : https://arxiv.org/abs/1611.09268
6 | 
7 | MS MARCO (Microsoft MAchine Reading COmprehension) is a large-scale dataset focused on machine reading comprehension, question answering, passage ranking, keyphrase extraction, and conversational search, as well as whatever related problems the community thinks would be useful.
8 | 
9 | First released at [NIPS 2016](https://arxiv.org/pdf/1611.09268.pdf), the current dataset has 1,010,916 unique real queries that were generated by sampling and anonymizing Bing usage logs.
The dataset started off focusing on QnA but has since evolved to focus on any problem related to search. For task specifics, please explore some of the tasks that have been built out of the dataset. If you think there is a relevant task we have missed, please open an issue explaining your idea.
10 | 
11 | For more information about [TREC 2019 Deep Learning](https://github.com/microsoft/TREC-2019-Deep-Learning)
12 | 
13 | For more information about [Q&A](https://github.com/microsoft/MSMARCO-Question-Answering)
14 | 
15 | For more information about [Ranking](https://github.com/microsoft/MSMARCO-Passage-Ranking)
16 | 
17 | For more information about [Keyphrase Extraction](https://github.com/microsoft/MSMARCO-OpenKP)
18 | 
19 | For more information about [Conversational Search](https://github.com/microsoft/MSMARCO-Conversational-Search)
20 | 
21 | For more information about [Polite Crawling](https://github.com/microsoft/MSMARCO-Optimal-Freshness-Crawl-Under-Politeness-Constraints)
22 | 
23 | 
24 | ## Dataset Generation, Data Format, And Statistics
25 | What is the difference between MSMARCO and other MRC datasets? We believe the advantages special to MSMARCO are:
26 | - Real questions: All questions have been sampled from real, anonymized Bing queries.
27 | - Real Documents: Most URLs that we sourced the passages from contain the full web documents. These can be used as extra contextual information to improve systems or to compete in our expert task.
28 | - Human Generated Answers: All questions have an answer written by a human. If there was no answer in the passages the judge read, they wrote 'No Answer Present.'
29 | - Human Generated Well-Formed Answers: Some questions include an additional round of human evaluation to create well-formed answers that could be used by intelligent agents like Cortana, Siri, Google Assistant, and Alexa.
30 | - Dataset Size: At over 1 million queries, the dataset is large enough to train the most complex systems while still allowing the data to be sampled for specific applications.
31 | 
32 | ## Download the Dataset
33 | To download the MSMARCO dataset, please navigate to [msmarco.org](http://www.msmarco.org/dataset.aspx) and agree to our Terms and Conditions. If there is some data you think we are missing that would be useful, please open an issue.
34 | 
35 | # Conversational Search
36 | Truly conversational search is the next logical step in the journey to generate intelligent and useful AI. To understand what this may mean, researchers have long voiced a desire to study how people currently converse with search engines.
37 | 
38 | 
39 | ## Introduction
40 | Traditionally, the desire to produce such a comprehensive dataset has gone unmet because those who have this data (search engines) have a responsibility to their users to maintain their privacy, and they cannot share the data publicly in a way that upholds the trust users place in them. Given these two competing forces, we believe we have a dataset and paradigm that meets both sets of needs: an artificial public dataset that approximates the true data, and the ability to evaluate model performance on real user behavior. In practice, this means we have released a public dataset generated by creating artificial sessions using embedding similarity, and we will test on the original data. To say this again: we are not releasing any private user data, but we are releasing what we believe to be a good representation of true user interactions.
41 | 
42 | ## Corpus Generation
43 | To generate our projection corpus, we took the 1,010,916 MSMARCO queries and generated a query vector for each unique query. Once we had these embedding spaces, we built an Approximate Nearest Neighbor (ANN) index using [ANNOY](https://github.com/spotify/annoy).
44 | Next, we sampled our Bing usage logs from 2018-06-01 to 2018-11-30 to find sessions that had more than 1 query, shared a query whose embedding was similar to that of a MSMARCO query, and were likely to be conversational in nature. We then removed all navigation, bot, junk, and adult sessions. Once we did this, we had 45,040,730 unique user sessions of 344,147 unique queries. The average session was 2.6 queries long and the longest session was 160 queries. Just like we did for our public queries, we generated an embedding for each unique query. Finally, in order to merge the two, for each query in each unique session we performed a nearest neighbor search with the real query's vector in the MSMARCO ANN index. This allows us to join the public queries to the private sessions, generating artificial user sessions grounded in true user behavior.
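For illustration, the sketch below shows this index-and-lookup step in miniature, in the spirit of generateArtificialSessions.py in this repository. The random vectors, the 100-dimensional size, and the item count are placeholders, and ANNOY's angular distance is used as a stand-in for cosine similarity.
```
import numpy as np
from annoy import AnnoyIndex

DIM = 100        # dimensionality of the query embeddings (the BERT vectors are 1024-d)
NUM_TREES = 1000 # more trees improve recall at the cost of a larger index

# Placeholder vectors; in practice these are read from the query embedding file.
public_query_vectors = np.random.rand(1000, DIM).astype('float32')

index = AnnoyIndex(DIM, 'angular')  # angular distance approximates cosine similarity
for item_id, vector in enumerate(public_query_vectors):
    index.add_item(item_id, vector)
index.build(NUM_TREES)
index.save('queryEmbeddings.ann')

# Map one real (private) query vector onto its 15 nearest public MSMARCO queries.
real_query_vector = np.random.rand(DIM).astype('float32')
nearest_public_ids = index.get_nns_by_vector(real_query_vector, 15)
```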
45 | 
46 | An example of these search sessions is below.
47 | ```
48 | marco-gen-dev-40 what is the australian flag what is the population of australia what hemisphere is north australia how big is sydney australia is australia a country
49 | marco-gen-dev-152 is elements on a periodic table what is the product of ch4 cost of solar system what constellation is cassiopeia in what is the human skeleton what is a isosceles triangle convert a fraction to a % calculator standard deviation difference calculator graph y = 1/2 x cubed how to what is the volume of a pyramid what is american sign language definition how to put word count on word
50 | marco-gen-dev-157 define colonialism what are migrant workers office 365 cost to add a user what does per capita means what are tariffs definition for tariffs define urbanization
51 | marco-gen-dev-218 icd 10 code rhinitis icd diagnosis code for cva icd code for psoriatic arthritis icd 10 code for facet arthritis of knee icd 10 code for copd icd codes for cad icd diagnosis code for cva icd 10 code for personal history pvd
52 | marco-gen-dev-385 circadian activity rhythms definition what is the synonym of insomnia insomnia definition define narcolepsy causes for night terrors types of meditation definition: meditation pros and cons for death penalty
53 | marco-gen-dev-397 cost of solar system what is the hottest planet? what is coldest planet what is solar system is what kind of galaxy is the milky way spanish iris is what in english
54 | marco-gen-dev-457 can we repeal donald trump what was barack obama is john mccain a republican who is chuck schumer's daughter was bill clinton a democrat is mike pence a veteran why did barack obama get a nobel peace prize was mahatma gandhi awarded a peace prize why did dalai lama win the nobel peace prize
55 | marco-gen-dev-485 definition of technology define scientific method definition of constant graphs definition definition of information technology define axis definition of trial legal definition of bias define what an inference is
56 | marco-gen-dev-496 illusory definition illusion vs allusion definition declaration + definition define: caste define rhetoric define race
57 | marco-gen-dev-572 stock price tesla fb stock price home depot stock price t amazon stock price nxp semiconductors stock price
58 | ```
59 | 
60 | We first released [BERT Based Sessions](https://msmarco.blob.core.windows.net/conversationalsearch/artificialSessionsBERT500k.tsv.gz) and [Query Embedding Based Sessions](https://msmarco.blob.core.windows.net/conversationalsearch/artificialSessionsQueryEncoding500kSample.tsv.gz), which we shared with a small group of researchers to get feedback on what worked better. Based on community feedback and data exploration, we are releasing our first full-scale dataset, described below.
61 | 
62 | 1. Split the 45,040,730 artificial sessions into train, dev, and test sets. To do so, any session that included a query from the QnA eval set was assigned to eval, and the remaining sessions were split 90%/10% between train and dev. These files are called full_marco_sessions_ann_split.* and have the following sizes:
63 | ```
64 | spacemanidol@spacemanidol:/mnt/c/Users/dacamp/Desktop$ wc -l full_marco_sessions_ann_split.*
65 | 3656706 full_marco_sessions_ann_split.dev.tsv
66 | 8480280 full_marco_sessions_ann_split.test.tsv
67 | 32903744 full_marco_sessions_ann_split.train.tsv
68 | 45040730 total
69 | ```
70 | 2. Use the query vectors to generate a similarity score for each edge (each adjacent query pair) and label every edge with one of four types (a sketch of steps 2-5 in code follows this list):
71 | * Topic Change: (cosine similarity) <= 0.4
72 | * Explore: (cosine similarity) in (0.4, 0.7]
73 | * Specify: (cosine similarity) in (0.7, 0.85]
74 | * Paraphrase: (cosine similarity) in (0.85, 1]
75 | 3. Filter by session coherence:
76 | * treat "topic change" edges as unrelated (i.e., no edge),
77 | * build the graph for each session,
78 | * pick the biggest sub-graph in the session,
79 | * throw away the remaining queries not in the biggest sub-graph.
80 | 4. Filter by session length, removing anything with fewer than 4 queries.
81 | 5. Filter out any session where the chain of queries is just paraphrases.
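To make steps 2-5 concrete, here is a sketch of how they could be applied to a single session. It assumes edges connect adjacent queries only, so the biggest sub-graph reduces to the longest run of queries unbroken by a topic change; the exact graph construction behind the released files may differ.
```
import numpy as np

def edge_type(u, v):
    """Label the edge between two adjacent query vectors by cosine similarity."""
    sim = float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))
    if sim <= 0.4:
        return 'topic_change'  # treated as no edge
    if sim <= 0.7:
        return 'explore'
    if sim <= 0.85:
        return 'specify'
    return 'paraphrase'

def filter_session(queries, vectors):
    """Apply steps 2-5 to one session; return the kept queries or None."""
    labels = [edge_type(vectors[i], vectors[i + 1]) for i in range(len(vectors) - 1)]
    # Step 3: split at topic changes and keep the biggest connected run.
    runs, current = [], [0]
    for i, label in enumerate(labels):
        if label == 'topic_change':
            runs.append(current)
            current = [i + 1]
        else:
            current.append(i + 1)
    runs.append(current)
    best = max(runs, key=len)
    # Step 4: drop sessions with fewer than 4 queries.
    if len(best) < 4:
        return None
    # Step 5: drop sessions whose remaining edges are all paraphrases.
    if all(labels[i] == 'paraphrase' for i in best[:-1]):
        return None
    return [queries[i] for i in best]
```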
".half_explore.jsonl": half (explore): >= 50% adjacent edges in (0.4, 0.7] 94 | 3. ".half_specify.jsonl": half (specifiy): >= 50% adjacent edges in (0.7, 0.85] 95 | 96 | Explicit size numbers below 97 | ``` 98 | spacemanidol@spacemanidol:/mnt/c/Users/dacamp/Desktop$ wc -l marco_ann_session.*.half* 99 | 53791 marco_ann_session.dev.half_explore.tsv 100 | 5646 marco_ann_session.dev.half_specify.tsv 101 | 63854 marco_ann_session.dev.half_trans.tsv 102 | 548101 marco_ann_session.test.half_explore.tsv 103 | 42032 marco_ann_session.test.half_specify.tsv 104 | 648868 marco_ann_session.test.half_trans.tsv 105 | 482021 marco_ann_session.train.half_explore.tsv 106 | 50980 marco_ann_session.train.half_specify.tsv 107 | 572791 marco_ann_session.train.half_trans.tsv 108 | 2468084 total 109 | ``` 110 | 111 | 112 | ## Corpus task 113 | We are currently assembling baselines and building an evaluation framework but the initial task will be as follows: Given a chain of queries q1, q2,...,q(n-1) predict q(n). Ideally systems will train and test on the public artificial and will be evaluated on both the private real data and public artificial eval data. 114 | 115 | ### Initial 500k Sample Process 116 | 1. Downloaded the data 117 | ~~~ 118 | ./downloadData.sh 119 | ~~~ 120 | 2. Generate the querySets 121 | ~~~ 122 | python generateQuerySets.py 123 | ~~~ 124 | 3. Get Query Embeddings 125 | ~~~ 126 | python generateQueryEmbeddings.py allQueries.tsv queryEmbeddings.tsv 127 | ~~~ 128 | 4. Generate BERT Query Embeddings 129 | To generate our Query Embeddings [we used BERT As A Service](https://github.com/hanxiao/bert-as-service) to generate a unique query embedding for each query in our set. If you want to go ahead and regenerate embeddings(or use it to generate another alternate query source for you model) you can follow what we did below. 130 | ~~~ 131 | cd Data/BERT 132 | pip install bert-serving-server # server 133 | pip install bert-serving-client # client, independent of `bert-serving-server` 134 | wget https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip 135 | unzip cased_L-24_H-1024_A-16 136 | bert-serving-start -model_dir ~/Data/BERT/cased_L-24_H-1024_A-16 -num_worker=4 -device_map 0 #depending on your computer play around with these settings 137 | ~~~ 138 | In a separate shell 139 | ~~~ 140 | python3 generateQueryEmbeddingsBERT.py allQueries.tsv BERTQueryEmbeddings.tsv 141 | ~~~ 142 | 5. Generate Sessions 143 | ~~~ 144 | python generateArtificialSessions.py realQueries queryEmbeddings.tsv BERTQueryEmbeddings.tsv queryEmbeddings.ann sessions.tsv 145 | ~~~ 146 | 147 | 148 | # Contributing 149 | 150 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 151 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 152 | the rights to use your contribution. For details, visit https://cla.microsoft.com. 153 | 154 | When you submit a pull request, a CLA-bot will automatically determine whether you need to provide 155 | a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions 156 | provided by the bot. You will only need to do this once across all repos using our CLA. 157 | 158 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
114 | 
115 | ### Initial 500k Sample Process
116 | 1. Download the data
117 | ~~~
118 | ./downloadData.sh
119 | ~~~
120 | 2. Generate the query sets (generateQuerysets.py expects the three MSMARCO query files, the Quora duplicate-questions file, the NQ directory, the unused-MSMARCO file, and the raw sessions file)
121 | ~~~
122 | python generateQuerysets.py <msmarcoTrainQueries> <msmarcoDevQueries> <msmarcoEvalQueries> <quoraFile> <nqDir> <unusedMSMARCOFile> <sessionsFile>
123 | ~~~
124 | 3. Get Query Embeddings (the embedding API endpoint is passed as the first argument)
125 | ~~~
126 | python generateQueryEmbeddings.py <APIEndpoint> allQueries.tsv queryEmbeddings.tsv
127 | ~~~
128 | 4. Generate BERT Query Embeddings
129 | To generate our query embeddings, [we used BERT As A Service](https://github.com/hanxiao/bert-as-service) to generate a unique query embedding for each query in our set. If you want to regenerate the embeddings (or use this setup to generate an alternate query source for your model), you can follow what we did below.
130 | ~~~
131 | cd Data/BERT
132 | pip install bert-serving-server # server
133 | pip install bert-serving-client # client, independent of `bert-serving-server`
134 | wget https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip
135 | unzip cased_L-24_H-1024_A-16.zip
136 | bert-serving-start -model_dir ~/Data/BERT/cased_L-24_H-1024_A-16 -num_worker=4 -device_map 0 # depending on your machine, adjust these settings
137 | ~~~
138 | In a separate shell:
139 | ~~~
140 | python3 generateQueryEmbeddingsBERT.py allQueries.tsv BERTQueryEmbeddings.tsv
141 | ~~~
142 | 5. Generate Sessions (the artificial query file produced by generateQuerysets.py is passed second)
143 | ~~~
144 | python generateArtificialSessions.py realQueries.tsv allQueries.tsv queryEmbeddings.tsv BERTQueryEmbeddings.tsv queryEmbeddings.ann sessions.tsv
145 | ~~~
146 | 
147 | 
148 | # Contributing
149 | 
150 | This project welcomes contributions and suggestions. Most contributions require you to agree to a
151 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
152 | the rights to use your contribution. For details, visit https://cla.microsoft.com.
153 | 
154 | When you submit a pull request, a CLA-bot will automatically determine whether you need to provide
155 | a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions
156 | provided by the bot. You will only need to do this once across all repos using our CLA.
157 | 
158 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
159 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
160 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
161 | 
-------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 
1 | 
2 | 
3 | ## Security
4 | 
5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
6 | 
7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below.
8 | 
9 | ## Reporting Security Issues
10 | 
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 | 
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report).
14 | 
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey).
16 | 
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc).
18 | 
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 | 
21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 | * Full paths of source file(s) related to the manifestation of the issue
23 | * The location of the affected source code (tag/branch/commit or direct URL)
24 | * Any special configuration required to reproduce the issue
25 | * Step-by-step instructions to reproduce the issue
26 | * Proof-of-concept or exploit code (if possible)
27 | * Impact of the issue, including how an attacker might exploit the issue
28 | 
29 | This information will help us triage your report more quickly.
30 | 
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs.
32 | 
33 | ## Preferred Languages
34 | 
35 | We prefer all communications to be in English.
36 | 
37 | ## Policy
38 | 
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd).
40 | 
41 | 
42 | 
-------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 
1 | theme: jekyll-theme-slate
-------------------------------------------------------------------------------- /downloadData.sh: -------------------------------------------------------------------------------- 
1 | export CLOUD_SDK_REPO="cloud-sdk-$(lsb_release -c -s)"
2 | echo "deb http://packages.cloud.google.com/apt $CLOUD_SDK_REPO main" | sudo tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
3 | curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -
4 | sudo apt-get update && sudo apt-get install google-cloud-sdk
5 | sudo apt-get install google-cloud-sdk-app-engine-java
6 | gcloud init
7 | mkdir data
8 | gsutil -m cp -R gs://natural_questions . # pull the Natural Questions corpus from its public GCS bucket
9 | mkdir nq
10 | mv natural_questions/v1.0/dev/* nq/
11 | mv natural_questions/v1.0/train/* nq/
12 | rm -rf natural_questions
13 | wget http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv
14 | wget https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz
15 | tar -xzvf collectionandqueries.tar.gz
16 | rm collectionandqueries.tar.gz collection.tsv qrels.* # keep only the query files
17 | cd nq
18 | gunzip *
19 | 
-------------------------------------------------------------------------------- /generateArtificialSessions.py: -------------------------------------------------------------------------------- 
1 | import json
2 | import math
3 | import numpy as np
4 | import sys
5 | from annoy import AnnoyIndex
6 | 
7 | TREESIZE = 1000
8 | 
9 | 
10 | def loadQueries(filename):
11 |     queries = set()
12 |     with open(filename,'r') as f:
13 |         for l in f:
14 |             queries.add(l.strip())
15 |     return queries
16 | 
17 | def loadSessions(filename):
18 |     sessions = []
19 |     with open(filename,'r') as f:
20 |         for l in f:
21 |             sessions.append(l.strip().split('\t'))
22 |     return sessions
23 | 
24 | 
25 | def loadVectors(filename, realQueries, artificialQueries):
26 |     i = 0
27 |     j = 0
28 |     artificial = [{},{},[]] #Query2Idx, idx2Query, idx2Vector
29 |     real = [{},{},[]] #Query2Idx, idx2Query, idx2Vector
30 |     with open(filename,'r') as f:
31 |         for l in f:
32 |             l = l.strip().split('\t')
33 |             query = l[0]
34 |             vectors = l[1].split(' ')
35 |             if query in artificialQueries:
36 |                 artificial[0][query] = i
37 |                 artificial[1][i] = query
38 |                 artificial[2].append(np.array(vectors,dtype=float))
39 |                 i += 1
40 |             if query in realQueries:
41 |                 real[0][query] = j
42 |                 real[1][j] = query
43 |                 real[2].append(np.array(vectors,dtype=float))
44 |                 j += 1
45 |     return real, artificial
46 | 
47 | def generateAnnoy(real, artificial, annoyFilename, dimensions):
48 |     idx2vec = np.array(artificial[2])
49 |     t = AnnoyIndex(dimensions, 'angular') # angular distance approximates cosine similarity
50 |     for j in range(len(artificial[2])):
51 |         t.add_item(j,idx2vec[j])
52 |     print('Done Adding items to AnnoyIndex')
53 |     t.build(TREESIZE)
54 |     print('Done Building AnnoyIndex')
55 |     t.save(annoyFilename)
56 |     return t
57 | 
58 | def generateArtificialSessions(realQueryVectors, artificialQueryVectors, sessions, annoyEmbedding, filename):
59 |     with open(filename,'w') as w:
60 |         for session in sessions:
61 |             queriesUsed = set()
62 |             output = ''
63 |             properArtificialSetGenerated = True
64 |             for query in session:
65 |                 if query not in realQueryVectors[0]: # vector lookup failed, so this session cannot be generated
66 |                     properArtificialSetGenerated = False; break
67 |                 artificialQueries = annoyEmbedding.get_nns_by_vector(realQueryVectors[2][realQueryVectors[0][query]], 15, search_k=-1, include_distances=False)
68 |                 replacementFound = False
69 |                 for i in range(len(artificialQueries)):
70 |                     artificialQuery = artificialQueryVectors[1][artificialQueries[i]]
71 |                     if artificialQuery not in queriesUsed: # ensure the session isn't just repeating queries
72 |                         queriesUsed.add(artificialQuery)
73 |                         replacementFound = True
74 |                         output += '{}\t'.format(artificialQuery)
75 |                         break
76 |                 if not replacementFound:
77 |                     properArtificialSetGenerated = False
78 |                     break
79 |             if properArtificialSetGenerated:
80 |                 w.write("{}\n".format(output[:-1]))
81 | if __name__ == "__main__":
82 |     if len(sys.argv) != 7:
83 |         print("Usage: generateArtificialSessions.py <realQueriesFile> <artificialQueriesFile> <queryVectorsFile> <BERTQueryVectorsFile> <annoyIndexFile> <sessionsFile>")
84 |         exit(-1)
85 |     else:
86 |         print("Loading Queries and Sessions")
87 |         realQueries = loadQueries(sys.argv[1])
88 |         artificialQueries = loadQueries(sys.argv[2])
89 |         print("Loading Sessions")
90 |         sessions = loadSessions(sys.argv[6])
91 |         #Run regular embeddings
92 |         print("Loading Query Vectors")
93 |         real, artificial = loadVectors(sys.argv[3], realQueries, artificialQueries)
94 |         print("Building Annoy Index for Query Embeddings")
95 |         annoyEmbedding = generateAnnoy(real, artificial, sys.argv[5], 100)
96 |         print("Generating Sessions from Query Embeddings")
97 |         generateArtificialSessions(real,artificial, sessions, annoyEmbedding, 'sessionsEmbedding.tsv')
98 |         #Run on BERT embeddings
99 |         print("Loading BERT Vectors")
100 |         real, artificial = loadVectors(sys.argv[4], realQueries, artificialQueries)
101 |         print("Building Annoy Index for BERT Embeddings")
102 |         annoyEmbedding = generateAnnoy(real, artificial, 'BERT' + sys.argv[5], 1024)
103 |         print("Generating Sessions from BERT Embeddings")
104 |         generateArtificialSessions(real, artificial, sessions, annoyEmbedding, 'sessionsEmbeddingBERT.tsv')
105 | 
-------------------------------------------------------------------------------- /generateQueryEmbeddings.py: -------------------------------------------------------------------------------- 
1 | import sys
2 | import json
3 | import requests
4 | def loadQueries(filename):
5 |     queries = []
6 |     with open(filename,'r') as f:
7 |         for l in f:
8 |             queries.append(l.strip())
9 |     return queries
10 | 
11 | def generateOutput(responses):
12 |     output = ''
13 |     for response in responses:
14 |         output += '{}\t'.format(response['query'])
15 |         for v in response['vector'][:-1]:
16 |             output += '{} '.format(v)
17 |         output += '{}\n'.format(response['vector'][-1])
18 |     return output
19 | 
20 | def getVectors(queries, key, filename):
21 |     chunks = []
22 |     # Batch the queries into chunks of 10 per API request.
23 |     for i in range(0, len(queries), 10):
24 |         chunks.append(queries[i:i+10])
25 |     with open(filename,'w') as w:
26 |         for i, chunk in enumerate(chunks):
27 |             if i % 100 == 0:
28 |                 print('{} vectors retrieved'.format(i*10))
29 |             # Retry up to three times; the vector API occasionally fails.
30 |             try:
31 |                 w.write(generateOutput(requests.get(url=key + str(chunk) + "}").json()))
32 |             except:
33 |                 try:
34 |                     w.write(generateOutput(requests.get(url=key + str(chunk) + "}").json()))
35 |                 except:
36 |                     try:
37 |                         w.write(generateOutput(requests.get(url=key + str(chunk) + "}").json()))
38 |                     except:
39 |                         #Its dirty but sometimes the API fails and this is the easiest fix
40 |                         continue
41 | if __name__ == "__main__":
42 |     if len(sys.argv) != 4:
43 |         print('Usage: generateQueryEmbeddings.py <APIEndpoint> <queryFile> <outputVectorsFile>')
44 |         exit(-1)
45 |     else:
46 |         queries = loadQueries(sys.argv[2])
47 |         getVectors(queries, sys.argv[1], sys.argv[3])
48 | 
-------------------------------------------------------------------------------- /generateQueryEmbeddingsBERT.py: -------------------------------------------------------------------------------- 
1 | import sys
2 | import json
3 | import time
4 | from bert_serving.client import BertClient
5 | def loadQueries(filename):
6 |     queries = []
7 |     with open(filename,'r') as f:
8 |         for l in f:
9 |             queries.append(l.strip())
10 |     return queries
11 | def process(queryPack, response):
12 |     output = ''
13 |     for i in range(len(queryPack)):
14 |         output += "{}\t".format(queryPack[i])
15 |         for j in range(len(response[i])):
16 |             output += "{} ".format(response[i][j])
17 |         output += "\n"
18 |     return output
19 | def getVectors(queries, filename):
20 |     i = 1
21 |     bc = BertClient()
22 |     query = 'who founded microsoft'
23 |     print("Testing bc\nTesting Query:{}\nVector:{}".format(query, bc.encode([query])[0]))
24 |     with open(filename,'w') as w:
25 |         for j in range(0,len(queries), 100):
26 |             if i % 100 == 0:
27 |                 print('{} vectors retrieved'.format(i*100))
28 |             queryPack = queries[j:j+100]
29 |             response = bc.encode(queryPack)
30 |             w.write(process(queryPack, response))
31 |             i += 1
32 | if __name__ == "__main__":
33 |     if len(sys.argv) != 3:
34 |         print('Usage: generateQueryEmbeddingsBERT.py <queryFile> <outputVectorsFile>')
35 |         exit(-1)
36 |     else:
37 |         queries = loadQueries(sys.argv[1])
38 |         print("{} queries loaded".format(len(queries)))
39 |         getVectors(queries, sys.argv[2])
40 | 
-------------------------------------------------------------------------------- /generateQuerysets.py: -------------------------------------------------------------------------------- 
1 | import json
2 | import sys
3 | import os
4 | def loadNQ(dirPath):
5 |     files = os.listdir(dirPath)
6 |     queries = set()
7 |     for file in files:
8 |         with open(os.path.join(dirPath, file),'r') as f:
9 |             for l in f:
10 |                 j = json.loads(l)
11 |                 query_text = j['question_text']
12 |                 queries.add(query_text)
13 |     return queries
14 | def loadQuora(filename):
15 |     #id qid1 qid2 question1 question2 is_duplicate
16 |     #0 1 2 What is the step by step guide to invest in share market in india? What is the step by step guide to invest in share market? 0
17 |     queries = set()
18 |     with open(filename) as f:
19 |         for l in f:
20 |             l = l.strip().split('\t')
21 |             if len(l) > 4:
22 |                 queries.add(l[3])
23 |                 queries.add(l[4])
24 |     return queries
25 | def loadMSMARCO(files):
26 |     #1048578 cost of endless pools/swim spa
27 |     queries = set()
28 |     for file in files:
29 |         with open(file,'r') as f:
30 |             for l in f:
31 |                 queries.add(l.strip().split('\t')[1])
32 |     return queries
33 | def loadUnusedMSMARCO(filename):
34 |     #AnswerIsInPassage\tJudgeID\tHitGroupDataInt\tHitDataInt\tHitState\tJudgmentState\tJudgmentDataInt\tJudgmentDataIntName\tJudgmentSubmitTime\tJudgmentTypeID\tTimeSpentOnJudgment\tHitGroupID\tHitID\tQueryId\tQueryText\tPassagesJson\tJudgmentID\tjudgment\tanswerTextArea\tnopassageSubmit\texplainNoAnswer\texplainCantJudge\tSL_locer\tSL_lng_from\tSL_lng_to\tJudgmentType\tGoldHitType\tGoldHitComments\t@RealTimeAuditComment\tConsensus
35 |     queries = set()
36 |     with open(filename, 'r') as f:
37 |         for l in f:
38 |             l = l.split('\t')
39 |             queries.add(l[14]) # QueryText is the 15th column
40 |     return queries
41 | def loadSessions(filename):
42 |     #sessionid\tquery
43 |     sessions, queries = {}, set()
44 |     with open(filename,'r') as f:
45 |         for l in f:
46 |             l = l.strip().split('\t')
47 |             sessionID = l[0]
48 |             query = l[1]
49 |             queries.add(query)
50 |             if sessionID not in sessions:
51 |                 sessions[sessionID] = []
52 |             sessions[sessionID].append(query)
53 |     return sessions, queries
54 | def writeData(msmarcoQueries, quoraQueries, nqQueries, unusedMSMARCOQueries, realQueries, sessions):
55 |     allQueries = realQueries.union(unusedMSMARCOQueries.union(msmarcoQueries.union(quoraQueries.union(nqQueries))))
56 |     with open('msmarcoQueries.tsv','w') as w:
57 |         for query in msmarcoQueries:
58 |             w.write('{}\n'.format(query))
59 |     with open('quoraQueries.tsv','w') as w:
60 |         for query in quoraQueries:
61 |             w.write('{}\n'.format(query))
62 |     with open('nqQueries.tsv','w') as w:
63 |         for query in nqQueries:
64 |             w.write('{}\n'.format(query))
65 |     with open('unusedMSMARCOQueries.tsv','w') as w:
66 |         for query in unusedMSMARCOQueries:
67 |             w.write('{}\n'.format(query))
68 |     with open('realQueries.tsv','w') as w:
69 |         for query in realQueries:
70 |             w.write('{}\n'.format(query))
71 |     with open('allQueries.tsv','w') as w:
72 |         for query in allQueries:
73 |             w.write('{}\n'.format(query))
74 |     with open('cleanedSessions.tsv','w') as w:
75 |         for session in sessions:
76 |             for query in sessions[session][:-1]:
77 |                 w.write('{}\t'.format(query))
78 |             w.write('{}\n'.format(sessions[session][-1]))
79 | if __name__ == "__main__":
80 |     if len(sys.argv) != 8:
81 |         print("Usage: generateQuerysets.py <msmarcoTrainQueries> <msmarcoDevQueries> <msmarcoEvalQueries> <quoraFile> <nqDir> <unusedMSMARCOFile> <sessionsFile>")
82 |         exit(-1)
83 |     else:
84 |         msmarcoQueries = loadMSMARCO([sys.argv[1],sys.argv[2],sys.argv[3]])
85 |         print("Done reading MSMARCO")
86 |         quoraQueries = loadQuora(sys.argv[4])
87 |         print("Done reading quora")
88 |         #nqQueries = loadNQ(sys.argv[5])
89 |         nqQueries = set() # NQ TOS do not allow us to use the dataset for this type of research
90 |         print("Done reading NQ")
91 |         #unusedMSMARCO = loadUnusedMSMARCO(sys.argv[6])
92 |         unusedMSMARCO = set() # must be a set so the union calls in writeData work
93 |         print("Done reading Unused")
94 |         sessions, realQueries = loadSessions(sys.argv[7])
95 |         print("Done reading Sessions")
96 |         writeData(msmarcoQueries, quoraQueries, nqQueries, unusedMSMARCO, realQueries, sessions)
97 | 
-------------------------------------------------------------------------------- /querySetStats.txt: -------------------------------------------------------------------------------- 
1 | Done reading MSMARCO
2 | Done reading quora
3 | Done reading NQ
4 | There are 1010916
unique MSMARCO Queries, 537359 unique Quora Duplicate Queries, 315203 unique NQ Queries for a total of 1862849 unique queries. 5 | 6 | --------------------------------------------------------------------------------