├── README.md ├── index.html ├── references.html └── tfthing_files ├── backbone.min.js ├── cifar.jpg ├── citation_thumb.png ├── d3.v3.min.js ├── fonts.css ├── fonts └── AvenirNext │ ├── .DS_Store │ ├── DemiBold │ ├── avenirnextdemibold.ttf │ └── avenirnextdemibold.woff │ ├── Italic │ ├── avenirnextregularitalic.ttf │ └── avenirnextregularitalic.woff │ ├── Medium │ ├── avenirnextmedium.ttf │ └── avenirnextmedium.woff │ └── Regular │ ├── avenirnextregular.ttf │ └── avenirnextregular.woff ├── main.css ├── mcgill_logo_big.png ├── mcgill_seal.png ├── nserc.png ├── player.css ├── player.js ├── sait.png ├── splash_image.png ├── tweets.js ├── twitter_core.css ├── udem.png ├── underscore.min.js └── zepto.min.js /README.md: -------------------------------------------------------------------------------- 1 | # DialogDatasets 2 | A repository linking to publicly available dialog datasets. Feel free to send issues or pull requests. 3 | 4 | Please see the gh-pages website to view tables of all our gathered corpora: https://breakend.github.io/DialogDatasets 5 | 6 | This accompanies our paper: https://arxiv.org/abs/1512.05742 7 | 8 | If you find it helpful, please cite: 9 | 10 | 11 |
12 | @ARTICLE{2015serbansurvey,
13 |    author = {{Vlad Serban}, I. and {Lowe}, R. and {Henderson}, P. and {Charlin}, L. and 
14 | 	{Pineau}, J.},
15 |     title = "{A Survey of Available Corpora for Building Data-Driven Dialogue Systems}",
16 |   journal = {ArXiv e-prints},
17 | archivePrefix = "arXiv",
18 |    eprint = {1512.05742},
19 |  primaryClass = "cs.CL",
20 |  keywords = {Computer Science - Computation and Language, Computer Science - Artificial Intelligence, Computer Science - Human-Computer Interaction, Computer Science - Learning, Statistics - Machine Learning, 68T01, 68T05, 68T35, 68T50, I.2.6, I.2.7, I.2.1},
21 |      year = 2015,
22 |     month = dec
23 | }
24 | 
25 | -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | A Survey of Available Corpora for Building Data-Driven Dialogue Systems 8 | 9 | 10 | 11 | 12 | 13 | 19 | 20 | 21 | 22 |
23 | Associated Links 24 | 40 |
41 | 42 | 43 | 44 | 45 | 46 |
47 |
48 |
49 |
50 |
51 | McGill 52 | & 53 | UdeM 54 |
55 |
Papers
56 |
57 |
58 |
59 |
60 |
61 | A Survey of Available Corpora for Building Data-Driven Dialogue Systems 62 |
63 | 72 |
73 | Call for contributions! We're always looking for more datasets. Feel free to send us a pull request! 74 |
75 |
76 | 77 |
78 |
A basic outline of a dialog system.
79 |
80 |
81 |
Abstract
82 |
83 | During the past decade, several areas of speech and language understanding have witnessed substantial breakthroughs from the use of data-driven models. In the area of dialogue systems, the trend is less obvious, and most practical systems are still built through significant engineering and expert knowledge. Nevertheless, several recent results suggest that data-driven approaches are feasible and quite promising. To facilitate research in this area, we have carried out a wide survey of publicly available datasets suitable for data-driven learning of dialogue systems. We discuss important characteristics of these datasets, how they can be used to learn diverse dialogue strategies, and their other potential uses. We also examine methods for transfer learning between datasets and the use of external knowledge. Finally, we discuss appropriate choice of evaluation metrics for the learning objective. 84 |
85 |
86 |
87 |
Materials
88 | 96 |
97 |
98 |
Acknowledgements
99 | The authors gratefully acknowledge financial support by the Samsung Advanced Institute of Technology (SAIT), the Natural Sciences and Engineering Research Council of Canada (NSERC), the Canada Research Chairs, the Canadian Institute for Advanced Research (CIFAR) and Compute Canada. Early versions of the manuscript benefited greatly from the proofreading of Melanie Lyman-Abramovitch, and later versions were extensively revised by Genevieve Fried and Nicolas Angelard-Gontier. The authors also thank Nissan Pow, Michael Noseworthy, Chia-Wei Liu, Gabriel Forgues, Alessandro Sordoni, Yoshua Bengio and Aaron Courville for helpful discussions. 100 |
101 |
102 |
Citation
103 |
104 |
105 | 106 |
107 |
108 | 111 | 118 |
ArXiv, 2017
119 |
120 |
121 |
122 |
123 |
Dataset Statistics
124 |
125 | 126 |
Human-Machine Dialogue Datasets
127 | 128 |
129 | 130 | 131 |
132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 |
NameTypeTopicsAvg. # of turnsTotal # of dialoguesTotal # of wordsDescriptionLinks
146 | Let's Go!
[Raux et al., 2005] 147 |
SpokenBus schedules--171,128--Bus ride information systemInfo and download
159 | DSTC1
[Williams et al., 2013] 160 |
SpokenBus schedules13.5615,0003.7MBus ride information systemInfo and download
171 | DSTC2
[Henderson et al., 2014b] 172 |
SpokenRestaurants7.883,000432KRestaurant booking systemInfo and Download
183 | DSTC3
[Henderson et al., 2014a] 184 |
SpokenTourist information8.272,265403KInformation for touristsInfo and Download
CMU Communicator Corpus
[Bennett and Rudnicky, 2002]
SpokenTravel11.6715,4812M*Travel planning and booking systemInfo and Download
ATIS Pilot Corpus
[Hemphill et al., 1990]
SpokenTravel25.44111.4K*Travel planning and booking systemInfo
Download
Ritel Corpus
[Rosset and Petel, 2006]
SpokenUnrestricted/ Diverse Topics9.3*58260kAn annotated open-domain question answering spoken dialogue systemInfo
Contact corpus authors for download
DIALOG Mathematical Proofs [Wolska et al., 2004]SpokenMathematics12668.7K*Humans interact with a computer system to do mathematical theorem provingInfo
Contact corpus authors for download
MATCH Corpus
[Georgila et al., 2010]
SpokenAppointment Scheduling14.044769K*A system for scheduling appointments.Info and download
Maluuba Frames
[El Asri et al., 2017]
Chat, QA & RecommendationTravel & Vacation Booking151369-For goal-driven dialogue systems. Semantic frames labeled and actions taken on a knowledge-base annotated.Info and Download
Key-Value Retrieval dataset
[Eric and Manning, 2017]
Chat, QACalendar, Weather, POI navigation 5.253031-For Task-oriented dialogue systems. Intent, slots and KB annotated for each session.Info and Download
MultiWOZ
[Budzianowski et al., 2018]
Chat, QA, RecommendationsTravel1410438-For goal-driven dialogue systems. Fully labelled on both user and system sides.Info and Download
278 |
279 | Table 1: Human-machine dialogue datasets. Starred (*) numbers are approximated based on the average number of words per utterance. 280 |
281 |
282 |
283 | 284 |
285 |
286 |
Human-Human Constrained Dialogue Datasets
287 |
288 |
289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | 422 | 423 | 424 | 425 |
NameTopicsTotal # of dialoguesTotal # of wordsTotal lengthDescriptionLinks
HCRC Map Task Corpus [Anderson et al., 1991]Map-Reproducing Task128147k18hrsDialogues from HLAP Task in which speakers must collaborate verbally to reproduce on one participant’s map a route printed on the other’s.Info and Download
The Walking Around Corpus [Brennan et al., 2013]Location Finding Task36300k*33hrsPeople collaborating over telephone to find certain locations.Info and Download
Green Persuasive Database [Douglas-Cowie et al., 2007]Lifestyle835k*4hrsA persuader with (genuinely) strong pro-green feelings tries to convince persuadees to consider adopting more ‘green’ lifestyles.Info
Download
Intelligence Squared Debates [Zhang et al., 2016]Debates1081.8M200hrs*Various topics in Oxford-style debates, each constrained to one subject. Audience opinions provided pre- and post-debates.Info and Download
The Corpus of Professional Spoken American English [Barlow, 2000]Politics, Education2002M220hrs*Interactions from faculty meetings and White House press conferences.Info and Download
(Download may require purchase.)
MAHNOB Mimicry Database [Sun et al., 2011]Politics, Games54100k*11hrsTwo experiments: a discussion on a political topic, and a role-playing game.Info and Download
The IDIAP Wolf Corpus [Hung and Chittaranjan, 2010]Role-Playing Game1560k*7hrsA recording of the Werewolf role-playing game with annotations related to game progress.Info and Download
SEMAINE corpus [McKeown et al., 2010]Emotional Conversations100450k*50hrsUsers were recorded while holding conversations with an operator who adopts roles designed to evoke emotional reactions.Info and Download
DSTC4/DSTC5 Corpora [Kim et al., 2015, Kim et al., 2016]Tourist35273k21hrsTourist information exchange over Skype.DSTC4

DSTC5

(DSTC4 Training Set with Chinese lang. Test Set)
Loqui Dialogue Corpus [Passonneau and Sachar, 2014]Library Inquiries8221K140*Telephone interactions between librarians and patrons. Annotated dialogue acts, discussion topics, frames (discourse units), question-answer pairs.Info and Download
MRDA Corpus [Shriberg et al., 2004]ICSI Meetings7511K*72hrsRecordings of ICSI meetings. Topics include: ICSI meeting recorder project itself, automatic speech recognition, natural language processing and neural theories of language. Dialogue acts, question-answer pairs, and hot spots.Info and Download
TRAINS 93 Dialogues Corpus [Heeman and Allen, 1995]Railroad Freight Route Planning9855K6.5hrsCollaborative planning of railroad freight routes.Info and Download
Verbmobil Corpus [Burger et al., 2000]Appointment Scheduling726270K38HrsSpontaneous speech data collected for the Verbmobil project. Full corpus is in English, German, and Japanese. We only show English statistics.Info
Download I
Download II
ICT Rapport Datasets [Gratch et al., 2007]Sexual Harassment Awareness165N/AN/AA speaker tells a story to a listener. The listener is asked to not speak during the story telling. Contains audio-visual data, transcriptions, and annotations.Info and Download
426 |
427 | Table 2: Human-human constrained spoken dialogue datasets. Starred (*) numbers are estimates based on the average rate of English speech from the National Center for Voice and Speech. 428 |
429 |
430 |
431 | 432 |
433 |
434 |
Human-Human Spontaneous Dialogue Datasets
435 |
436 |
437 | 438 | 439 | 440 | 441 | 442 | 443 | 444 | 445 | 446 | 447 | 448 | 451 | 452 | 453 | 454 | 455 | 456 | 457 | 458 | 459 | 460 | 461 | 462 | 463 | 464 | 465 | 466 | 467 | 468 | 469 | 470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | 483 | 484 | 485 | 486 | 487 | 488 | 489 | 490 | 491 | 492 | 493 | 494 | 495 | 496 | 497 | 498 | 499 | 500 | 501 | 502 | 503 | 504 | 505 | 506 | 507 | 508 | 509 | 510 | 511 | 512 | 513 | 514 | 515 | 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | 525 | 526 | 527 | 528 | 529 | 530 | 531 | 532 | 533 | 534 | 535 | 536 | 537 | 538 | 539 | 540 | 541 | 542 | 543 | 544 | 545 | 546 | 547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 | 556 | 557 | 558 | 559 | 560 | 561 | 562 | 563 | 564 | 565 | 566 | 567 | 568 | 569 | 570 | 571 | 572 | 573 | 574 | 575 | 576 | 577 | 578 | 579 | 580 | 581 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | 590 | 591 | 592 | 593 | 594 | 595 |
NameTopicsTotal # of dialoguesTotal # of wordsTotal lengthDescriptionLinks
449 | Switchboard [Godfrey et al., 1992] 450 | Casual Topics2,4003M300hrs*Telephone conversations on pre-specified topicsInfo and Download
British National Corpus (BNC) [Leech, 1992]Casual Topics85410M1,000hrs*British dialogues in many contexts, from formal business or government meetings to radio shows and phone-ins.Info and Download
CALLHOME American English Speech [Canavan et al., 1997]Casual Topics120540k*60hrsTelephone conversations between family members or close friends.Info and Download
CALLFRIEND American English Non-Southern Dialect [Canavan and Zipperlen, 1996]Casual Topics60180k*20hrsTelephone conversations between Americans with a Non-Southern accent.Info and Download
The Bergen Corpus of London Teenage Language [Haslerud and Stenström, 1995]Unrestricted100500k55hrsSpontaneous teenage talk recorded in 1993. Conversations were recorded secretly.Info and Download
The Cambridge and Nottingham Corpus of Discourse in English [McCarthy, 1998]Casual Topics-5M550hrs*British dialogues from wide variety of informal contexts, such as hair salons, restaurants, etc.Info and Download
Note: CANCODE is a subset of the Cambridge English Corpus.
D64 Multimodal Conversation Corpus [Oertel et al., 2013]Unrestricted270k*8hrsSeveral hours of natural interaction between a group of peopleContact corpus authors for data.
AMI Meeting Corpus [Renals et al., 2007]Meetings175900k*100hrsFace-to-face meeting recordings.Info and Download
Cardiff Conversation Database (CCDb) [Aubrey et al., 2013]Unrestricted3020k*150minAudio-visual database with unscripted natural conversations, including visual annotations.Info and Download
4D Cardiff Conversation Database (4D CCDb) [Vandeventer et al., 2015]Unrestricted172.5k*17minA version of the CCDb with 3D videoInfo and Download
The Diachronic Corpus of Present-Day Spoken English [Aarts and Wallis, 2006]Casual Topics280800k80hrs*Selection of face-to-face, telephone, and public discussion dialogue from Britain.Info and Download
The Spoken Corpus of the Survey of English Dialects [Beare and Scott, 1999]Casual Topics314800k60hrsDialogue of people aged 60 or above talking about their memories, families, work and the folklore of the countryside from a century ago.Info
Contact corpus authors for download.
The Child Language Data Exchange System [MacWhinney and Snow, 1985]Unrestricted11K10M1,000hrs*International database organized for the study of first and second language acquisition.Info and Download
The Charlotte Narrative and Conversation Collection (CNCC) [Reppen and Ide, 2004]Casual Topics9520K2hrs*Narratives, conversations and interviews representative of the residents of Mecklenburg County, North Carolina.Info and Download
The Group Affect and Performance (GAP) Corpus [Braley and Murray, 2018]Survival2870K4hrs+A winter survival taskInfo and Download
The MULTISIMO Corpus [Koutsombogera and Vogel, 2018]Game1826K3hrs+Family Feud-like gameInfo and Download
596 |
597 | Table 3: Human-human spontaneous spoken dialogue datasets. Starred (*) numbers are estimates based on the average rate of English speech from the National Center for Voice and Speech. 598 |
599 |
600 |
601 | 602 |
603 |
604 |
Human-Human Scripted Dialogue Datasets
605 |
606 |
607 | 608 | 609 | 610 | 611 | 612 | 613 | 614 | 615 | 616 | 617 | 618 | 619 | 620 | 621 | 622 | 623 | 624 | 625 | 626 | 627 | 628 | 629 | 630 | 631 | 632 | 633 | 634 | 635 | 636 | 637 | 638 | 639 | 640 | 641 | 642 | 643 | 644 | 645 | 646 | 647 | 648 | 649 | 650 | 651 | 652 | 653 | 654 | 655 | 656 | 657 | 658 | 659 | 660 | 661 | 662 | 663 | 664 | 665 | 666 | 667 | 668 | 669 | 670 | 671 | 672 | 673 | 674 | 675 | 676 | 677 | 678 | 679 | 680 | 681 | 682 | 683 | 684 | 685 | 686 | 687 | 688 | 689 | 690 | 691 | 692 | 693 | 694 | 695 | 696 | 697 | 698 | 699 | 700 | 701 | 702 | 703 | 704 | 705 | 706 | 707 | 708 | 709 | 710 | 711 | 712 | 713 | 714 | 715 | 716 | 717 | 718 | 719 | 720 | 721 | 722 | 723 | 724 | 725 | 726 | 727 | 728 |
NameTopicsTotal # of utterancesTotal # of dialoguesTotal # of worksTotal # of wordsDescriptionLinks
Movie-DiC [Banchs, 2012]Movie dialogues764k132K7536MMovie scripts of American films.Contact corpus authors for data.
Movie-Triples [Serban et al., 2016]Movie dialogues736k245K61413MTriples of utterances which are filtered to come from X-Y-X triples.Contact corpus authors for data.
Film Scripts Online SeriesMovie scripts1M*263K1,50016M*Two subsets of scripts (1000 American films and 500 mixed British/American films).Info and Download
Cornell Movie-Dialogue Corpus [Danescu-Niculescu-Mizil and Lee, 2011]Movie dialogues305K220K6179M*Short conversations from film scripts, annotated with character metadata.Info and Download
Filtered Movie Script Corpus [Nio et al., 2014]Movie dialogues173k86K1,7862M*Triples of utterances which are filtered to come from X-Y-X triples.Info and Download
American Soap Opera Corpus [Davies, 2012b]TV show scripts10M*1.2M22,000100MTranscripts of American soap operas.Info and Download
TVD Corpus [Roy et al., 2014]TV show scripts60k*10K191600k*TV scripts from a comedy (Big Bang Theory) and drama (Game of Thrones) show.Info and Download
Character Style from Film Corpus [Walker et al., 2012a]Movie scripts664k151K8629.6MScripts from IMSDb, annotated for linguistic structures and character archetypes.Contact corpus authors for data.
SubTle Corpus [Ameixa and Coheur, 2013]Movie subtitles6.7M3.35M6,18420MAligned interaction-response pairs from movie subtitles.Contact corpus authors for data.
OpenSubtitles [Tiedemann, 2012]Movie subtitles140M*36M207,9071BMovie subtitles which are not speaker-aligned.Info and Download
CED (1560-1760) Corpus [Kytö and Walker, 2006]Written Works & Trial Proceedings--1771.2MVarious scripted fictional works from (1560-1760) as well as court trial proceedings.Info and Download
729 |
730 | Table 4: Human-human scripted dialogue datasets. Quantities denoted with () indicate estimates based on average dialogues per movie seen in [Banchs, 2012] and the number of scripts or works. Dialogues may not be explicitly separated in these datasets. TV show datasets were adjusted based on the ratio of average film runtime (112 minutes) to average TV show runtime (36 minutes). This data was scraped from the IMDb database (http://www.imdb.com/interfaces). Starred (*) quantities are estimated based on the average number of words and utterances per film, and the average lengths of films and TV shows. Estimates derived from the Tameri Guide for Writers (http://www.tameri.com/format/wordcounts.html). 731 |
732 |
733 |
734 |
735 |
736 |
Human-Human Written Dialogue Datasets
737 |
738 |
739 | 740 | 741 | 742 | 743 | 744 | 745 | 746 | 747 | 748 | 749 | 750 | 751 | 752 | 753 | 754 | 755 | 756 | 757 | 758 | 759 | 760 | 761 | 762 | 763 | 764 | 765 | 766 | 767 | 768 | 769 | 770 | 771 | 772 | 773 | 774 | 775 | 776 | 777 | 778 | 779 | 780 | 781 | 782 | 783 | 784 | 785 | 786 | 787 | 788 | 789 | 790 | 791 | 792 | 793 | 794 | 795 | 796 | 797 | 798 | 799 | 800 | 801 | 802 | 803 | 804 | 805 | 806 | 807 | 808 | 809 | 810 | 811 | 812 | 813 | 814 | 815 | 816 | 817 | 818 | 819 | 820 | 821 | 822 | 823 | 824 | 825 | 826 | 827 | 828 | 829 | 830 | 831 | 832 | 833 | 834 | 835 | 836 | 837 | 838 | 839 | 840 | 841 | 842 | 843 | 844 | 845 | 846 | 847 | 848 | 849 | 850 | 851 | 852 | 853 | 854 | 855 | 856 | 857 | 858 | 859 | 860 | 861 | 862 | 863 | 864 | 865 | 866 | 867 | 868 | 869 | 870 | 871 | 872 | 873 | 874 | 875 | 876 | 877 | 878 | 879 | 880 | 881 | 882 | 883 | 884 | 885 | 886 | 887 | 888 | 889 | 890 | 891 | 892 | 893 | 894 | 895 | 896 | 897 | 898 | 899 | 900 | 901 | 902 | 903 | 904 | 905 | 906 | 907 | 908 | 909 | 910 | 911 | 912 | 913 | 914 | 915 | 916 | 917 | 918 | 919 | 920 | 921 | 922 |
NameTypeTopicsAvg. # of turnsTotal # of dialoguesTotal # of wordsDescriptionLinks
NPS Chat Corpus [Forsyth and Martell, 2007]ChatUnrestricted 70415100MPosts from age-specific online chat rooms.Info and Download
Twitter Corpus [Ritter et al., 2010]MicroblogUnrestricted21.3M 125MTweets and replies extracted from TwitterContact corpus authors for data.
Twitter Triple Corpus [Sordoni et al., 2015]MicroblogUnrestricted34,232 65KA-B-A triples extracted from TwitterInfo and Download
UseNet Corpus [Shaoul and Westbury, 2009]MicroblogUnrestricted 68747860 7BUseNet forum postingsInfo and Download
NUS SMS Corpus [Chen and Kan, 2013]SMS messagesUnrestricted 18 3K580,668*[¯]SMS messages collected between two users, with timing analysis.Info and Download
Reddit Domestic Abuse Corpus [Schrading et al., 2015]ForumAbuse help17.5321,13319M-103M \triangleReddit posts from either domestic abuse subreddits, or general chat.Info and Download
Reddit All Comments CorpusForumGeneral------1.7 Billion Reddit comments.Info and Download
Settlers of Catan [Afantenos et al., 2012]ChatGame terms 9521-Conversations between players in the game ‘Settlers of Catan’.Info

Contact corpus authors for download.
Cards Corpus [Djalali et al., 2012]ChatGame terms38.11,266282KConversations between players playing ‘Cards world’.Info and Download
Agreement in Wikipedia Talk Pages [Andreas et al., 2012]ForumUnrestricted2822110KLiveJournal and Wikipedia Discussions forum threads. Agreement type and level annotated.Info and Download
Agreement by Create Debaters [Rosenthal and McKeown, 2015]ForumUnrestricted210K1.4MCreate Debate forum conversations. Annotated with the type of agreement (e.g. paraphrase) or disagreement.Info and Download
Internet Argument Corpus [Walker et al., 2012b]ForumPolitics 35.45 11K 73MDebates about specific political or moral positions. A separate corpus (Argumentative Summary Corpus, [Walker et al., 2012b]) annotates a subset of this corpus with summaries of the arguments.Info and Download
Argument Summary Corpus
MPC Corpus [Shaikh et al., 2010]ChatSocial tasks5201458KConversations about general, political, and interview topics.Info and Download
Ubuntu Dialogue Corpus [Lowe et al., 2015a]ChatUbuntu Operating System7.71930K100MDialogues extracted from Ubuntu chat stream on IRC.Info and Download
Ubuntu Chat Corpus [Uthus and Aha, 2013]ChatUbuntu Operating System 3381.610665 2B*[¯]Chat stream scraped from IRC logs (no dialogues extracted).Info and Download
Movie Dialog Dataset [Dodge et al., 2015]Chat, QA & RecommendationMovies 3.3 3.1M\blacktriangledown 185MFor goal-driven dialogue systems. Includes movie metadata as knowledge triples.Info and Download
DailyDialog Dataset [Li et al., 2017]ChatDaily Life 7.9 13K 1.5MConversations extracted from English language educational texts. Labeled with emotions.Info and Download
923 |
924 | Table 5: Human-human written dialogue datasets. Starred (*) quantities are computed using word counts based on spaces (i.e. a word must be a sequence of characters preceded and followed by a space), but for certain corpora, such as IRC and SMS corpora, proper English words are sometimes concatenated together due to slang usage. Triangle (\triangle) indicates lower and upper bounds computed using average words per utterance estimated on a similar Reddit corpus [Schrading et al., 2015]. Square ([¯]) indicates estimates based only on the English part of the corpus. Note that 2.1M dialogues from the Movie Dialog dataset (\blacktriangledown) are in the form of simulated QA pairs. Dialogs indicated by () are contiguous blocks of recorded conversation in a multi-participant chat. In the case of UseNet, we note the total number of newsgroups and find the average turns as average number of posts collected per newsgroup. () indicates an estimate based on a Twitter dataset of similar size and refers to tokens as well as words. 925 |
926 |
927 |
928 | 929 |
930 |
931 |
932 |
933 |
934 | 935 |
936 | 937 | 938 | 939 | 940 | 941 | 942 | 943 | 944 | 945 | -------------------------------------------------------------------------------- /tfthing_files/backbone.min.js: -------------------------------------------------------------------------------- 1 | (function(){var t=this;var e=t.Backbone;var i=[];var r=i.push;var s=i.slice;var n=i.splice;var a;if(typeof exports!=="undefined"){a=exports}else{a=t.Backbone={}}a.VERSION="1.1.0";var h=t._;if(!h&&typeof require!=="undefined")h=require("underscore");a.$=t.jQuery||t.Zepto||t.ender||t.$;a.noConflict=function(){t.Backbone=e;return this};a.emulateHTTP=false;a.emulateJSON=false;var o=a.Events={on:function(t,e,i){if(!l(this,"on",t,[e,i])||!e)return this;this._events||(this._events={});var r=this._events[t]||(this._events[t]=[]);r.push({callback:e,context:i,ctx:i||this});return this},once:function(t,e,i){if(!l(this,"once",t,[e,i])||!e)return this;var r=this;var s=h.once(function(){r.off(t,s);e.apply(this,arguments)});s._callback=e;return this.on(t,s,i)},off:function(t,e,i){var r,s,n,a,o,u,c,f;if(!this._events||!l(this,"off",t,[e,i]))return this;if(!t&&!e&&!i){this._events={};return this}a=t?[t]:h.keys(this._events);for(o=0,u=a.length;o").attr(t);this.setElement(e,false)}else{this.setElement(h.result(this,"el"),false)}}});a.sync=function(t,e,i){var r=T[t];h.defaults(i||(i={}),{emulateHTTP:a.emulateHTTP,emulateJSON:a.emulateJSON});var s={type:r,dataType:"json"};if(!i.url){s.url=h.result(e,"url")||U()}if(i.data==null&&e&&(t==="create"||t==="update"||t==="patch")){s.contentType="application/json";s.data=JSON.stringify(i.attrs||e.toJSON(i))}if(i.emulateJSON){s.contentType="application/x-www-form-urlencoded";s.data=s.data?{model:s.data}:{}}if(i.emulateHTTP&&(r==="PUT"||r==="DELETE"||r==="PATCH")){s.type="POST";if(i.emulateJSON)s.data._method=r;var n=i.beforeSend;i.beforeSend=function(t){t.setRequestHeader("X-HTTP-Method-Override",r);if(n)return 
n.apply(this,arguments)}}if(s.type!=="GET"&&!i.emulateJSON){s.processData=false}if(s.type==="PATCH"&&E){s.xhr=function(){return new ActiveXObject("Microsoft.XMLHTTP")}}var o=i.xhr=a.ajax(h.extend(s,i));e.trigger("request",e,o,i);return o};var E=typeof window!=="undefined"&&!!window.ActiveXObject&&!(window.XMLHttpRequest&&(new XMLHttpRequest).dispatchEvent);var T={create:"POST",update:"PUT",patch:"PATCH","delete":"DELETE",read:"GET"};a.ajax=function(){return a.$.ajax.apply(a.$,arguments)};var k=a.Router=function(t){t||(t={});if(t.routes)this.routes=t.routes;this._bindRoutes();this.initialize.apply(this,arguments)};var S=/\((.*?)\)/g;var $=/(\(\?)?:\w+/g;var H=/\*\w+/g;var A=/[\-{}\[\]+?.,\\\^$|#\s]/g;h.extend(k.prototype,o,{initialize:function(){},route:function(t,e,i){if(!h.isRegExp(t))t=this._routeToRegExp(t);if(h.isFunction(e)){i=e;e=""}if(!i)i=this[e];var r=this;a.history.route(t,function(s){var n=r._extractParameters(t,s);i&&i.apply(r,n);r.trigger.apply(r,["route:"+e].concat(n));r.trigger("route",e,n);a.history.trigger("route",r,e,n)});return this},navigate:function(t,e){a.history.navigate(t,e);return this},_bindRoutes:function(){if(!this.routes)return;this.routes=h.result(this,"routes");var t,e=h.keys(this.routes);while((t=e.pop())!=null){this.route(t,this.routes[t])}},_routeToRegExp:function(t){t=t.replace(A,"\\$&").replace(S,"(?:$1)?").replace($,function(t,e){return e?t:"([^/]+)"}).replace(H,"(.*?)");return new RegExp("^"+t+"$")},_extractParameters:function(t,e){var i=t.exec(e).slice(1);return h.map(i,function(t){return t?decodeURIComponent(t):null})}});var I=a.History=function(){this.handlers=[];h.bindAll(this,"checkUrl");if(typeof window!=="undefined"){this.location=window.location;this.history=window.history}};var N=/^[#\/]|\s+$/g;var O=/^\/+|\/+$/g;var P=/msie [\w.]+/;var C=/\/$/;var j=/[?#].*$/;I.started=false;h.extend(I.prototype,o,{interval:50,getHash:function(t){var e=(t||this).location.href.match(/#(.*)$/);return 
e?e[1]:""},getFragment:function(t,e){if(t==null){if(this._hasPushState||!this._wantsHashChange||e){t=this.location.pathname;var i=this.root.replace(C,"");if(!t.indexOf(i))t=t.slice(i.length)}else{t=this.getHash()}}return t.replace(N,"")},start:function(t){if(I.started)throw new Error("Backbone.history has already been started");I.started=true;this.options=h.extend({root:"/"},this.options,t);this.root=this.options.root;this._wantsHashChange=this.options.hashChange!==false;this._wantsPushState=!!this.options.pushState;this._hasPushState=!!(this.options.pushState&&this.history&&this.history.pushState);var e=this.getFragment();var i=document.documentMode;var r=P.exec(navigator.userAgent.toLowerCase())&&(!i||i<=7);this.root=("/"+this.root+"/").replace(O,"/");if(r&&this._wantsHashChange){this.iframe=a.$('