├── .gitignore ├── AI_ETHICS.md ├── CODEOWNERS ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Dataset_Stats.csv ├── DialogStudio: Towards Richest and Most Diverse Unified Dataset Collection for Conversational AI.pdf ├── LICENSE.txt ├── README.md ├── SECURITY.md ├── code ├── openai_dialog_quality_evaluation.py ├── preprocess_data_DialSum.py ├── preprocess_data_KG.py ├── preprocess_data_OD.py ├── preprocess_data_TOD.py └── utils │ ├── constant.py │ ├── constant_tod.py │ └── domain_mapping.py ├── conversational-recommendation-dialogues ├── DuRecDial-2.0 │ ├── LICENSE.txt │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── OpenDialKG │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── README.md ├── Redial │ ├── converted_examples.json │ └── original_examples.json └── SalesBot │ ├── README.md │ ├── converted_examples.json │ ├── original_examples.json │ └── otgy.json ├── dialogue-summarization ├── AMI │ ├── converted_examples.json │ └── original_examples.json ├── CRD3 │ ├── LICENSE.txt │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── ConvoSumm │ ├── LICENSE │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── DialogSum │ ├── LICENSE │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── ECTSum │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── ICSI │ ├── converted_examples.json │ └── original_examples.json ├── MediaSum │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── QMSum │ ├── LICENSE │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── README.md ├── SAMSum │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── SummScreen_ForeverDreaming │ ├── README.txt │ ├── converted_examples.json │ └── original_examples.json ├── SummScreen_TVMegaSite │ ├── README.txt │ ├── converted_examples.json │ └── original_examples.json └── TweetSumm │ ├── 
LICENSE │ ├── LICENSE.txt │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── figures ├── DialogStudio_Quality_Scores.png ├── DialogStudio_Stats.png ├── logo-color.png └── logo.png ├── knowledge-grounded-dialogues ├── CoQA │ ├── LICENSE.txt │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── CoSQL │ ├── LICENSE.md │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── CompWebQ │ ├── LICENSE.txt │ ├── README.md │ ├── README.txt │ ├── converted_examples.json │ └── original_examples.json ├── DART │ ├── LICENSE.txt │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── FeTaQA │ ├── LICENSE.txt │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── GrailQA │ ├── LICENSE.txt │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── HybridQA │ ├── LICENSE.txt │ ├── converted_examples.json │ └── original_examples.json ├── MMQA │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── MTOP │ ├── LICENSE.txt │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── MultiModalQA │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── README.md ├── SParC │ ├── LICENSE.md │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── SQA │ ├── LICENSE.md │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── Spider │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── ToTTo │ ├── LICENSE.md │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── WebQSP │ ├── README.txt │ ├── converted_examples.json │ └── original_examples.json ├── WikiSQL │ ├── LICENSE.txt │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── WikiTQ │ ├── LICENSE.txt │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── wizard_of_internet │ ├── LICENSE.txt │ ├── 
converted_examples.json │ └── original_examples.json └── wizard_of_wikipedia │ ├── LICENSE.txt │ ├── converted_examples.json │ └── original_examples.json ├── natural-language-understanding ├── ATIS-NER │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── ATIS │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── BANKING77-OOS │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── BANKING77 │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── CLINC-Single-Domain-OOS-banking │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── CLINC-Single-Domain-OOS-credit_cards │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── CLINC150 │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── DSTC8-SGD │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── HWU64 │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── MIT-Movie │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── MIT-Restaurant │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── README.md ├── RESTAURANTS8K │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── SNIPS-NER │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── SNIPS │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── TOP-NER │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json └── TOP │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── open-domain-dialogues ├── AntiScam │ ├── converted_examples.json │ └── original_examples.json ├── ConvAI2 │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── Empathetic │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── HH-RLHF │ ├── README.md │ ├── converted_examples.json 
│ └── original_examples.json ├── PLACES3.5 │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── Prosocial │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── README.md ├── SODA │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── ShareGPT │ ├── README.md │ ├── converted_example.json │ └── original_example.json └── chitchat-dataset │ ├── README.md │ ├── converted_examples.json │ └── original_examples.json ├── stats ├── count_domain.json ├── count_length.json └── domain_to_dataset.json └── task-oriented-dialogues ├── ABCD ├── LICENSE.txt ├── README.md ├── converted_examples.json ├── original_examples.json └── otgy.json ├── AirDialogue ├── LICENSE.txt ├── converted_examples.json ├── original_examples.json └── readme.txt ├── BiTOD ├── LICENSE.txt ├── README.md ├── converted_examples.json ├── original_examples.json └── otgy.json ├── CaSiNo ├── LICENSE ├── LICENSE.txt ├── README.md ├── converted_examples.json └── original_examples.json ├── CraigslistBargains ├── LICENSE.md ├── README.md ├── converted_examples.json ├── original_examples.json └── schema.json ├── DSTC2-Clean ├── LICENSE.txt ├── README.md ├── converted_examples.json ├── ontology_en.json └── original_examples.json ├── Disambiguation ├── converted_examples.json └── original_examples.json ├── DuRecDial-2.0 ├── LICENSE.txt ├── README.md ├── converted_examples.json └── original_examples.json ├── FRAMES ├── LICENSE.txt ├── README.md ├── converted_examples.json ├── original_examples.json └── otgy.json ├── GECOR ├── LICENSE ├── converted_examples.json └── original_examples.json ├── HDSA-Dialog ├── .gitignore ├── LICENSE ├── README.md ├── converted_examples.json └── original_examples.json ├── KETOD ├── converted_examples.json └── original_examples.json ├── KVRET ├── README.md ├── converted_examples.json └── original_examples.json ├── MS-DC ├── LICENSE.pdf ├── README.md ├── converted_examples.json └── original_examples.json ├── MULTIWOZ2_2 
├── README.md ├── converted_examples.json ├── original_examples.json └── schema.json ├── MetaLWOZ ├── LICENSE.pdf ├── README.md ├── converted_examples.json ├── original_examples.json └── otgy.json ├── MuDoCo ├── LICENSE.md ├── README.md ├── converted_examples.json └── original_examples.json ├── MulDoGO ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE.txt ├── NOTICE ├── README.md ├── converted_examples.json └── original_examples.json ├── MultiWOZ_2.1 ├── converted_examples.json └── original_examples.json ├── OpenDialKG ├── README.md ├── converted_examples.json └── original_examples.json ├── README.md ├── SGD ├── LICENSE.txt ├── README.md ├── converted_examples.json └── original_examples.json ├── STAR ├── LICENSE.txt ├── README.md ├── converted_examples.json ├── original_examples.json └── otgy.json ├── SalesBot ├── README.md ├── converted_examples.json ├── original_examples.json └── otgy.json ├── SimJointGEN ├── README.md ├── converted_examples.json ├── db.json └── original_examples.json ├── SimJointMovie ├── README.md ├── converted_examples.json ├── original_examples.json └── otgy.json ├── SimJointRestaurant ├── README.md ├── converted_examples.json ├── original_examples.json └── otgy.json ├── Taskmaster1 ├── LICENSE.md ├── README.md ├── converted_examples.json ├── original_examples.json └── otgy.json ├── Taskmaster2 ├── LICENSE.md ├── README.md ├── converted_examples.json ├── original_examples.json └── otgy.json ├── Taskmaster3 ├── LICENSE.md ├── README.md ├── converted_examples.json ├── original_examples.json └── otgy.json └── WOZ2_0 ├── LICENSE.txt ├── README.md ├── converted_examples.json ├── original_examples.json └── otgy.json /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | Icon? 
3 | -------------------------------------------------------------------------------- /AI_ETHICS.md: -------------------------------------------------------------------------------- 1 | ## Ethics disclaimer for Salesforce AI models, data, code 2 | 3 | This release is for research purposes only in support of an academic 4 | paper. Our models, datasets, and code are not specifically designed or 5 | evaluated for all downstream purposes. We strongly recommend users 6 | evaluate and address potential concerns related to accuracy, safety, and 7 | fairness before deploying this model. We encourage users to consider the 8 | common limitations of AI, comply with applicable laws, and leverage best 9 | practices when selecting use cases, particularly for high-risk scenarios 10 | where errors or misuse could significantly impact people’s lives, rights, 11 | or safety. For further guidance on use cases, refer to our standard 12 | [AUP](https://www.salesforce.com/content/dam/web/en_us/www/documents/legal/Agreements/policies/ExternalFacing_Services_Policy.pdf) 13 | and [AI AUP](https://www.salesforce.com/content/dam/web/en_us/www/documents/legal/Agreements/policies/ai-acceptable-use-policy.pdf). 14 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Comment line immediately above ownership line is reserved for related other information. Please be careful while editing. 
2 | #ECCN:Open Source 3 | -------------------------------------------------------------------------------- /DialogStudio: Towards Richest and Most Diverse Unified Dataset Collection for Conversational AI.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/DialogStudio/3e54576b4db71c56468ae4591a94ce9e419ed01c/DialogStudio: Towards Richest and Most Diverse Unified Dataset Collection for Conversational AI.pdf -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | ## Security 2 | 3 | Please report any security issue to [security@salesforce.com](mailto:security@salesforce.com) 4 | as soon as it is discovered. This library limits its runtime dependencies in 5 | order to reduce the total cost of ownership as much as can be, but all consumers 6 | should remain vigilant and have their security stakeholders review all third-party 7 | products (3PP) like this one and their dependencies. -------------------------------------------------------------------------------- /code/utils/constant.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2023, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: Apache License 2.0 5 | For full license text, see the LICENSE file in the repo root or https://www.apache.org/licenses/LICENSE-2.0 6 | """ 7 | 8 | #!/usr/bin/env python3 9 | # 10 | 11 | # key used for direct usage 12 | SPEAKER1 = "user" 13 | SPEAKER2 = "system" 14 | ORI_DIAL_ID = "original dialog id" 15 | DIAL_IDX = "dialog index" 16 | ORI_DIAL_INFO = "original dialog info" 17 | TURN_ID = "turn id" 18 | USR_UTT = f"{SPEAKER1} utterance" 19 | SYS_UTT = f"{SPEAKER2} response" 20 | DIAL_HIST = "dialog history" 21 | ORI_USR_ANN = f"original {SPEAKER1} side information" 22 | ORI_SYS_ANN = f"original {SPEAKER2} side information" 23 | LOG = "log" 24 | 25 | # # # output for different task 26 | # domain prediction 27 | DOM = "domain" 28 | # intent prediction, including dialog act prediction if intent missing 29 | INTENT = "intent" 30 | INTENT_SPLIT = " , " 31 | # dst 32 | DST = "dst" 33 | DST_ACC = "dst accumulated" 34 | DST_SPLIT = " , " 35 | 36 | # # # used for external knowledge 37 | EK = "external knowledge" 38 | EK_DST = "dst knowledge" 39 | EK_INTENT = "intent knowledge" 40 | # non-flat external knowledge dictionary 41 | EK_ORI = "external knowledge non-flat" 42 | TOD_EK = "metadata" 43 | TOD_LENGTH = 10 44 | # DOM_EK = "domains" 45 | INTENT_EK = "intents" 46 | DST_EK = "slots and values" 47 | DST_LENGTH = 10 48 | 49 | # # # prompt for each dialog 50 | PROMPT = "prompt" 51 | PROMPT_DST = "prompt for dst task" 52 | PROMPT_INTENT = "prompt for intent prediction" 53 | 54 | MULTIWOZ_DOMAINS = ["taxi", "police", "hospital", "hotel","attraction","train","restaurant"] 55 | 56 | -------------------------------------------------------------------------------- /code/utils/constant_tod.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2023, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: Apache License 2.0 5 | For full license text, see the LICENSE file in the repo root or https://www.apache.org/licenses/LICENSE-2.0 6 | """ 7 | 8 | 9 | #!/usr/bin/env python3 10 | # 11 | 12 | # key used for direct usage 13 | SPEAKER1 = "user" 14 | SPEAKER2 = "system" 15 | ORI_DIAL_ID = "original dialog id" 16 | DIAL_IDX = "dialog index" 17 | ORI_DIAL_INFO = "original dialog info" 18 | TURN_ID = "turn id" 19 | USR_UTT = f"{SPEAKER1} utterance" 20 | SYS_UTT = f"{SPEAKER2} response" 21 | DIAL_HIST = "dialog history" 22 | ORI_USR_ANN = f"original {SPEAKER1} side information" 23 | ORI_SYS_ANN = f"original {SPEAKER2} side information" 24 | LOG = "log" 25 | 26 | # # # output for different task 27 | # domain prediction 28 | DOM = "domain" 29 | # intent prediction, including dialog act prediction if intent missing 30 | INTENT = "intent" 31 | INTENT_SPLIT = " , " 32 | # dst 33 | DST = "dst" 34 | DST_ACC = "dst accumulated" 35 | DST_SPLIT = " , " 36 | 37 | # # # used for external knowledge 38 | EK = "external knowledge" 39 | EK_DST = "dst knowledge" 40 | EK_INTENT = "intent knowledge" 41 | # non-flat external knowledge dictionary 42 | EK_ORI = "external knowledge non-flat" 43 | TOD_EK = "metadata" 44 | TOD_LENGTH = 10 45 | # DOM_EK = "domains" 46 | INTENT_EK = "intents" 47 | DST_EK = "slots and values" 48 | DST_LENGTH = 10 49 | 50 | # # # prompt for each dialog 51 | PROMPT = "prompt" 52 | PROMPT_DST = "prompt for dst task" 53 | PROMPT_INTENT = "prompt for intent prediction" 54 | 55 | MULTIWOZ_DOMAINS = ["taxi", "police", "hospital", "hotel","attraction","train","restaurant"] 56 | 57 | -------------------------------------------------------------------------------- /conversational-recommendation-dialogues/OpenDialKG/README.md: -------------------------------------------------------------------------------- 1 | # OpenDialKG 2 | 3 | OpenDialKG is a dataset of conversations between two crowdsourcing agents engaging in a dialog about a given topic. 
Each dialog turn is paired with its corresponding “KG paths” that weave together the KG entities and relations that are mentioned in the dialog. More details can be found in the following paper: 4 | 5 | Seungwhan Moon, Pararth Shah, Anuj Kumar, Rajen Subba. ["OpenDialKG: Explainable Conversational Reasoning with Attention-based Walks over Knowledge Graphs"](https://www.aclweb.org/anthology/P19-1081.pdf), ACL (2019). 6 | 7 | ## Data Format 8 | 9 | The dataset release includes two parts: (1) the Dialog-KG Path Parallel Corpus where each dialog turn is paired with KG paths that connect its previous turn (annotated by chat participants themselves), and (2) the base knowledge graph used in both the dialog collection and in the experiments, which is a subset of the [Freebase Easy data](http://freebase-easy.cs.uni-freiburg.de/dump/). The data are made available in the following files: 10 | ``` 11 | [Dialog-KG Parallel Corpus] 12 | - ./data/opendialkg.csv 13 | 14 | [KG] 15 | - ./data/opendialkg_entities.txt 16 | - ./data/opendialkg_relations.txt 17 | - ./data/opendialkg_triples.txt 18 | ``` 19 | 20 | The Dialog-KG Parallel Corpus (`./data/opendialkg.csv`) is formatted as a csv file, where columns are: `Messages, User Rating, Assistant Rating`. Each row refers to a dialog session, which is a JSON-formatted list of each action, formatted as follows: 21 | ``` 22 | { 23 | "type": // indicating whether it's a message ("chat") or a KG walk selection action ("action") 24 | "sender": // indicating whether it is sent by "user" or "assistant" 25 | "message" (Optional): // raw utterance (for "type": "chat"), 26 | "metadata" (Optional): { 27 | "path": [ 28 | // path score, 29 | // list of KG triples (subject, relation, object) that make up the path, 30 | // rendering of the path 31 | ] 32 | } // end of KG path JSON (if available) 33 | }. ... 
// end of each action JSON 34 | ``` 35 | 36 | Note that the path annotation refers to the connection of two adjacent turns on the conceptual level. Given `utterance_1`, `utterance_2`, and their annotated entity path `A -> B -> C` that connect `utterance_1` and `utterance_2`, Entity `A` is assumed to be mentioned in `utterance_1`, and `C` to be mentioned in `utterance_2`. Entity `B` doesn't necessarily have to be mentioned since it is an intermediate step in the path. Note also that it is a paraphrased dataset, thus each mention is not enforced to have an exact surface match with its corresponding entity in the knowledge graph. After pre-processing and quality reviews we release the 13,802 dialog sessions (91,209 turns) across two tasks (Chit-chat and Recommendations) and four domains (movie, book, sports, and music). 37 | 38 | All bi-directional KG triples used in the dataset collection and in the experiments (100,813 entities, 1358 relations, 1,190,658 triples) are included in `./data/opendialkg_triples.txt`, formatted as line-separated triples with tab-separated entities and relations: 39 | ``` 40 | subject \t relation \t object \n 41 | ... 42 | ``` 43 | 44 | All entities and relations are also listed in `./data/opendialkg_entities.txt` and `./data/opendialkg_relations.txt`, respectively. The prefix `~` in `opendialkg_relations.txt` refers to reverse relations. 
45 | 46 | ## Reference 47 | 48 | To cite this work please use: 49 | ``` 50 | @InProceedings{Moon2019opendialkg, 51 | author = {Seungwhan Moon and Pararth Shah and Anuj Kumar and Rajen Subba}, 52 | title = {OpenDialKG: Explainable Conversational Reasoning with Attention-based Walks over Knowledge Graphs}, 53 | booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics}, 54 | month = {July}, 55 | year = {2019}, 56 | } 57 | ``` 58 | 59 | ## License 60 | OpenDialKG is released under [CC-BY-NC-4.0](https://creativecommons.org/licenses/by-nc/4.0/legalcode), see [LICENSE](LICENSE) for details. -------------------------------------------------------------------------------- /conversational-recommendation-dialogues/README.md: -------------------------------------------------------------------------------- 1 | ### Conversational Recommendation Dialogues 2 | 3 | Conversational Recommendation Dialogues follow same format as task oriented dialogues. Below is the copy of ReadME from task oriented dialogues: 4 | 5 | Below is a general format for task oriented dialogues: 6 | 7 | ```js 8 | { 9 | "dataset_name--train/val/test--dialog_id": { 10 | "original dialog id": str, 11 | "dialog index": int, 12 | "original dialog info": dict, 13 | "log": [ 14 | { 15 | "turn id": int, 16 | "user utterance": str, 17 | "system response": str, 18 | "dialog history": str, 19 | "original user side information": dict, 20 | "original system side information": dict, 21 | "dst": str, 22 | "dst accumulated": str 23 | }, 24 | ... 25 | ], 26 | "external knowledge non-flat": { 27 | "metadata": dict, 28 | "slots and values": dict 29 | "intents": dict, 30 | ... 31 | }, 32 | "external knowledge": str, 33 | "intent knowledge": str, 34 | "prompt": [ 35 | "This is a bot helping users to get navigation. Given the dialog context and external database, please generate a relevant system response for the user.", 36 | ... 37 | ] 38 | }, 39 | ... 
40 | } 41 | ``` 42 | 43 | In general, datasets have the "external knowledge non-flat" and "external knowledge" in the whole dialogue level. There are also some datasets where every turn in "log" has own "external knowledge non-flat" and "external knowledge". 44 | 45 | Here are datasets with turn-level "external knowledge": 46 | ``` 47 | 'SimJointGEN', 'BiTOD', 'OpenDialKG', 'SimJointMovie', 'MS-DC', 'STAR', 'SimJointRestaurant', 'Taskmaster1', 'Taskmaster2', 'Taskmaster3' 48 | ``` 49 | And below is a general format for such datasets: 50 | ```js 51 | { 52 | "dataset_name--train/val/test--dialog_id": { 53 | "original dialog id": str, 54 | "dialog index": int, 55 | "original dialog info": dict, 56 | "log": [ 57 | { 58 | "turn id": int, 59 | "user utterance": str, 60 | "system response": str, 61 | "dialog history": str, 62 | "original user side information": dict, 63 | "original system side information": dict, 64 | "dst": str, 65 | "dst accumulated": str 66 | "external knowledge non-flat": list, 67 | "external knowledge": str, 68 | }, 69 | ... 70 | ] 71 | "prompt": [ 72 | "This is a bot helping users to get navigation. Given the dialog context and external database, please generate a relevant system response for the user.", 73 | ... 74 | ] 75 | }, 76 | ... 77 | } 78 | ``` 79 | Please refer to each dataset folder for more details. 80 | -------------------------------------------------------------------------------- /conversational-recommendation-dialogues/SalesBot/README.md: -------------------------------------------------------------------------------- 1 | # SalesBot: Transitioning from Chit-Chat to Task-Oriented Dialogues 2 | 3 | ## Framework 4 |

5 | 6 |

7 | This paper focuses on investigating the conversations starting from open-domain social chatting and then gradually transitioning to task-oriented purposes, and releases a large-scale dataset with detailed annotations for encouraging this research direction. To achieve this goal, this paper proposes a framework to automatically generate many dialogues without human involvement, in which any powerful open-domain dialogue generation model can be easily leveraged. 8 | 9 | ## Dependency 10 | Check the packages needed or simply run the command 11 | ```console 12 | conda env create -f environment.yml 13 | ``` 14 | 15 | ## Data 16 | * selfchat: 17 | ```console 18 | mkdir selfchat 19 | parlai self_chat --model-file zoo:blender/blender_1Bdistill/model --inference nucleus --num-self-chats 20 --task blended_skill_talk --include-personas True --include-initial-utterances True --outfile selfchat/merge_sgd_20.json 20 | parlai self_chat --model-file zoo:blender/blender_1Bdistill/model --inference nucleus --num-self-chats 20 --task blended_skill_talk --include-personas True --include-initial-utterances True --outfile selfchat/simulators_20.json 21 | ``` 22 | * intent detection model: 23 | ```console 24 | python3 qa_inference.py --data_file selfchat/merge_sgd_20.jsonl --output_file merge_sgd_intent.json --device 0 25 | python3 qa_inference.py --data_file selfchat/simulators_20.jsonl --output_file simulators_intent.json --device 0 26 | ``` 27 | * task-oriented simulators: 28 | ```console 29 | python3 combine_simulators.py simulators_intent.json 30 | ``` 31 | * merge SGD: 32 | ```console 33 | # SGD_delex is the version preprocessed by "ACCENTOR: Adding Chit-Chat to Enhance Task-Oriented Dialogues" 34 | unzip SGD_delex 35 | mkdir sgd_intent_dialog 36 | python3 collect_sgd_intent.py SGD_delex 37 | python3 combine_sgd.py merge_sgd_intent.json 38 | 39 | ``` 40 | * transition: 41 | ```console 42 | python3 transition.py combine_sgd.json 43 | python3 transition.py combine_simulators.json 
44 | ``` 45 | 46 | ## Citation 47 | 48 | Please cite our paper if you use SalesBot in your work: 49 | 50 | ```bibtex 51 | @inproceedings{chiu2022salesbot, 52 | title={{SalesBot}: Transitioning from Chit-Chat to Task-Oriented Dialogues}, 53 | author={Chiu, Ssu and Li, Maolin and Lin, Yen-Ting and Chen, Yun-Nung}, 54 | booktitle={Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (ACL)}, 55 | year={2022} 56 | } 57 | ``` 58 | -------------------------------------------------------------------------------- /conversational-recommendation-dialogues/SalesBot/otgy.json: -------------------------------------------------------------------------------- 1 | [ 2 | "GetTimesForMovie", 3 | "LookupSong", 4 | "FindMovies", 5 | "LookupMusic", 6 | "PlaySong", 7 | "FindAttractions" 8 | ] -------------------------------------------------------------------------------- /dialogue-summarization/ConvoSumm/README.md: -------------------------------------------------------------------------------- 1 | # ConvoSumm 2 | Data, code, and model checkpoints for the ACL 2021 paper [ConvoSumm: Conversation Summarization Benchmark and Improved Abstractive Summarization with Argument Mining](https://arxiv.org/pdf/2106.00829.pdf)! 3 |
4 | 5 | ## Data 6 | The data can be accessed from this [Google Drive link](https://drive.google.com/drive/folders/1HfyCMa1fQ5DkzME9RQZkytZQfyDjE1EK?usp=sharing).
7 | 8 | The `data-non-processed` contains the original, non-processed data and is 27MB, while `data-processed` contains the data for vanilla, **-arg-filtered**, and **-arg-graph** experiments, as well as model outputs, and is 611 MB.
9 | 10 | Using the [gdrive cli](https://github.com/prasmussen/gdrive), download the folders with the following command
11 | ``` 12 | gdrive download --recursive 1HfyCMa1fQ5DkzME9RQZkytZQfyDjE1EK 13 | ``` 14 | 15 | The data can also be downloaded from this [S3 bucket](https://s3.console.aws.amazon.com/s3/buckets/convosumm).
16 | ``` 17 | aws s3 cp --recursive s3://convosumm/data/ ./data 18 | ``` 19 | 20 | 21 | ## Code and Model Checkpoints 22 | Please see this [README](https://github.com/Yale-LILY/ConvoSumm/blob/master/code/README.md) for code details.
23 | 24 | Model checkpoints can be downloaded from the S3 bucket (~80GB):
25 | ``` 26 | aws s3 cp --recursive s3://convosumm/checkpoints/ ./checkpoints 27 | ``` 28 | 29 | -------------------------------------------------------------------------------- /dialogue-summarization/DialogSum/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Yulong Chen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /dialogue-summarization/ECTSum/README.md: -------------------------------------------------------------------------------- 1 | # ECTSum: A New Benchmark Dataset For Bullet Point Summarization of Long Earnings Call Transcripts 2 | 3 | Long Paper Accepted at the EMNLP 2022 Main Conference!
4 |
  • Paper: https://aclanthology.org/2022.emnlp-main.748/
  • 5 |
  • Poster: https://rajdeep345.github.io/files/pdf/research/ECTSum_EMNLP2022_Poster.pdf
  • 6 |
  • Pre-recorded Video: https://drive.google.com/file/d/1DW2i2ApgiE6V7ViiayX5zdJSRXdAEbsy/view
  • 7 | 8 | ## Dataset 9 | The ECTSum dataset can be found under the `data` folder. 10 | 11 | ## Codes 12 | Codes and instructions for our proposed model ECT-BPS can be found under `codes/ECT-BPS`
    13 | Codes and instructions for our baseline models can be found under `codes/baselines` 14 | 15 | 16 | 17 | ## Data Preparation for ECT-BPS 18 | ### Preparing the data for training the Extractive Module 19 | 20 | #### Imports 21 | `pip install sentence-transformers`
    22 | `pip install num2words`
    23 | `pip install word2number`
    24 | 25 | #### Prepare the data 26 | `python prepare_data_ectbps_ext.py` 27 | 28 | #### Data Location 29 | The data is saved at `codes/ECT-BPS/ectbps_ext/data/`.
    30 | Processed data is already uploaded at this location. 31 | 32 | 33 | ### Preparing the data for training the Paraphrasing Module 34 | 35 | #### Imports 36 | `pip install sentence-transformers`
    37 | `pip install num2words`
    38 | `pip install word2number`
    39 | 40 | #### Prepare the data 41 | `python prepare_data_ectbps_para.py` 42 | 43 | #### Data Location 44 | The data is saved at `codes/ECT-BPS/ectbps_para/data/para/`.
    45 | Processed data is already uploaded at this location. 46 | 47 | #### Prepare the data with numericals masked 48 | `python prepare_data_ectbps_para_mask.py` 49 | 50 | #### Data Location 51 | The data is saved at `codes/ECT-BPS/ectbps_para/data/para_mask/`.
    52 | Processed data is already uploaded at this location. 53 | 54 | 55 | 56 | ## Updates 57 |
  • 1st November 2022 - ECTSum Dataset released
  • 58 |
  • 30th November 2022 - Codes and Instructions released for training the Extractive Module of ECT-BPS
  • 59 |
  • 3rd March 2023 - Added the Prediction Pipeline for the Extractive module.
  • 60 |
  • 5th March 2023 - Codes released to prepare the data for training the Paraphrasing Module
  • 61 |
  • 7th March 2023 - Codes released to train the Paraphrasing Module of ECT-BPS
  • 62 |
  • 8th March 2023 - Google Colab Notebook released for training and testing the Paraphrasing Module
  • 63 | -------------------------------------------------------------------------------- /dialogue-summarization/MediaSum/README.md: -------------------------------------------------------------------------------- 1 | # MediaSum 2 | This large-scale media interview dataset contains 463.6K transcripts with abstractive summaries, collected from interview transcripts and overview / topic descriptions from NPR and CNN. 3 | 4 | Please restrict your usage of this dataset to research purpose only. And please cite our paper: 5 | 6 | **MediaSum: A Large-scale Media Interview Dataset for Dialogue Summarization** 7 | 8 | _Chenguang Zhu*, Yang Liu*, Jie Mei and Michael Zeng (*: Equal contribution)_ 9 | 10 | _North American Chapter of the Association for Computational Linguistics (**NAACL**), Mexico City, Mexico, 2021._ 11 | 12 | • Sample data: 13 | ``` 14 | { 15 | "id": "NPR-11", 16 | "program": "Day to Day", 17 | "date": "2008-06-10", 18 | "url": "https://www.npr.org/templates/story/story.php?storyId=91356794", 19 | "title": "Researchers Find Discriminating Plants", 20 | "summary": "The \"sea rocket\" shows preferential treatment to plants that are its kin. Evolutionary plant ecologist Susan Dudley of McMaster University in Ontario discusses her discovery.", 21 | "utt": [ 22 | "This is Day to Day. I'm Madeleine Brand.", 23 | "And I'm Alex Cohen.", 24 | "Coming up, the question of who wrote a famous religious poem turns into a very unchristian battle.", 25 | "First, remember the 1970s? People talked to their houseplants, played them classical music. They were convinced plants were sensuous beings and there was that 1979 movie, \"The Secret Life of Plants.\"", 26 | "Only a few daring individuals, from the scientific establishment, have come forward with offers to replicate his experiments, or test his results. The great majority are content simply to condemn his efforts without taking the trouble to investigate their validity.", 27 | ... 28 | "OK. 
Thank you.", 29 | "That's Susan Dudley. She's an associate professor of biology at McMaster University in Hamilton, Ontario. She discovered that there is a social life of plants." 30 | ], 31 | "speaker": [ 32 | "MADELEINE BRAND, host", 33 | "ALEX COHEN, host", 34 | "ALEX COHEN, host", 35 | "MADELEINE BRAND, host", 36 | "Unidentified Male", 37 | ..." 38 | Professor SUSAN DUDLEY (Biology, McMaster University)", 39 | "MADELEINE BRAND, host" 40 | ] 41 | } 42 | ``` 43 | 44 | • Data split: 45 |

    46 | data_split 47 |

    48 | 49 | 50 | • Comparison with previous dialogue summarization datasets: 51 |

    52 | data_split 53 |

    54 | 55 | • Data distribution: 56 |

    57 | data_split 58 |

    59 | 60 | ## Ethics: 61 | We have used only the publicly available transcripts data from the media sources and adhere to their only-for-research-purpose guideline. 62 | 63 | As media and guests may have biased views, the transcripts and summaries will likely contain them. The content of the transcripts and summaries only reflect the views of the media and guests, and should be viewed with discretion. 64 | 65 | 66 | ## Citation 67 | If you are using MediaSum in your work, please cite using the following Bibtex entry: 68 | 69 | ``` 70 | @article{zhu2021mediasum, 71 | title={MediaSum: A Large-scale Media Interview Dataset for Dialogue Summarization}, 72 | author={Zhu, Chenguang and Liu, Yang and Mei, Jie and Zeng, Michael}, 73 | journal={arXiv preprint arXiv:2103.06410}, 74 | year={2021} 75 | } 76 | ``` 77 | 78 | -------------------------------------------------------------------------------- /dialogue-summarization/QMSum/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Yale-LILY 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /dialogue-summarization/README.md: -------------------------------------------------------------------------------- 1 | ### Dialogue Summarizations 2 | 3 | General formats: 4 | 5 | ```js 6 | { 7 | "dataset_name--train/val/test--dialog_id": { 8 | "original dialog id": str, 9 | "dialog index": int, 10 | "original dialog info": { 11 | "summary": str, 12 | }, 13 | "log": [ 14 | { 15 | "turn id": int, 16 | "user utterance": str, 17 | "system response": str, 18 | "dialog history": str, 19 | "original user side information": dict, 20 | "original system side information": dict, 21 | }, 22 | ... 23 | ], 24 | "prompt": [ 25 | "This is a conversation between two speakers. Given the dialogue context, please generate a summarization about the dialogue.", 26 | ... 27 | ] 28 | }, 29 | ``` 30 | 31 | Notice that we cannot form a turn with utterances from only two sides for those datasets consisting of multi-party dialogs (e.g. AMI, ICSI). Therefore, we dump the dialog context and summary in the "original dialog info" and leave the "log" blank. 32 | -------------------------------------------------------------------------------- /dialogue-summarization/SAMSum/README.md: -------------------------------------------------------------------------------- 1 | ## Dataset Summary 2 | The SAMSum dataset contains about 16k messenger-like conversations with summaries. Conversations were created and written down by linguists fluent in English. Linguists were asked to create conversations similar to those they write on a daily basis, reflecting the proportion of topics of their real-life messenger convesations. 
The style and register are diversified - conversations could be informal, semi-formal or formal, they may contain slang words, emoticons and typos. Then, the conversations were annotated with summaries. It was assumed that summaries should be a concise brief of what people talked about in the conversation in third person. The SAMSum dataset was prepared by Samsung R&D Institute Poland and is distributed for research purposes. 3 | 4 | 5 | ## Data Instances 6 | 7 | The created dataset is made of 16369 conversations distributed uniformly into 4 groups based on the number of utterances in conversations: 3-6, 7-12, 13-18 and 19-30. Each utterance contains the name of the speaker. Most conversations consist of dialogues between two interlocutors (about 75% of all conversations), the rest is between three or more people. 8 | 9 | The first instance in the training set: {'id': '13818513', 'summary': 'Amanda baked cookies and will bring Jerry some tomorrow.', 'dialogue': "Amanda: I baked cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)"} 10 | 11 | ## Data Fields 12 | 13 | - dialogue: text of dialogue. 14 | - summary: human written summary of the dialogue. 15 | - id: unique id of an example. 16 | 17 | ## Data Splits 18 | 19 | - train: 14732 20 | - val: 818 21 | - test: 819 22 | 23 | ## License 24 | non-commercial licence: CC BY-NC-ND 4.0 -------------------------------------------------------------------------------- /dialogue-summarization/SAMSum/original_examples.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": [ 3 | "13818513", 4 | "13728867", 5 | "13681000", 6 | "13730747", 7 | "13728094" 8 | ], 9 | "dialogue": [ 10 | "Amanda: I baked cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)", 11 | "Olivia: Who are you voting for in this election? 
\r\nOliver: Liberals as always.\r\nOlivia: Me too!!\r\nOliver: Great", 12 | "Tim: Hi, what's up?\r\nKim: Bad mood tbh, I was going to do lots of stuff but ended up procrastinating\r\nTim: What did you plan on doing?\r\nKim: Oh you know, uni stuff and unfucking my room\r\nKim: Maybe tomorrow I'll move my ass and do everything\r\nKim: We were going to defrost a fridge so instead of shopping I'll eat some defrosted veggies\r\nTim: For doing stuff I recommend Pomodoro technique where u use breaks for doing chores\r\nTim: It really helps\r\nKim: thanks, maybe I'll do that\r\nTim: I also like using post-its in kaban style", 13 | "Edward: Rachel, I think I'm in ove with Bella..\r\nrachel: Dont say anything else..\r\nEdward: What do you mean??\r\nrachel: Open your fu**ing door.. I'm outside", 14 | "Sam: hey overheard rick say something\r\nSam: i don't know what to do :-/\r\nNaomi: what did he say??\r\nSam: he was talking on the phone with someone\r\nSam: i don't know who\r\nSam: and he was telling them that he wasn't very happy here\r\nNaomi: damn!!!\r\nSam: he was saying he doesn't like being my roommate\r\nNaomi: wow, how do you feel about it?\r\nSam: i thought i was a good rommate\r\nSam: and that we have a nice place\r\nNaomi: that's true man!!!\r\nNaomi: i used to love living with you before i moved in with me boyfriend\r\nNaomi: i don't know why he's saying that\r\nSam: what should i do???\r\nNaomi: honestly if it's bothering you that much you should talk to him\r\nNaomi: see what's going on\r\nSam: i don't want to get in any kind of confrontation though\r\nSam: maybe i'll just let it go\r\nSam: and see how it goes in the future\r\nNaomi: it's your choice sam\r\nNaomi: if i were you i would just talk to him and clear the air" 15 | ], 16 | "summary": [ 17 | "Amanda baked cookies and will bring Jerry some tomorrow.", 18 | "Olivia and Olivier are voting for liberals in this election. 
", 19 | "Kim may try the pomodoro technique recommended by Tim to get more stuff done.", 20 | "Edward thinks he is in love with Bella. Rachel wants Edward to open his door. Rachel is outside. ", 21 | "Sam is confused, because he overheard Rick complaining about him as a roommate. Naomi thinks Sam should talk to Rick. Sam is not sure what to do." 22 | ] 23 | } -------------------------------------------------------------------------------- /dialogue-summarization/SummScreen_ForeverDreaming/README.txt: -------------------------------------------------------------------------------- 1 | Each line in the files is a dictionary object with three keys: "Recap" (i.e., short summary), "Transcript", and "filename", where "filename" contains information about show title, season and episode number (e.g., "Alias_01x02.json" suggests the 2nd episode in the 1st season for the show "Alias"). "*_anonymize_*.json" contains the anonymized instances. The recaps and transcripts are already tokenized using spaCy and segmented into subword units (using https://github.com/rsennrich/subword-nmt). 2 | 3 | More details are in our paper: 4 | SummScreen: A Dataset for Abstractive Screenplay Summarization 5 | Mingda Chen, Zewei Chu, Sam Wiseman, Kevin Gimpel 6 | https://arxiv.org/abs/2104.07091 7 | 8 | Mingda Chen 9 | 04/20/2021 10 | -------------------------------------------------------------------------------- /dialogue-summarization/SummScreen_TVMegaSite/README.txt: -------------------------------------------------------------------------------- 1 | Each line in the files is a dictionary object with three keys: "Recap" (i.e., short summary), "Transcript", and "filename", where "filename" contains information about show title, season and episode number (e.g., "Alias_01x02.json" suggests the 2nd episode in the 1st season for the show "Alias"). "*_anonymize_*.json" contains the anonymized instances. 
The recaps and transcripts are already tokenized using spaCy and segmented into subword units (using https://github.com/rsennrich/subword-nmt). 2 | 3 | More details are in our paper: 4 | SummScreen: A Dataset for Abstractive Screenplay Summarization 5 | Mingda Chen, Zewei Chu, Sam Wiseman, Kevin Gimpel 6 | https://arxiv.org/abs/2104.07091 7 | 8 | Mingda Chen 9 | 04/20/2021 10 | -------------------------------------------------------------------------------- /figures/DialogStudio_Quality_Scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/DialogStudio/3e54576b4db71c56468ae4591a94ce9e419ed01c/figures/DialogStudio_Quality_Scores.png -------------------------------------------------------------------------------- /figures/DialogStudio_Stats.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/DialogStudio/3e54576b4db71c56468ae4591a94ce9e419ed01c/figures/DialogStudio_Stats.png -------------------------------------------------------------------------------- /figures/logo-color.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/DialogStudio/3e54576b4db71c56468ae4591a94ce9e419ed01c/figures/logo-color.png -------------------------------------------------------------------------------- /figures/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/DialogStudio/3e54576b4db71c56468ae4591a94ce9e419ed01c/figures/logo.png -------------------------------------------------------------------------------- /knowledge-grounded-dialogues/CoQA/LICENSE.txt: -------------------------------------------------------------------------------- 1 | CoQA contains passages from seven domains. 
We make five of these public under the following licenses: 2 | 3 | Literature and Wikipedia passages are shared under CC BY-SA 4.0 license. 4 | Children's stories are collected from MCTest which comes with MSR-LA license. 5 | Middle/High school exam passages are collected from RACE which comes with its own license. 6 | News passages are collected from the DeepMind CNN dataset which comes with Apache license. -------------------------------------------------------------------------------- /knowledge-grounded-dialogues/CoSQL/LICENSE.md: -------------------------------------------------------------------------------- 1 | Creative Commons License
    This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License. -------------------------------------------------------------------------------- /knowledge-grounded-dialogues/CoSQL/README.md: -------------------------------------------------------------------------------- 1 | # What is CoSQL? 2 | 3 | CoSQL is a corpus for building cross-domain Conversational text-to-SQL systems. It is the dialogue version of the Spider and SParC tasks. CoSQL consists of 30k+ turns plus 10k+ annotated SQL queries, obtained from a Wizard-of-Oz collection of 3k dialogues querying 200 complex databases spanning 138 domains. Each dialogue simulates a real-world DB query scenario with a crowd worker as a user exploring the database and a SQL expert retrieving answers with SQL, clarifying ambiguous questions, or otherwise informing of unanswerable questions. 4 | 5 | # CoSQL includes three tasks: 6 | - SQL-grounded dialogue state tracking to map user utterances into SQL queries if possible given the interaction history 7 | - natural language response generation based on an executed SQL and its results for user verification 8 | - user dialogue act prediction to detect and resolve ambiguous and unanswerable questions 9 | 10 | # Citation: 11 | ```commandline 12 | @article{yu2019cosql, 13 | title={Cosql: A conversational text-to-sql challenge towards cross-domain natural language interfaces to databases}, 14 | author={Yu, Tao and Zhang, Rui and Er, He Yang and Li, Suyi and Xue, Eric and Pang, Bo and Lin, Xi Victoria and Tan, Yi Chern and Shi, Tianze and Li, Zihan and others}, 15 | journal={arXiv preprint arXiv:1909.05378}, 16 | year={2019} 17 | } 18 | ``` 19 | 20 | -------------------------------------------------------------------------------- /knowledge-grounded-dialogues/CompWebQ/README.md: -------------------------------------------------------------------------------- 1 | ## A dataset for answering complex questions that require reasoning over multiple 
web snippets. 2 | 3 | ComplexWebQuestions is a new dataset that contains a large set of complex questions in natural language, and can be used in multiple ways: 4 | 5 | By interacting with a search engine, which is the focus of our paper (Talmor and Berant, 2018); 6 | 7 | As a reading comprehension task: we release 12,725,989 web snippets that are relevant for the questions, and were collected during the development of our model; 8 | 9 | As a semantic parsing task: each question is paired with a SPARQL query that can be executed against Freebase to retrieve the answer. 10 | 11 | 12 | ### Citation 13 | 14 | 15 | ``` 16 | @inproceedings{talmor18compwebq, 17 | author = {A. Talmor and J. Berant}, 18 | booktitle = {North American Association for Computational Linguistics (NAACL)}, 19 | title = {The Web as a Knowledge-base for Answering Complex Questions}, 20 | year = {2018}, 21 | } 22 | ``` -------------------------------------------------------------------------------- /knowledge-grounded-dialogues/CompWebQ/README.txt: -------------------------------------------------------------------------------- 1 | WebComplexQuestions - v1.1.0 - 2018-06-29 2 | ---------------------------------------------- 3 | 4 | This package contains ComplexWebQuestions, a dataset that contains a large set of complex questions in natural language. 5 | 6 | (c) 2018. Alon Talmor, Tel-Aviv University. 7 | 8 | LICENSE 9 | 10 | The software is licensed under the full GPL v2+. Please see the file LICENCE.txt 11 | 12 | For more information, bug reports, and fixes, contact: 13 | Alon Talmor 14 | alontalmor@mail.tau.ac.il 15 | 16 | 17 | CONTACT 18 | 19 | For questions about this distribution, please contact Tel-Aviv University NLP group 20 | at alontalmor@mail.tau.ac.il. We provide assistance on a best-effort 21 | basis. 22 | 23 | 24 | QUESTION FILES 25 | 26 | The dataset contains 34,689 examples divided into 27,734 train, 3,480 dev, 3,475 test. 
27 | each containing: 28 | 29 | "ID": The unique ID of the example; 30 | "webqsp_ID": The original WebQuestionsSP ID from which the question was constructed; 31 | "webqsp_question": The WebQuestionsSP Question from which the question was constructed; 32 | "machine_question": The artificial complex question, before paraphrasing; 33 | "question": The natural language complex question; 34 | "sparql": Freebase SPARQL query for the question. Note that the SPARQL was constructed for the machine question, the actual question after paraphrasing 35 | may differ from the SPARQL. 36 | "compositionality_type": An estimation of the type of compositionality. {composition, conjunction, comparative, superlative}. The estimation has not been manually verified, 37 | the question after paraphrasing may differ from this estimation. 38 | "answers": a list of answers each containing answer: the actual answer; answer_id: the Freebase answer id; aliases: freebase extracted aliases for the answer. 39 | "created": creation time 40 | 41 | NOTE: test set does not contain "answer" field. For test evaluation please send email to 42 | alontalmor@mail.tau.ac.il. 43 | 44 | 45 | WEB SNIPPET FILES 46 | 47 | 48 | The snippets files consist of 12,725,989 snippets each containing 49 | PLEASE DON'T USE CHROME WHEN DOWNLOADING THESE FROM DROPBOX (THE UNZIP COULD FAIL) 50 | 51 | "question_ID": the ID of related question, containing at least 3 instances of the same ID (full question, split1, split2); 52 | "question": The natural language complex question; 53 | "web_query": Query sent to the search engine. 
54 | “split_source”: 'noisy supervision split' or ‘ptrnet split’, please train on examples containing “ptrnet split” when comparing to Split+Decomp from https://arxiv.org/abs/1807.09623 55 | “split_type”: 'full_question' or ‘split_part1' or ‘split_part2’ please use ‘composition_answer’ in question of type composition and split_type: “split_part1” when training a reading comprehension model on splits as in Split+Decomp from https://arxiv.org/abs/1807.09623 (in the rest of the cases use the original answer). 56 | "web_snippets": ~100 web snippets per query. Each snippet includes Title,Snippet. They are ordered according to Google results. 57 | 58 | With a total of 59 | 10,035,571 training set snippets 60 | 1,350,950 dev set snippets 61 | 1,339,468 test set snippets 62 | 63 | 64 | -------------------- 65 | CHANGES 66 | -------------------- 67 | 68 | 2018-06-29 1.1 Second release 69 | The Question file format remains the same, except that we added an additional field related to the answer of decomposed questions. Average number of snippets per question increased. 
See https://arxiv.org/abs/1807.09623 70 | 71 | 2018-03-01 1.0 Initial release 72 | 73 | -------------------------------------------------------------------------------- /knowledge-grounded-dialogues/DART/LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Language, Information, and Learning at Yale 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /knowledge-grounded-dialogues/FeTaQA/README.md: -------------------------------------------------------------------------------- 1 | # FeTaQA: Free-form Table Question Answering 2 | 3 | FeTaQA is a **F**r**e**e-form **Ta**ble **Q**uestion **A**nswering dataset with 10K Wikipedia-based {*table, question, free-form answer, supporting table cells*} pairs. 
It yields a more challenging table QA setting because it requires generating free-form text answers after retrieval, inference, and integration of multiple discontinuous facts from a structured knowledge source. Unlike datasets of generative QA over text in which answers are prevalent with copies of short text spans from the source, answers in our dataset are human-generated explanations involving entities and their high-level relations. 4 | 5 | You can find more details, analyses, and baseline results in [our paper](https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00446/109273/FeTaQA-Free-form-Table-Question-Answering). 6 | 7 | # Baselines 8 | 9 | ## T5 end2end model 10 | Script adapted from [huggingface examples](https://github.com/huggingface/transformers/blob/master/examples/seq2seq/run_seq2seq.py). 11 | 12 | ``` 13 | cd end2end 14 | conda create env -f env.yml 15 | conda activate fetaqa-e2e 16 | ``` 17 | Then, convert dataset format from jsonl to json `python dataset_format.py inputdir outputdir`. 18 | 19 | (Preprocessed version can be found in `end2end/data`) 20 | 21 | Choose a config json file from `end2end/config`, then 22 | 23 | ``` 24 | #supports multi-gpu 25 | export CUDA_VISIBLE_DEVICES=0,1,2,3 26 | python train.py configs/t5-large.json 27 | ``` 28 | More details about the config setup can be found [here](https://github.com/Yale-LILY/FeTaQA/tree/main/end2end). 29 | 30 | ## TAPAS Pipeline Model 31 | To be released... 32 | 33 | 34 | ## License 35 | Shield: [![CC BY-SA 4.0][cc-by-sa-shield]][cc-by-sa] 36 | 37 | The FeTaQA dataset is distributed under a 38 | [Creative Commons Attribution-ShareAlike 4.0 International License][cc-by-sa]. 
39 | 40 | [![CC BY-SA 4.0][cc-by-sa-image]][cc-by-sa] 41 | 42 | [cc-by-sa]: http://creativecommons.org/licenses/by-sa/4.0/ 43 | [cc-by-sa-image]: https://licensebuttons.net/l/by-sa/4.0/88x31.png 44 | [cc-by-sa-shield]: https://img.shields.io/badge/License-CC%20BY--SA%204.0-lightgrey.svg 45 | 46 | 47 | ## Citation 48 | ```bibtex 49 | @article{Nan2021FeTaQAFT, 50 | title={FeTaQA: Free-form Table Question Answering}, 51 | author={Nan, Linyong and Hsieh, Chiachun and Mao, Ziming and Lin, Xi Victoria and Verma, Neha and Zhang, Rui and Kryściński, Wojciech and Schoelkopf, Hailey and Kong, Riley and Tang, Xiangru and Mutuma, Mutethia and Rosand, Ben and Trindade, Isabel and Bandaru, Renusree and Cunningham, Jacob and Xiong, Caiming and Radev, Dragomir}, 52 | journal={Transactions of the Association for Computational Linguistics}, 53 | year={2022}, 54 | volume={10}, 55 | pages={35-49} 56 | } 57 | ``` 58 | -------------------------------------------------------------------------------- /knowledge-grounded-dialogues/HybridQA/LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 wenhu chen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /knowledge-grounded-dialogues/MTOP/README.md: -------------------------------------------------------------------------------- 1 | ## Multilingual TOP dataset for semantic parsing 2 | 3 | This repository contains the multilingual TOP dataset created in the paper: 4 | 5 | _"Multilingual Neural Semantic Parsing for Low-Resourced Languages". Menglin Xia, Emilio Monti. *SEM2021._ [\[arxiv\]](https://arxiv.org/abs/2106.03469) 6 | 7 | Please cite our paper if you use this dataset. 8 | 9 | 10 | ### Description 11 | 12 | The multilingual TOP dataset is a multilingual semantic parsing dataset in English, Italian and Japanese, based on the public Facebook Task Oriented 13 | Parsing (TOP) dataset in English. The original TOP dataset can be found here: [\[paper\]](https://research.fb.com/publications/semantic-parsing-for-task-oriented-dialog-using-hierarchical-representations/) [\[data\]](http://fb.me/semanticparsingdialog). 14 | 15 | The Multilingual TOP dataset contains ˜30k training and validation data and ˜8k test data in English, Italian and Japanese. The Italian and Japanese training and validation data are machine-translated from English TOP, and the test data are manually translated. 16 | 17 | 18 | ### Abstract 19 | 20 | Multilingual semantic parsing is a cost-effective method that allows a single model to understand different languages. 
However, researchers face a great imbalance of availability of training data, with English being resource rich, and other languages having much less data. To tackle the data limitation problem, we propose using machine translation to bootstrap multilingual training data from the more abundant English data. To compensate for the data quality of machine translated training data, we utilize transfer learning from pretrained multilingual encoders to further improve the model. To evaluate our multilingual models on human-written sentences as opposed to machine translated ones, we introduce a new multilingual semantic parsing dataset in English, Italian and Japanese based on the Facebook Task Oriented Parsing (TOP) dataset. We show that joint multilingual training with pretrained encoders substantially outperforms our baselines on the TOP dataset and outperforms the state-of-the-art model on the public NLMaps dataset. We also establish a new baseline for zero-shot learning on the TOP dataset. We find that a semantic parser trained only on English data achieves a zero-shot performance of 44.9% exact-match accuracy on Italian sentences. 21 | 22 | ### Directories 23 | 24 | The `raw_test_data` directory contains the manually translated test sets in Italian and Japanese in xml. 25 | 26 | The `processed_data` directory contains the processed multilingual TOP semantic parsing data (train, dev, test) used in the paper. 27 | 28 | ### Citation 29 | 30 | If you use the dataset, you can use the following citation: 31 | ``` 32 | @inproceedings{xia2021multilingual, 33 | title={Multilingual Neural Semantic Parsing for Low-Resourced Languages}, 34 | author={Xia, Menglin and Monti, Emilio}, 35 | booktitle={The Tenth Joint Conference on Lexical and Computational Semantics}, 36 | year = {2021} 37 | } 38 | ``` 39 | 40 | 41 | ## License Summary 42 | 43 | The documentation is made available under the Creative Commons Attribution-ShareAlike 4.0 International License. See the LICENSE file. 
44 | 45 | The sample code within this documentation is made available under the MIT-0 license. See the LICENSE-SAMPLECODE file. 46 | 47 | -------------------------------------------------------------------------------- /knowledge-grounded-dialogues/README.md: -------------------------------------------------------------------------------- 1 | ### Knowledge Grounded Dialogues 2 | 3 | Below is a general format for knowledge-grounded dialogues: 4 | ```js 5 | { 6 | "dataset_name--train/val/test--dialog_id": { 7 | "original dialog id": str, 8 | "dialog index": int, 9 | "original dialog info": dict, 10 | "log": [ 11 | { 12 | "turn id": int, 13 | "user utterance": str, 14 | "system response": str, 15 | "dialog history": str, 16 | "original user side information": dict, 17 | "original system side information": dict, 18 | "external knowledge": str, 19 | }, 20 | ... 21 | ] 22 | // "prompt": list, # To be added 23 | }, 24 | ... 25 | } 26 | ``` 27 | Please refer to each dataset folder for more details. 28 | 29 | 30 | Acknowledgement: Under this folder, a portion of the datasets has been further refined based on the work done by [UnifiedSKG](https://github.com/HKUNLP/UnifiedSKG). We extend our profound appreciation for their valuable work. If you use their work, please also give them due citation. Thank you! 31 | 32 | -------------------------------------------------------------------------------- /knowledge-grounded-dialogues/SParC/LICENSE.md: -------------------------------------------------------------------------------- 1 | Creative Commons License
    This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License. -------------------------------------------------------------------------------- /knowledge-grounded-dialogues/SQA/LICENSE.md: -------------------------------------------------------------------------------- 1 | Creative Commons License
    This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License. -------------------------------------------------------------------------------- /knowledge-grounded-dialogues/ToTTo/LICENSE.md: -------------------------------------------------------------------------------- 1 | Creative Commons License
    This work is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License. -------------------------------------------------------------------------------- /knowledge-grounded-dialogues/WebQSP/README.txt: -------------------------------------------------------------------------------- 1 | List of files: 2 | 3 | ReadMe.txt - This file 4 | doc/WebQSP.pdf - The main document that details the usage, format and other specifics about this WebQuestionsSP dataset 5 | doc/LabelingInstructions.pdf - The annotation guidelines 6 | data/WebQSP.train.json - The training set of the primary dataset 7 | data/WebQSP.test.json - The testing set of the primary dataset 8 | data/WebQSP.train.partial.json - Partial annotations to bad or descriptive questions in the original training set 9 | data/WebQSP.test.partial.json - Partial annotations to bad or descriptive questions in the original testing set 10 | eval/eval.py - The evaluation script in Python 11 | eval/Pred.sem.json - Output of the STAGG system trained using the full semantic parses 12 | 13 | For evaluation, please use "WebQSP.test.json". Detailed descriptions of the roles of WebQSP.[train|test].[partial|_].json can be found in "WebQSP.pdf". 14 | 15 | Usage of the evaluation script: 16 | python eval.py goldData predAnswers 17 | 18 | $ python eval/eval.py data/WebQSP.test.json eval/Pred.sem.json 19 | Number of questions: 1639 20 | Average precision over questions: 0.709 21 | Average recall over questions: 0.803 22 | Average f1 over questions (accuracy): 0.717 23 | F1 of average recall and average precision: 0.753 24 | True accuracy (ratio of questions answered exactly correctly): 0.639 25 | -------------------------------------------------------------------------------- /knowledge-grounded-dialogues/WikiSQL/LICENSE.txt: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2017, Salesforce Research 4 | All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /natural-language-understanding/ATIS-NER/README.md: -------------------------------------------------------------------------------- 1 | # The ATIS (Airline Travel Information System) Dataset 2 | 3 | The ATIS dataset is formatted based on [ATIS_dataset](https://github.com/howl-anderson/ATIS_dataset), we follow [Few-Shot-Intent-Detection](https://github.com/jianguoz/Few-Shot-Intent-Detection) to split 500 examples from 'train' into the 'validation' set. You can merge 'validation' back to 'train' if you don't need the validation set. 4 | 5 | -------------------------------------------------------------------------------- /natural-language-understanding/ATIS-NER/original_examples.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "input": "i want to fly from baltimore to dallas round trip\n", 4 | "output": "O O O O O B-fromloc.city_name O B-toloc.city_name B-round_trip I-round_trip\n" 5 | }, 6 | { 7 | "input": "round trip fares from baltimore to philadelphia less than 1000 dollars round trip fares from denver to philadelphia less than 1000 dollars round trip fares from pittsburgh to philadelphia less than 1000 dollars\n", 8 | "output": "B-round_trip I-round_trip O O B-fromloc.city_name O B-toloc.city_name B-cost_relative O B-fare_amount I-fare_amount B-round_trip I-round_trip O O B-fromloc.city_name O B-toloc.city_name B-cost_relative O B-fare_amount I-fare_amount B-round_trip I-round_trip O O B-fromloc.city_name O B-toloc.city_name B-cost_relative O B-fare_amount I-fare_amount\n" 9 | }, 10 | { 11 | "input": "show me the flights arriving on baltimore on june fourteenth\n", 12 | "output": "O O O O O O B-toloc.city_name O B-arrive_date.month_name B-arrive_date.day_number\n" 13 | }, 14 | { 15 | "input": "what are the flights which depart from san francisco fly to washington via indianapolis and arrive by 9 pm\n", 16 | "output": "O O O 
O O O O B-fromloc.city_name I-fromloc.city_name O O B-toloc.city_name O B-stoploc.city_name O O B-arrive_time.time_relative B-arrive_time.time I-arrive_time.time\n" 17 | }, 18 | { 19 | "input": "which airlines fly from boston to washington dc via other cities\n", 20 | "output": "O O O O B-fromloc.city_name O B-toloc.city_name B-toloc.state_code O O O\n" 21 | } 22 | ] -------------------------------------------------------------------------------- /natural-language-understanding/ATIS/README.md: -------------------------------------------------------------------------------- 1 | # The ATIS (Airline Travel Information System) Dataset 2 | 3 | The ATIS dataset is formatted based on [ATIS_dataset](https://github.com/howl-anderson/ATIS_dataset), we follow [Few-Shot-Intent-Detection](https://github.com/jianguoz/Few-Shot-Intent-Detection) to split 500 examples from 'train' into the 'validation' set. You can merge 'validation' back to 'train' if you don't need the validation set. 4 | 5 | -------------------------------------------------------------------------------- /natural-language-understanding/ATIS/converted_examples.json: -------------------------------------------------------------------------------- 1 | { 2 | "ATIS--train--1": { 3 | "original dialog id": "", 4 | "dialog index": 1, 5 | "original dialog info": { 6 | "task": "Intent Detection. The system response is the corresponding intent." 7 | }, 8 | "log": { 9 | "turn id": 1, 10 | "user utterance": "i want to fly from baltimore to dallas round trip", 11 | "system response": "atis_flight", 12 | "dialog history": "", 13 | "original user side information": {}, 14 | "original system side information": {}, 15 | "external knowledge": "" 16 | } 17 | }, 18 | "ATIS--train--2": { 19 | "original dialog id": "", 20 | "dialog index": 2, 21 | "original dialog info": { 22 | "task": "Intent Detection. The system response is the corresponding intent." 
23 | }, 24 | "log": { 25 | "turn id": 1, 26 | "user utterance": "round trip fares from baltimore to philadelphia less than 1000 dollars round trip fares from denver to philadelphia less than 1000 dollars round trip fares from pittsburgh to philadelphia less than 1000 dollars", 27 | "system response": "atis_airfare", 28 | "dialog history": "", 29 | "original user side information": {}, 30 | "original system side information": {}, 31 | "external knowledge": "" 32 | } 33 | }, 34 | "ATIS--train--3": { 35 | "original dialog id": "", 36 | "dialog index": 3, 37 | "original dialog info": { 38 | "task": "Intent Detection. The system response is the corresponding intent." 39 | }, 40 | "log": { 41 | "turn id": 1, 42 | "user utterance": "show me the flights arriving on baltimore on june fourteenth", 43 | "system response": "atis_flight", 44 | "dialog history": "", 45 | "original user side information": {}, 46 | "original system side information": {}, 47 | "external knowledge": "" 48 | } 49 | }, 50 | "ATIS--train--4": { 51 | "original dialog id": "", 52 | "dialog index": 4, 53 | "original dialog info": { 54 | "task": "Intent Detection. The system response is the corresponding intent." 55 | }, 56 | "log": { 57 | "turn id": 1, 58 | "user utterance": "what are the flights which depart from san francisco fly to washington via indianapolis and arrive by 9 pm", 59 | "system response": "atis_flight", 60 | "dialog history": "", 61 | "original user side information": {}, 62 | "original system side information": {}, 63 | "external knowledge": "" 64 | } 65 | }, 66 | "ATIS--train--5": { 67 | "original dialog id": "", 68 | "dialog index": 5, 69 | "original dialog info": { 70 | "task": "Intent Detection. The system response is the corresponding intent." 
71 | }, 72 | "log": { 73 | "turn id": 1, 74 | "user utterance": "which airlines fly from boston to washington dc via other cities", 75 | "system response": "atis_airline", 76 | "dialog history": "", 77 | "original user side information": {}, 78 | "original system side information": {}, 79 | "external knowledge": "" 80 | } 81 | } 82 | } -------------------------------------------------------------------------------- /natural-language-understanding/ATIS/original_examples.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "input": "i want to fly from baltimore to dallas round trip", 4 | "output": "atis_flight" 5 | }, 6 | { 7 | "input": "round trip fares from baltimore to philadelphia less than 1000 dollars round trip fares from denver to philadelphia less than 1000 dollars round trip fares from pittsburgh to philadelphia less than 1000 dollars", 8 | "output": "atis_airfare" 9 | }, 10 | { 11 | "input": "show me the flights arriving on baltimore on june fourteenth", 12 | "output": "atis_flight" 13 | }, 14 | { 15 | "input": "what are the flights which depart from san francisco fly to washington via indianapolis and arrive by 9 pm", 16 | "output": "atis_flight" 17 | }, 18 | { 19 | "input": "which airlines fly from boston to washington dc via other cities", 20 | "output": "atis_airline" 21 | } 22 | ] -------------------------------------------------------------------------------- /natural-language-understanding/BANKING77-OOS/converted_examples.json: -------------------------------------------------------------------------------- 1 | { 2 | "BANKING77-OOS--train--1": { 3 | "original dialog id": "", 4 | "dialog index": 1, 5 | "original dialog info": { 6 | "task": "Intent Detection. The system response is the corresponding intent." 
7 | }, 8 | "log": { 9 | "turn id": 1, 10 | "user utterance": "where is the tracking number for the card?", 11 | "system response": "card_arrival", 12 | "dialog history": "", 13 | "original user side information": {}, 14 | "original system side information": {}, 15 | "external knowledge": "" 16 | } 17 | }, 18 | "BANKING77-OOS--train--2": { 19 | "original dialog id": "", 20 | "dialog index": 2, 21 | "original dialog info": { 22 | "task": "Intent Detection. The system response is the corresponding intent." 23 | }, 24 | "log": { 25 | "turn id": 1, 26 | "user utterance": "can the card was sen to me be tracked?", 27 | "system response": "card_arrival", 28 | "dialog history": "", 29 | "original user side information": {}, 30 | "original system side information": {}, 31 | "external knowledge": "" 32 | } 33 | }, 34 | "BANKING77-OOS--train--3": { 35 | "original dialog id": "", 36 | "dialog index": 3, 37 | "original dialog info": { 38 | "task": "Intent Detection. The system response is the corresponding intent." 39 | }, 40 | "log": { 41 | "turn id": 1, 42 | "user utterance": "i haven't gotten my credit card in the mail.", 43 | "system response": "card_arrival", 44 | "dialog history": "", 45 | "original user side information": {}, 46 | "original system side information": {}, 47 | "external knowledge": "" 48 | } 49 | }, 50 | "BANKING77-OOS--train--4": { 51 | "original dialog id": "", 52 | "dialog index": 4, 53 | "original dialog info": { 54 | "task": "Intent Detection. The system response is the corresponding intent." 55 | }, 56 | "log": { 57 | "turn id": 1, 58 | "user utterance": "my card still hasn't been delivered", 59 | "system response": "card_arrival", 60 | "dialog history": "", 61 | "original user side information": {}, 62 | "original system side information": {}, 63 | "external knowledge": "" 64 | } 65 | }, 66 | "BANKING77-OOS--train--5": { 67 | "original dialog id": "", 68 | "dialog index": 5, 69 | "original dialog info": { 70 | "task": "Intent Detection. 
The system response is the corresponding intent." 71 | }, 72 | "log": { 73 | "turn id": 1, 74 | "user utterance": "how to track the card you sent", 75 | "system response": "card_arrival", 76 | "dialog history": "", 77 | "original user side information": {}, 78 | "original system side information": {}, 79 | "external knowledge": "" 80 | } 81 | } 82 | } -------------------------------------------------------------------------------- /natural-language-understanding/BANKING77-OOS/original_examples.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "input": "where is the tracking number for the card?\n", 4 | "output": "card_arrival\n" 5 | }, 6 | { 7 | "input": "can the card was sen to me be tracked?\n", 8 | "output": "card_arrival\n" 9 | }, 10 | { 11 | "input": "i haven't gotten my credit card in the mail.\n", 12 | "output": "card_arrival\n" 13 | }, 14 | { 15 | "input": "my card still hasn't been delivered\n", 16 | "output": "card_arrival\n" 17 | }, 18 | { 19 | "input": "how to track the card you sent\n", 20 | "output": "card_arrival\n" 21 | } 22 | ] -------------------------------------------------------------------------------- /natural-language-understanding/BANKING77/converted_examples.json: -------------------------------------------------------------------------------- 1 | { 2 | "BANKING77--train--1": { 3 | "original dialog id": "", 4 | "dialog index": 1, 5 | "original dialog info": { 6 | "task": "Intent Detection. The system response is the corresponding intent." 7 | }, 8 | "log": { 9 | "turn id": 1, 10 | "user utterance": "i am still waiting on my card?", 11 | "system response": "card_arrival", 12 | "dialog history": "", 13 | "original user side information": {}, 14 | "original system side information": {}, 15 | "external knowledge": "" 16 | } 17 | }, 18 | "BANKING77--train--2": { 19 | "original dialog id": "", 20 | "dialog index": 2, 21 | "original dialog info": { 22 | "task": "Intent Detection. 
The system response is the corresponding intent." 23 | }, 24 | "log": { 25 | "turn id": 1, 26 | "user utterance": "what can i do if my card still hasn't arrived after 2 weeks?", 27 | "system response": "card_arrival", 28 | "dialog history": "", 29 | "original user side information": {}, 30 | "original system side information": {}, 31 | "external knowledge": "" 32 | } 33 | }, 34 | "BANKING77--train--3": { 35 | "original dialog id": "", 36 | "dialog index": 3, 37 | "original dialog info": { 38 | "task": "Intent Detection. The system response is the corresponding intent." 39 | }, 40 | "log": { 41 | "turn id": 1, 42 | "user utterance": "i have been waiting over a week. is the card still coming?", 43 | "system response": "card_arrival", 44 | "dialog history": "", 45 | "original user side information": {}, 46 | "original system side information": {}, 47 | "external knowledge": "" 48 | } 49 | }, 50 | "BANKING77--train--4": { 51 | "original dialog id": "", 52 | "dialog index": 4, 53 | "original dialog info": { 54 | "task": "Intent Detection. The system response is the corresponding intent." 55 | }, 56 | "log": { 57 | "turn id": 1, 58 | "user utterance": "can i track my card while it is in the process of delivery?", 59 | "system response": "card_arrival", 60 | "dialog history": "", 61 | "original user side information": {}, 62 | "original system side information": {}, 63 | "external knowledge": "" 64 | } 65 | }, 66 | "BANKING77--train--5": { 67 | "original dialog id": "", 68 | "dialog index": 5, 69 | "original dialog info": { 70 | "task": "Intent Detection. The system response is the corresponding intent." 
71 | }, 72 | "log": { 73 | "turn id": 1, 74 | "user utterance": "how do i know if i will get my card, or if it is lost?", 75 | "system response": "card_arrival", 76 | "dialog history": "", 77 | "original user side information": {}, 78 | "original system side information": {}, 79 | "external knowledge": "" 80 | } 81 | } 82 | } -------------------------------------------------------------------------------- /natural-language-understanding/BANKING77/original_examples.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "input": "i am still waiting on my card?", 4 | "output": "card_arrival" 5 | }, 6 | { 7 | "input": "what can i do if my card still hasn't arrived after 2 weeks?", 8 | "output": "card_arrival" 9 | }, 10 | { 11 | "input": "i have been waiting over a week. is the card still coming?", 12 | "output": "card_arrival" 13 | }, 14 | { 15 | "input": "can i track my card while it is in the process of delivery?", 16 | "output": "card_arrival" 17 | }, 18 | { 19 | "input": "how do i know if i will get my card, or if it is lost?", 20 | "output": "card_arrival" 21 | } 22 | ] -------------------------------------------------------------------------------- /natural-language-understanding/CLINC-Single-Domain-OOS-banking/converted_examples.json: -------------------------------------------------------------------------------- 1 | { 2 | "CLINC-Single-Domain-OOS-banking--train--1": { 3 | "original dialog id": "", 4 | "dialog index": 1, 5 | "original dialog info": { 6 | "task": "Intent Detection. The system response is the corresponding intent." 
7 | }, 8 | "log": { 9 | "turn id": 1, 10 | "user utterance": "may i get all of the food transactions that were made last month", 11 | "system response": "transactions", 12 | "dialog history": "", 13 | "original user side information": {}, 14 | "original system side information": {}, 15 | "external knowledge": "" 16 | } 17 | }, 18 | "CLINC-Single-Domain-OOS-banking--train--2": { 19 | "original dialog id": "", 20 | "dialog index": 2, 21 | "original dialog info": { 22 | "task": "Intent Detection. The system response is the corresponding intent." 23 | }, 24 | "log": { 25 | "turn id": 1, 26 | "user utterance": "looking at january, show all wine purchases", 27 | "system response": "transactions", 28 | "dialog history": "", 29 | "original user side information": {}, 30 | "original system side information": {}, 31 | "external knowledge": "" 32 | } 33 | }, 34 | "CLINC-Single-Domain-OOS-banking--train--3": { 35 | "original dialog id": "", 36 | "dialog index": 3, 37 | "original dialog info": { 38 | "task": "Intent Detection. The system response is the corresponding intent." 39 | }, 40 | "log": { 41 | "turn id": 1, 42 | "user utterance": "what was my last transaction", 43 | "system response": "transactions", 44 | "dialog history": "", 45 | "original user side information": {}, 46 | "original system side information": {}, 47 | "external knowledge": "" 48 | } 49 | }, 50 | "CLINC-Single-Domain-OOS-banking--train--4": { 51 | "original dialog id": "", 52 | "dialog index": 4, 53 | "original dialog info": { 54 | "task": "Intent Detection. The system response is the corresponding intent." 
55 | }, 56 | "log": { 57 | "turn id": 1, 58 | "user utterance": "can you let me know my latest transactions", 59 | "system response": "transactions", 60 | "dialog history": "", 61 | "original user side information": {}, 62 | "original system side information": {}, 63 | "external knowledge": "" 64 | } 65 | }, 66 | "CLINC-Single-Domain-OOS-banking--train--5": { 67 | "original dialog id": "", 68 | "dialog index": 5, 69 | "original dialog info": { 70 | "task": "Intent Detection. The system response is the corresponding intent." 71 | }, 72 | "log": { 73 | "turn id": 1, 74 | "user utterance": "show me my transactions on mcdonalds", 75 | "system response": "transactions", 76 | "dialog history": "", 77 | "original user side information": {}, 78 | "original system side information": {}, 79 | "external knowledge": "" 80 | } 81 | } 82 | } -------------------------------------------------------------------------------- /natural-language-understanding/CLINC-Single-Domain-OOS-banking/original_examples.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "input": "may i get all of the food transactions that were made last month\n", 4 | "output": "transactions\n" 5 | }, 6 | { 7 | "input": "looking at january, show all wine purchases\n", 8 | "output": "transactions\n" 9 | }, 10 | { 11 | "input": "what was my last transaction\n", 12 | "output": "transactions\n" 13 | }, 14 | { 15 | "input": "can you let me know my latest transactions\n", 16 | "output": "transactions\n" 17 | }, 18 | { 19 | "input": "show me my transactions on mcdonalds\n", 20 | "output": "transactions\n" 21 | } 22 | ] -------------------------------------------------------------------------------- /natural-language-understanding/CLINC-Single-Domain-OOS-credit_cards/converted_examples.json: -------------------------------------------------------------------------------- 1 | { 2 | "CLINC-Single-Domain-OOS-credit_cards--train--1": { 3 | "original dialog id": "", 4 | 
"dialog index": 1, 5 | "original dialog info": { 6 | "task": "Intent Detection. The system response is the corresponding intent." 7 | }, 8 | "log": { 9 | "turn id": 1, 10 | "user utterance": "i wanna know my credit rating now", 11 | "system response": "credit_score", 12 | "dialog history": "", 13 | "original user side information": {}, 14 | "original system side information": {}, 15 | "external knowledge": "" 16 | } 17 | }, 18 | "CLINC-Single-Domain-OOS-credit_cards--train--2": { 19 | "original dialog id": "", 20 | "dialog index": 2, 21 | "original dialog info": { 22 | "task": "Intent Detection. The system response is the corresponding intent." 23 | }, 24 | "log": { 25 | "turn id": 1, 26 | "user utterance": "i really wanna know my credit score", 27 | "system response": "credit_score", 28 | "dialog history": "", 29 | "original user side information": {}, 30 | "original system side information": {}, 31 | "external knowledge": "" 32 | } 33 | }, 34 | "CLINC-Single-Domain-OOS-credit_cards--train--3": { 35 | "original dialog id": "", 36 | "dialog index": 3, 37 | "original dialog info": { 38 | "task": "Intent Detection. The system response is the corresponding intent." 39 | }, 40 | "log": { 41 | "turn id": 1, 42 | "user utterance": "whats my credit rating", 43 | "system response": "credit_score", 44 | "dialog history": "", 45 | "original user side information": {}, 46 | "original system side information": {}, 47 | "external knowledge": "" 48 | } 49 | }, 50 | "CLINC-Single-Domain-OOS-credit_cards--train--4": { 51 | "original dialog id": "", 52 | "dialog index": 4, 53 | "original dialog info": { 54 | "task": "Intent Detection. The system response is the corresponding intent." 
55 | }, 56 | "log": { 57 | "turn id": 1, 58 | "user utterance": "can you tell me my credit score", 59 | "system response": "credit_score", 60 | "dialog history": "", 61 | "original user side information": {}, 62 | "original system side information": {}, 63 | "external knowledge": "" 64 | } 65 | }, 66 | "CLINC-Single-Domain-OOS-credit_cards--train--5": { 67 | "original dialog id": "", 68 | "dialog index": 5, 69 | "original dialog info": { 70 | "task": "Intent Detection. The system response is the corresponding intent." 71 | }, 72 | "log": { 73 | "turn id": 1, 74 | "user utterance": "please look up my credit score", 75 | "system response": "credit_score", 76 | "dialog history": "", 77 | "original user side information": {}, 78 | "original system side information": {}, 79 | "external knowledge": "" 80 | } 81 | } 82 | } -------------------------------------------------------------------------------- /natural-language-understanding/CLINC-Single-Domain-OOS-credit_cards/original_examples.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "input": "i wanna know my credit rating now\n", 4 | "output": "credit_score\n" 5 | }, 6 | { 7 | "input": "i really wanna know my credit score\n", 8 | "output": "credit_score\n" 9 | }, 10 | { 11 | "input": "whats my credit rating\n", 12 | "output": "credit_score\n" 13 | }, 14 | { 15 | "input": "can you tell me my credit score\n", 16 | "output": "credit_score\n" 17 | }, 18 | { 19 | "input": "please look up my credit score\n", 20 | "output": "credit_score\n" 21 | } 22 | ] -------------------------------------------------------------------------------- /natural-language-understanding/CLINC150/README.md: -------------------------------------------------------------------------------- 1 | # An Evaluation Dataset for Intent Classification and Out-of-Scope Prediction 2 | Repository that accompanies [An Evaluation Dataset for Intent Classification and Out-of-Scope 
Prediction](https://www.aclweb.org/anthology/D19-1131/). 3 | 4 | 5 | ## FAQs 6 | ### 1. What are the relevant files? 7 | See `data/data_full.json` for the "full" dataset. This is the dataset used in Table 1 (the "Full" columns). This file contains 150 "in-scope" intent classes, each with 100 train, 20 validation, and 30 test samples. There are 100 train and validation out-of-scope samples, and 1000 out-of-scope test samples. 8 | 9 | ### 2. What is the name of the dataset? 10 | The dataset was not given a name in the original paper, but [others](https://arxiv.org/pdf/2003.04807.pdf) have called it `CLINC150`. 11 | 12 | ### 3. What is this dataset for? 13 | This dataset is for evaluating the performance of intent classification systems in the presence of "out-of-scope" queries. By "out-of-scope", we mean queries that do not fall into any of the system-supported intent classes. Most datasets include only data that is "in-scope". Our dataset includes both in-scope and out-of-scope data. You might also know the term "out-of-scope" by other terms, including "out-of-domain" or "out-of-distribution". 14 | 15 | ### 4. What language is the dataset in? 16 | All queries are in English. 17 | 18 | ### 5. How does your dataset/evaluation handle multi-intent queries? 19 | All samples/queries in our dataset are single-intent samples. We consider the problem of multi-intent classification to be future work. 20 | 21 | ### 6. How did you gather the dataset? 22 | We used crowdsourcing to generate the dataset. We asked crowd workers to either paraphrase "seed" phrases, or respond to scenarios (e.g. "pretend you need to book a flight, what would you say?"). We used crowdsourcing to generate data for both in-scope and out-of-scope data. 
23 | 24 | ## Citation 25 | 26 | If you find our dataset useful, please be sure to cite: 27 | 28 | ``` 29 | @inproceedings{larson-etal-2019-evaluation, 30 | title = "An Evaluation Dataset for Intent Classification and Out-of-Scope Prediction", 31 | author = "Larson, Stefan and 32 | Mahendran, Anish and 33 | Peper, Joseph J. and 34 | Clarke, Christopher and 35 | Lee, Andrew and 36 | Hill, Parker and 37 | Kummerfeld, Jonathan K. and 38 | Leach, Kevin and 39 | Laurenzano, Michael A. and 40 | Tang, Lingjia and 41 | Mars, Jason", 42 | booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)", 43 | year = "2019", 44 | url = "https://www.aclweb.org/anthology/D19-1131" 45 | } 46 | ``` -------------------------------------------------------------------------------- /natural-language-understanding/CLINC150/converted_examples.json: -------------------------------------------------------------------------------- 1 | { 2 | "CLINC150--train--1": { 3 | "original dialog id": "", 4 | "dialog index": 1, 5 | "original dialog info": { 6 | "task": "Intent Detection. The system response is the corresponding intent." 7 | }, 8 | "log": { 9 | "turn id": 1, 10 | "user utterance": "what expression would i use to say i love you if i were an italian", 11 | "system response": "translate", 12 | "dialog history": "", 13 | "original user side information": {}, 14 | "original system side information": {}, 15 | "external knowledge": "" 16 | } 17 | }, 18 | "CLINC150--train--2": { 19 | "original dialog id": "", 20 | "dialog index": 2, 21 | "original dialog info": { 22 | "task": "Intent Detection. The system response is the corresponding intent." 
23 | }, 24 | "log": { 25 | "turn id": 1, 26 | "user utterance": "can you tell me how to say 'i do not speak much spanish', in spanish", 27 | "system response": "translate", 28 | "dialog history": "", 29 | "original user side information": {}, 30 | "original system side information": {}, 31 | "external knowledge": "" 32 | } 33 | }, 34 | "CLINC150--train--3": { 35 | "original dialog id": "", 36 | "dialog index": 3, 37 | "original dialog info": { 38 | "task": "Intent Detection. The system response is the corresponding intent." 39 | }, 40 | "log": { 41 | "turn id": 1, 42 | "user utterance": "what is the equivalent of, 'life is good' in french", 43 | "system response": "translate", 44 | "dialog history": "", 45 | "original user side information": {}, 46 | "original system side information": {}, 47 | "external knowledge": "" 48 | } 49 | }, 50 | "CLINC150--train--4": { 51 | "original dialog id": "", 52 | "dialog index": 4, 53 | "original dialog info": { 54 | "task": "Intent Detection. The system response is the corresponding intent." 55 | }, 56 | "log": { 57 | "turn id": 1, 58 | "user utterance": "tell me how to say, 'it is a beautiful morning' in italian", 59 | "system response": "translate", 60 | "dialog history": "", 61 | "original user side information": {}, 62 | "original system side information": {}, 63 | "external knowledge": "" 64 | } 65 | }, 66 | "CLINC150--train--5": { 67 | "original dialog id": "", 68 | "dialog index": 5, 69 | "original dialog info": { 70 | "task": "Intent Detection. The system response is the corresponding intent." 
71 | }, 72 | "log": { 73 | "turn id": 1, 74 | "user utterance": "if i were mongolian, how would i say that i am a tourist", 75 | "system response": "translate", 76 | "dialog history": "", 77 | "original user side information": {}, 78 | "original system side information": {}, 79 | "external knowledge": "" 80 | } 81 | } 82 | } -------------------------------------------------------------------------------- /natural-language-understanding/CLINC150/original_examples.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "input": "what expression would i use to say i love you if i were an italian", 4 | "output": "translate" 5 | }, 6 | { 7 | "input": "can you tell me how to say 'i do not speak much spanish', in spanish", 8 | "output": "translate" 9 | }, 10 | { 11 | "input": "what is the equivalent of, 'life is good' in french", 12 | "output": "translate" 13 | }, 14 | { 15 | "input": "tell me how to say, 'it is a beautiful morning' in italian", 16 | "output": "translate" 17 | }, 18 | { 19 | "input": "if i were mongolian, how would i say that i am a tourist", 20 | "output": "translate" 21 | } 22 | ] -------------------------------------------------------------------------------- /natural-language-understanding/DSTC8-SGD/README.md: -------------------------------------------------------------------------------- 1 | DSTC8-SGD is a slot filling task dataset based on the [Schema-Guided-Dialogue (SGD)](https://github.com/google-research-datasets/dstc8-schema-guided-dialogue) dataset. -------------------------------------------------------------------------------- /natural-language-understanding/DSTC8-SGD/original_examples.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "userInput": { 4 | "text": "I want to visit New York." 
5 | }, 6 | "context": { 7 | "requestedSlots": [ 8 | "to_location" 9 | ] 10 | }, 11 | "labels": [ 12 | { 13 | "slot": "to_location", 14 | "valueSpan": { 15 | "startIndex": 16, 16 | "endIndex": 24 17 | } 18 | } 19 | ], 20 | "id": "27_001082", 21 | "splitKey": 1.0 22 | }, 23 | { 24 | "userInput": { 25 | "text": "I am leaving out from Washington on the 11th of March." 26 | }, 27 | "context": { 28 | "requestedSlots": [ 29 | "leaving_date", 30 | "from_location" 31 | ] 32 | }, 33 | "labels": [ 34 | { 35 | "slot": "leaving_date", 36 | "valueSpan": { 37 | "startIndex": 40, 38 | "endIndex": 53 39 | } 40 | }, 41 | { 42 | "slot": "from_location", 43 | "valueSpan": { 44 | "startIndex": 22, 45 | "endIndex": 32 46 | } 47 | } 48 | ], 49 | "id": "27_001084", 50 | "splitKey": 1.0 51 | }, 52 | { 53 | "userInput": { 54 | "text": "Is there a bus that leaves from Anaheim to SD?" 55 | }, 56 | "context": {}, 57 | "labels": [ 58 | { 59 | "slot": "to_location", 60 | "valueSpan": { 61 | "startIndex": 43, 62 | "endIndex": 45 63 | } 64 | }, 65 | { 66 | "slot": "from_location", 67 | "valueSpan": { 68 | "startIndex": 32, 69 | "endIndex": 39 70 | } 71 | } 72 | ], 73 | "id": "27_001090", 74 | "splitKey": 1.0 75 | }, 76 | { 77 | "userInput": { 78 | "text": "Thursday next week for four people." 79 | }, 80 | "context": { 81 | "requestedSlots": [ 82 | "leaving_date" 83 | ] 84 | }, 85 | "labels": [ 86 | { 87 | "slot": "leaving_date", 88 | "valueSpan": { 89 | "endIndex": 18 90 | } 91 | } 92 | ], 93 | "id": "27_001092", 94 | "splitKey": 1.0 95 | }, 96 | { 97 | "userInput": { 98 | "text": "Anything else for three people from LAX?" 
99 | }, 100 | "context": {}, 101 | "labels": [ 102 | { 103 | "slot": "from_location", 104 | "valueSpan": { 105 | "startIndex": 36, 106 | "endIndex": 39 107 | } 108 | } 109 | ], 110 | "id": "27_001094", 111 | "splitKey": 1.0 112 | } 113 | ] -------------------------------------------------------------------------------- /natural-language-understanding/HWU64/converted_examples.json: -------------------------------------------------------------------------------- 1 | { 2 | "HWU64--train--1": { 3 | "original dialog id": "", 4 | "dialog index": 1, 5 | "original dialog info": { 6 | "task": "Intent Detection. The system response is the corresponding intent." 7 | }, 8 | "log": { 9 | "turn id": 1, 10 | "user utterance": "what alarms do i have set right now", 11 | "system response": "alarm_query", 12 | "dialog history": "", 13 | "original user side information": {}, 14 | "original system side information": {}, 15 | "external knowledge": "" 16 | } 17 | }, 18 | "HWU64--train--2": { 19 | "original dialog id": "", 20 | "dialog index": 2, 21 | "original dialog info": { 22 | "task": "Intent Detection. The system response is the corresponding intent." 23 | }, 24 | "log": { 25 | "turn id": 1, 26 | "user utterance": "checkout today alarm of meeting", 27 | "system response": "alarm_query", 28 | "dialog history": "", 29 | "original user side information": {}, 30 | "original system side information": {}, 31 | "external knowledge": "" 32 | } 33 | }, 34 | "HWU64--train--3": { 35 | "original dialog id": "", 36 | "dialog index": 3, 37 | "original dialog info": { 38 | "task": "Intent Detection. The system response is the corresponding intent." 
39 | }, 40 | "log": { 41 | "turn id": 1, 42 | "user utterance": "report alarm settings", 43 | "system response": "alarm_query", 44 | "dialog history": "", 45 | "original user side information": {}, 46 | "original system side information": {}, 47 | "external knowledge": "" 48 | } 49 | }, 50 | "HWU64--train--4": { 51 | "original dialog id": "", 52 | "dialog index": 4, 53 | "original dialog info": { 54 | "task": "Intent Detection. The system response is the corresponding intent." 55 | }, 56 | "log": { 57 | "turn id": 1, 58 | "user utterance": "see see for me the alarms that you have set tomorrow morning", 59 | "system response": "alarm_query", 60 | "dialog history": "", 61 | "original user side information": {}, 62 | "original system side information": {}, 63 | "external knowledge": "" 64 | } 65 | }, 66 | "HWU64--train--5": { 67 | "original dialog id": "", 68 | "dialog index": 5, 69 | "original dialog info": { 70 | "task": "Intent Detection. The system response is the corresponding intent." 
71 | }, 72 | "log": { 73 | "turn id": 1, 74 | "user utterance": "is there an alarm for ten am", 75 | "system response": "alarm_query", 76 | "dialog history": "", 77 | "original user side information": {}, 78 | "original system side information": {}, 79 | "external knowledge": "" 80 | } 81 | } 82 | } -------------------------------------------------------------------------------- /natural-language-understanding/HWU64/original_examples.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "input": "what alarms do i have set right now", 4 | "output": "alarm_query" 5 | }, 6 | { 7 | "input": "checkout today alarm of meeting", 8 | "output": "alarm_query" 9 | }, 10 | { 11 | "input": "report alarm settings", 12 | "output": "alarm_query" 13 | }, 14 | { 15 | "input": "see see for me the alarms that you have set tomorrow morning", 16 | "output": "alarm_query" 17 | }, 18 | { 19 | "input": "is there an alarm for ten am", 20 | "output": "alarm_query" 21 | } 22 | ] -------------------------------------------------------------------------------- /natural-language-understanding/MIT-Movie/README.md: -------------------------------------------------------------------------------- 1 | The [MIT Movie Corpus](https://groups.csail.mit.edu/sls/downloads/movie) is a semantically tagged training and test corpus in BIO format. The eng corpus are simple queries, and the trivia10k13 corpus are more complex queries. 2 | 3 | 4 | ### Citations 5 | 6 | When using the MT-Movie dataset in your work, please cite [ASGARD: A PORTABLE ARCHITECTURE FOR MULTILINGUAL DIALOGUE SYSTEMS](https://groups.csail.mit.edu/sls/publications/2013/Liu_ICASSP-2013.pdf). 
7 | 8 | ```bibtex 9 | @inproceedings{liu2013asgard, 10 | title={Asgard: A portable architecture for multilingual dialogue systems}, 11 | author={Liu, Jingjing and Pasupat, Panupong and Cyphers, Scott and Glass, Jim}, 12 | booktitle={2013 IEEE International Conference on Acoustics, Speech and Signal Processing}, 13 | pages={8386--8390}, 14 | year={2013}, 15 | organization={IEEE} 16 | } 17 | 18 | ``` -------------------------------------------------------------------------------- /natural-language-understanding/MIT-Movie/converted_examples.json: -------------------------------------------------------------------------------- 1 | { 2 | "MIT-Movie--train--1": { 3 | "original dialog id": "", 4 | "dialog index": 1, 5 | "original dialog info": { 6 | "task": "BIO tag for Named Entity Recognition. The system response is the corresponding BIO format." 7 | }, 8 | "log": { 9 | "turn id": 1, 10 | "user utterance": "what movies star bruce willis", 11 | "system response": "O O O B-ACTOR I-ACTOR", 12 | "dialog history": "", 13 | "original user side information": {}, 14 | "original system side information": {}, 15 | "external knowledge": "" 16 | } 17 | }, 18 | "MIT-Movie--train--2": { 19 | "original dialog id": "", 20 | "dialog index": 2, 21 | "original dialog info": { 22 | "task": "BIO tag for Named Entity Recognition. The system response is the corresponding BIO format." 23 | }, 24 | "log": { 25 | "turn id": 1, 26 | "user utterance": "show me films with drew barrymore from the 1980s", 27 | "system response": "O O O O B-ACTOR I-ACTOR O O B-YEAR", 28 | "dialog history": "", 29 | "original user side information": {}, 30 | "original system side information": {}, 31 | "external knowledge": "" 32 | } 33 | }, 34 | "MIT-Movie--train--3": { 35 | "original dialog id": "", 36 | "dialog index": 3, 37 | "original dialog info": { 38 | "task": "BIO tag for Named Entity Recognition. The system response is the corresponding BIO format." 
39 | }, 40 | "log": { 41 | "turn id": 1, 42 | "user utterance": "what movies starred both al pacino and robert deniro", 43 | "system response": "O O O O B-ACTOR I-ACTOR O B-ACTOR I-ACTOR", 44 | "dialog history": "", 45 | "original user side information": {}, 46 | "original system side information": {}, 47 | "external knowledge": "" 48 | } 49 | }, 50 | "MIT-Movie--train--4": { 51 | "original dialog id": "", 52 | "dialog index": 4, 53 | "original dialog info": { 54 | "task": "BIO tag for Named Entity Recognition. The system response is the corresponding BIO format." 55 | }, 56 | "log": { 57 | "turn id": 1, 58 | "user utterance": "find me all of the movies that starred harold ramis and bill murray", 59 | "system response": "O O O O O O O O B-ACTOR I-ACTOR O B-ACTOR I-ACTOR", 60 | "dialog history": "", 61 | "original user side information": {}, 62 | "original system side information": {}, 63 | "external knowledge": "" 64 | } 65 | }, 66 | "MIT-Movie--train--5": { 67 | "original dialog id": "", 68 | "dialog index": 5, 69 | "original dialog info": { 70 | "task": "BIO tag for Named Entity Recognition. The system response is the corresponding BIO format." 
71 | }, 72 | "log": { 73 | "turn id": 1, 74 | "user utterance": "find me a movie with a quote about baseball in it", 75 | "system response": "O O O O O O O O O O O", 76 | "dialog history": "", 77 | "original user side information": {}, 78 | "original system side information": {}, 79 | "external knowledge": "" 80 | } 81 | } 82 | } -------------------------------------------------------------------------------- /natural-language-understanding/MIT-Movie/original_examples.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "input": "what movies star bruce willis", 4 | "output": "O O O B-ACTOR I-ACTOR" 5 | }, 6 | { 7 | "input": "show me films with drew barrymore from the 1980s", 8 | "output": "O O O O B-ACTOR I-ACTOR O O B-YEAR" 9 | }, 10 | { 11 | "input": "what movies starred both al pacino and robert deniro", 12 | "output": "O O O O B-ACTOR I-ACTOR O B-ACTOR I-ACTOR" 13 | }, 14 | { 15 | "input": "find me all of the movies that starred harold ramis and bill murray", 16 | "output": "O O O O O O O O B-ACTOR I-ACTOR O B-ACTOR I-ACTOR" 17 | }, 18 | { 19 | "input": "find me a movie with a quote about baseball in it", 20 | "output": "O O O O O O O O O O O" 21 | } 22 | ] -------------------------------------------------------------------------------- /natural-language-understanding/MIT-Restaurant/README.md: -------------------------------------------------------------------------------- 1 | The [MIT Restaurant Corpus](https://groups.csail.mit.edu/sls/downloads/restaurant) is a semantically tagged training and test corpus in BIO format. 2 | 3 | ### Citations 4 | 5 | When using the MT-Restaurant dataset in your work, please cite [ASGARD: A PORTABLE ARCHITECTURE FOR MULTILINGUAL DIALOGUE SYSTEMS](https://groups.csail.mit.edu/sls/publications/2013/Liu_ICASSP-2013.pdf). 
6 | 7 | ```bibtex 8 | @inproceedings{liu2013asgard, 9 | title={Asgard: A portable architecture for multilingual dialogue systems}, 10 | author={Liu, Jingjing and Pasupat, Panupong and Cyphers, Scott and Glass, Jim}, 11 | booktitle={2013 IEEE International Conference on Acoustics, Speech and Signal Processing}, 12 | pages={8386--8390}, 13 | year={2013}, 14 | organization={IEEE} 15 | } 16 | 17 | ``` -------------------------------------------------------------------------------- /natural-language-understanding/MIT-Restaurant/converted_examples.json: -------------------------------------------------------------------------------- 1 | { 2 | "MIT-Restaurant--train--1": { 3 | "original dialog id": "", 4 | "dialog index": 1, 5 | "original dialog info": { 6 | "task": "BIO tag for Named Entity Recognition. The system response is the corresponding BIO format." 7 | }, 8 | "log": { 9 | "turn id": 1, 10 | "user utterance": "2 start restaurants with inside dining", 11 | "system response": "B-Rating I-Rating O O B-Amenity I-Amenity", 12 | "dialog history": "", 13 | "original user side information": {}, 14 | "original system side information": {}, 15 | "external knowledge": "" 16 | } 17 | }, 18 | "MIT-Restaurant--train--2": { 19 | "original dialog id": "", 20 | "dialog index": 2, 21 | "original dialog info": { 22 | "task": "BIO tag for Named Entity Recognition. The system response is the corresponding BIO format." 23 | }, 24 | "log": { 25 | "turn id": 1, 26 | "user utterance": "34", 27 | "system response": "O", 28 | "dialog history": "", 29 | "original user side information": {}, 30 | "original system side information": {}, 31 | "external knowledge": "" 32 | } 33 | }, 34 | "MIT-Restaurant--train--3": { 35 | "original dialog id": "", 36 | "dialog index": 3, 37 | "original dialog info": { 38 | "task": "BIO tag for Named Entity Recognition. The system response is the corresponding BIO format." 
39 | }, 40 | "log": { 41 | "turn id": 1, 42 | "user utterance": "5 star resturants in my town", 43 | "system response": "B-Rating I-Rating O B-Location I-Location I-Location", 44 | "dialog history": "", 45 | "original user side information": {}, 46 | "original system side information": {}, 47 | "external knowledge": "" 48 | } 49 | }, 50 | "MIT-Restaurant--train--4": { 51 | "original dialog id": "", 52 | "dialog index": 4, 53 | "original dialog info": { 54 | "task": "BIO tag for Named Entity Recognition. The system response is the corresponding BIO format." 55 | }, 56 | "log": { 57 | "turn id": 1, 58 | "user utterance": "98 hong kong restaurant reasonable prices", 59 | "system response": "O B-Restaurant_Name I-Restaurant_Name O B-Price O", 60 | "dialog history": "", 61 | "original user side information": {}, 62 | "original system side information": {}, 63 | "external knowledge": "" 64 | } 65 | }, 66 | "MIT-Restaurant--train--5": { 67 | "original dialog id": "", 68 | "dialog index": 5, 69 | "original dialog info": { 70 | "task": "BIO tag for Named Entity Recognition. The system response is the corresponding BIO format." 
71 | }, 72 | "log": { 73 | "turn id": 1, 74 | "user utterance": "a great lunch spot but open till 2 a m passims kitchen", 75 | "system response": "O O O O O B-Hours I-Hours I-Hours I-Hours I-Hours B-Restaurant_Name I-Restaurant_Name", 76 | "dialog history": "", 77 | "original user side information": {}, 78 | "original system side information": {}, 79 | "external knowledge": "" 80 | } 81 | } 82 | } -------------------------------------------------------------------------------- /natural-language-understanding/MIT-Restaurant/original_examples.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "input": "2 start restaurants with inside dining", 4 | "output": "B-Rating I-Rating O O B-Amenity I-Amenity" 5 | }, 6 | { 7 | "input": "34", 8 | "output": "O" 9 | }, 10 | { 11 | "input": "5 star resturants in my town", 12 | "output": "B-Rating I-Rating O B-Location I-Location I-Location" 13 | }, 14 | { 15 | "input": "98 hong kong restaurant reasonable prices", 16 | "output": "O B-Restaurant_Name I-Restaurant_Name O B-Price O" 17 | }, 18 | { 19 | "input": "a great lunch spot but open till 2 a m passims kitchen", 20 | "output": "O O O O O B-Hours I-Hours I-Hours I-Hours I-Hours B-Restaurant_Name I-Restaurant_Name" 21 | } 22 | ] -------------------------------------------------------------------------------- /natural-language-understanding/README.md: -------------------------------------------------------------------------------- 1 | ### Natural Language Understanding Dialogues 2 | 3 | Below is a general format for Natural Language Understanding dialogues: 4 | ```js 5 | { 6 | "dataset_name--train/val/test--dialog_id": { 7 | "original dialog id": str, 8 | "dialog index": int, 9 | "original dialog info": dict, 10 | "log": [ 11 | { 12 | "turn id": int, 13 | "user utterance": str, 14 | "system response": str, 15 | "dialog history": str, 16 | "original user side information": dict, 17 | "original system side information": dict, 18 | 
"external knowledge": str, 19 | }, 20 | ... 21 | ] 22 | // "prompt": list, # To be added 23 | }, 24 | ... 25 | } 26 | ``` 27 | Please refer to each dataset folder for more details. 28 | -------------------------------------------------------------------------------- /natural-language-understanding/RESTAURANTS8K/README.md: -------------------------------------------------------------------------------- 1 | RESTAURANTS-8K is a new challenging data set of 8,198 utterances, compiled from actual conversations in the restaurant booking domain. The dataset is proposed by the acl 2020 work [Span-ConveRT: Few-shot Span Extraction for Dialog with Pretrained Conversational Representations](https://arxiv.org/abs/2005.08866). -------------------------------------------------------------------------------- /natural-language-understanding/RESTAURANTS8K/original_examples.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "userInput": { 4 | "text": "There will be 5 adults and 1 child." 5 | }, 6 | "context": { 7 | "requestedSlots": [ 8 | "people" 9 | ] 10 | }, 11 | "labels": [ 12 | { 13 | "slot": "people", 14 | "valueSpan": { 15 | "startIndex": 14, 16 | "endIndex": 34 17 | } 18 | } 19 | ] 20 | }, 21 | { 22 | "userInput": { 23 | "text": "We will require and outside table to seat 9 people on August 23rd" 24 | }, 25 | "labels": [ 26 | { 27 | "slot": "people", 28 | "valueSpan": { 29 | "startIndex": 42, 30 | "endIndex": 50 31 | } 32 | }, 33 | { 34 | "slot": "date", 35 | "valueSpan": { 36 | "startIndex": 54, 37 | "endIndex": 65 38 | } 39 | } 40 | ] 41 | }, 42 | { 43 | "userInput": { 44 | "text": "Do you have room for 11 of us?" 45 | }, 46 | "labels": [ 47 | { 48 | "slot": "people", 49 | "valueSpan": { 50 | "startIndex": 21, 51 | "endIndex": 23 52 | } 53 | } 54 | ] 55 | }, 56 | { 57 | "userInput": { 58 | "text": "We are 13 and have a table." 
59 | }, 60 | "labels": [ 61 | { 62 | "slot": "people", 63 | "valueSpan": { 64 | "startIndex": 7, 65 | "endIndex": 9 66 | } 67 | } 68 | ] 69 | }, 70 | { 71 | "userInput": { 72 | "text": "6 a.m." 73 | }, 74 | "context": { 75 | "requestedSlots": [ 76 | "time" 77 | ] 78 | }, 79 | "labels": [ 80 | { 81 | "slot": "time", 82 | "valueSpan": { 83 | "endIndex": 6 84 | } 85 | } 86 | ] 87 | } 88 | ] -------------------------------------------------------------------------------- /natural-language-understanding/SNIPS-NER/README.md: -------------------------------------------------------------------------------- 1 | # Snips NLU 2 | 3 | [Snips NLU](https://snips-nlu.readthedocs.io) is a Python library designed for extracting structured information from sentences written in natural language. 4 | 5 | ## What is Snips NLU about? 6 | 7 | At the heart of every chatbot and voice assistant is a shared technology: Natural Language Understanding (NLU). Any time a user interacts with AI via natural language, their expressions must be translated into a format that a machine can understand. 8 | 9 | The NLU engine first identifies the user's intent, then extracts the query's parameters, also known as 'slots'. Developers can then use this structured data to determine the appropriate action or response. 10 | 11 | Let’s take an example to illustrate this, and consider the following sentence: 12 | 13 | 14 | "What will be the weather in paris at 9pm?" 
15 | 16 | Properly trained, the Snips NLU engine will be able to extract structured data such as: 17 | 18 | 19 | { 20 | "intent": { 21 | "intentName": "searchWeatherForecast", 22 | "probability": 0.95 23 | }, 24 | "slots": [ 25 | { 26 | "value": "paris", 27 | "entity": "locality", 28 | "slotName": "forecast_locality" 29 | }, 30 | { 31 | "value": { 32 | "kind": "InstantTime", 33 | "value": "2018-02-08 20:00:00 +00:00" 34 | }, 35 | "entity": "snips/datetime", 36 | "slotName": "forecast_start_datetime" 37 | } 38 | ] 39 | } 40 | 41 | In this case, the identified intent is ``searchWeatherForecast`` and two slots were extracted, a locality and a datetime. As you can see, Snips NLU does an extra step on top of extracting entities: it resolves them. The extracted datetime value has indeed been converted into a handy ISO format. 42 | 43 | Check out our `blog post`_ to get more details about why we built Snips NLU and how it works under the hood. We also published a `paper on arxiv`_, presenting the machine learning architecture of the Snips Voice Platform. 
44 | 45 | Citing Snips NLU 46 | ---------------- 47 | 48 | Please cite the following paper when using Snips NLU: 49 | ```bibtex 50 | @article{coucke2018snips, 51 | title = {Snips Voice Platform: an embedded Spoken Language Understanding system for private-by-design voice interfaces}, 52 | author = {Coucke, Alice and Saade, Alaa and Ball, Adrien and Bluche, Th{\'e}odore and Caulier, Alexandre and Leroy, David and Doumouro, Cl{\'e}ment and Gisselbrecht, Thibault and Caltagirone, Francesco and Lavril, Thibaut and others}, 53 | journal = {arXiv preprint arXiv:1805.10190}, 54 | pages = {12--16}, 55 | year = {2018} 56 | } 57 | ``` 58 | 59 | -------------------------------------------------------------------------------- /natural-language-understanding/SNIPS-NER/converted_examples.json: -------------------------------------------------------------------------------- 1 | { 2 | "SNIPS-NER--train--1": { 3 | "original dialog id": "", 4 | "dialog index": 1, 5 | "original dialog info": { 6 | "task": "BIO tag for Named Entity Recognition. The system response is the corresponding BIO format." 7 | }, 8 | "log": { 9 | "turn id": 1, 10 | "user utterance": "listen to westbam alumb allergic on google music", 11 | "system response": "O O B-artist O B-album O B-service I-service", 12 | "dialog history": "", 13 | "original user side information": {}, 14 | "original system side information": {}, 15 | "external knowledge": "" 16 | } 17 | }, 18 | "SNIPS-NER--train--2": { 19 | "original dialog id": "", 20 | "dialog index": 2, 21 | "original dialog info": { 22 | "task": "BIO tag for Named Entity Recognition. The system response is the corresponding BIO format." 
23 | }, 24 | "log": { 25 | "turn id": 1, 26 | "user utterance": "add step to me to the 50 cl\u00e1sicos playlist", 27 | "system response": "O B-entity_name I-entity_name I-entity_name O O B-playlist I-playlist O", 28 | "dialog history": "", 29 | "original user side information": {}, 30 | "original system side information": {}, 31 | "external knowledge": "" 32 | } 33 | }, 34 | "SNIPS-NER--train--3": { 35 | "original dialog id": "", 36 | "dialog index": 3, 37 | "original dialog info": { 38 | "task": "BIO tag for Named Entity Recognition. The system response is the corresponding BIO format." 39 | }, 40 | "log": { 41 | "turn id": 1, 42 | "user utterance": "i give this current textbook a rating value of 1 and a best rating of 6", 43 | "system response": "O O O B-object_select B-object_type O O O O B-rating_value O O O O O B-best_rating", 44 | "dialog history": "", 45 | "original user side information": {}, 46 | "original system side information": {}, 47 | "external knowledge": "" 48 | } 49 | }, 50 | "SNIPS-NER--train--4": { 51 | "original dialog id": "", 52 | "dialog index": 4, 53 | "original dialog info": { 54 | "task": "BIO tag for Named Entity Recognition. The system response is the corresponding BIO format." 55 | }, 56 | "log": { 57 | "turn id": 1, 58 | "user utterance": "play the song little robin redbreast", 59 | "system response": "O O B-music_item B-track I-track I-track", 60 | "dialog history": "", 61 | "original user side information": {}, 62 | "original system side information": {}, 63 | "external knowledge": "" 64 | } 65 | }, 66 | "SNIPS-NER--train--5": { 67 | "original dialog id": "", 68 | "dialog index": 5, 69 | "original dialog info": { 70 | "task": "BIO tag for Named Entity Recognition. The system response is the corresponding BIO format." 
71 | }, 72 | "log": { 73 | "turn id": 1, 74 | "user utterance": "please add iris dement to my playlist this is selena", 75 | "system response": "O O B-artist I-artist O B-playlist_owner O B-playlist I-playlist I-playlist", 76 | "dialog history": "", 77 | "original user side information": {}, 78 | "original system side information": {}, 79 | "external knowledge": "" 80 | } 81 | } 82 | } -------------------------------------------------------------------------------- /natural-language-understanding/SNIPS-NER/original_examples.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "input": "listen to westbam alumb allergic on google music\n", 4 | "output": "O O B-artist O B-album O B-service I-service \n" 5 | }, 6 | { 7 | "input": "add step to me to the 50 cl\u00e1sicos playlist \n", 8 | "output": "O B-entity_name I-entity_name I-entity_name O O B-playlist I-playlist O \n" 9 | }, 10 | { 11 | "input": "i give this current textbook a rating value of 1 and a best rating of 6\n", 12 | "output": "O O O B-object_select B-object_type O O O O B-rating_value O O O O O B-best_rating \n" 13 | }, 14 | { 15 | "input": "play the song little robin redbreast\n", 16 | "output": "O O B-music_item B-track I-track I-track \n" 17 | }, 18 | { 19 | "input": "please add iris dement to my playlist this is selena\n", 20 | "output": "O O B-artist I-artist O B-playlist_owner O B-playlist I-playlist I-playlist \n" 21 | } 22 | ] -------------------------------------------------------------------------------- /natural-language-understanding/SNIPS/README.md: -------------------------------------------------------------------------------- 1 | # Snips NLU 2 | 3 | [Snips NLU](https://snips-nlu.readthedocs.io) is a Python library designed for extracting structured information from sentences written in natural language. 4 | 5 | ## What is Snips NLU about? 
6 | 7 | At the heart of every chatbot and voice assistant is a shared technology: Natural Language Understanding (NLU). Any time a user interacts with AI via natural language, their expressions must be translated into a format that a machine can understand. 8 | 9 | The NLU engine first identifies the user's intent, then extracts the query's parameters, also known as 'slots'. Developers can then use this structured data to determine the appropriate action or response. 10 | 11 | Let’s take an example to illustrate this, and consider the following sentence: 12 | 13 | 14 | "What will be the weather in paris at 9pm?" 15 | 16 | Properly trained, the Snips NLU engine will be able to extract structured data such as: 17 | 18 | 19 | { 20 | "intent": { 21 | "intentName": "searchWeatherForecast", 22 | "probability": 0.95 23 | }, 24 | "slots": [ 25 | { 26 | "value": "paris", 27 | "entity": "locality", 28 | "slotName": "forecast_locality" 29 | }, 30 | { 31 | "value": { 32 | "kind": "InstantTime", 33 | "value": "2018-02-08 20:00:00 +00:00" 34 | }, 35 | "entity": "snips/datetime", 36 | "slotName": "forecast_start_datetime" 37 | } 38 | ] 39 | } 40 | 41 | In this case, the identified intent is ``searchWeatherForecast`` and two slots were extracted, a locality and a datetime. As you can see, Snips NLU does an extra step on top of extracting entities: it resolves them. The extracted datetime value has indeed been converted into a handy ISO format. 42 | 43 | Check out our `blog post`_ to get more details about why we built Snips NLU and how it works under the hood. We also published a `paper on arxiv`_, presenting the machine learning architecture of the Snips Voice Platform. 
44 | 45 | Citing Snips NLU 46 | ---------------- 47 | 48 | Please cite the following paper when using Snips NLU: 49 | ```bibtex 50 | @article{coucke2018snips, 51 | title = {Snips Voice Platform: an embedded Spoken Language Understanding system for private-by-design voice interfaces}, 52 | author = {Coucke, Alice and Saade, Alaa and Ball, Adrien and Bluche, Th{\'e}odore and Caulier, Alexandre and Leroy, David and Doumouro, Cl{\'e}ment and Gisselbrecht, Thibault and Caltagirone, Francesco and Lavril, Thibaut and others}, 53 | journal = {arXiv preprint arXiv:1805.10190}, 54 | pages = {12--16}, 55 | year = {2018} 56 | } 57 | ``` 58 | 59 | -------------------------------------------------------------------------------- /natural-language-understanding/SNIPS/converted_examples.json: -------------------------------------------------------------------------------- 1 | { 2 | "SNIPS--train--1": { 3 | "original dialog id": "", 4 | "dialog index": 1, 5 | "original dialog info": { 6 | "task": "Intent Detection. The system response is the corresponding intent." 7 | }, 8 | "log": { 9 | "turn id": 1, 10 | "user utterance": "listen to westbam alumb allergic on google music", 11 | "system response": "PlayMusic", 12 | "dialog history": "", 13 | "original user side information": {}, 14 | "original system side information": {}, 15 | "external knowledge": "" 16 | } 17 | }, 18 | "SNIPS--train--2": { 19 | "original dialog id": "", 20 | "dialog index": 2, 21 | "original dialog info": { 22 | "task": "Intent Detection. The system response is the corresponding intent." 
23 | }, 24 | "log": { 25 | "turn id": 1, 26 | "user utterance": "add step to me to the 50 cl\u00e1sicos playlist", 27 | "system response": "AddToPlaylist", 28 | "dialog history": "", 29 | "original user side information": {}, 30 | "original system side information": {}, 31 | "external knowledge": "" 32 | } 33 | }, 34 | "SNIPS--train--3": { 35 | "original dialog id": "", 36 | "dialog index": 3, 37 | "original dialog info": { 38 | "task": "Intent Detection. The system response is the corresponding intent." 39 | }, 40 | "log": { 41 | "turn id": 1, 42 | "user utterance": "i give this current textbook a rating value of 1 and a best rating of 6", 43 | "system response": "RateBook", 44 | "dialog history": "", 45 | "original user side information": {}, 46 | "original system side information": {}, 47 | "external knowledge": "" 48 | } 49 | }, 50 | "SNIPS--train--4": { 51 | "original dialog id": "", 52 | "dialog index": 4, 53 | "original dialog info": { 54 | "task": "Intent Detection. The system response is the corresponding intent." 55 | }, 56 | "log": { 57 | "turn id": 1, 58 | "user utterance": "play the song little robin redbreast", 59 | "system response": "PlayMusic", 60 | "dialog history": "", 61 | "original user side information": {}, 62 | "original system side information": {}, 63 | "external knowledge": "" 64 | } 65 | }, 66 | "SNIPS--train--5": { 67 | "original dialog id": "", 68 | "dialog index": 5, 69 | "original dialog info": { 70 | "task": "Intent Detection. The system response is the corresponding intent." 
71 | }, 72 | "log": { 73 | "turn id": 1, 74 | "user utterance": "please add iris dement to my playlist this is selena", 75 | "system response": "AddToPlaylist", 76 | "dialog history": "", 77 | "original user side information": {}, 78 | "original system side information": {}, 79 | "external knowledge": "" 80 | } 81 | } 82 | } -------------------------------------------------------------------------------- /natural-language-understanding/SNIPS/original_examples.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "input": "listen to westbam alumb allergic on google music", 4 | "output": "PlayMusic" 5 | }, 6 | { 7 | "input": "add step to me to the 50 cl\u00e1sicos playlist ", 8 | "output": "AddToPlaylist" 9 | }, 10 | { 11 | "input": "i give this current textbook a rating value of 1 and a best rating of 6", 12 | "output": "RateBook" 13 | }, 14 | { 15 | "input": "play the song little robin redbreast", 16 | "output": "PlayMusic" 17 | }, 18 | { 19 | "input": "please add iris dement to my playlist this is selena", 20 | "output": "AddToPlaylist" 21 | } 22 | ] -------------------------------------------------------------------------------- /natural-language-understanding/TOP-NER/README.md: -------------------------------------------------------------------------------- 1 | TOP is a Semantic Parsing for Task Oriented Dialog dataset. It collects a total of 44783 annotations with 2 | 25 intents and 36 slots, randomly split into 31279 3 | training, 4462 validation and 9042 test utterances. 
The data is originally from the EMNLP 2018 work [Semantic Parsing for Task Oriented Dialog 4 | using Hierarchical Representations](https://aclanthology.org/D18-1300.pdf) -------------------------------------------------------------------------------- /natural-language-understanding/TOP-NER/original_examples.json: -------------------------------------------------------------------------------- 1 | [ 2 | "How:O long:O will:O it:O take:O to:O drive:B-SL_METHOD_TRAVEL from:O Chicago:B-SL_SOURCE to:O Mississippi:B-SL_DESTINATION <=> IN_GET_ESTIMATED_DURATION", 3 | "Will:O it:O take:O shorter:O to:O get:O to:O the:O White:O House:O by:O bus:O or:O taxi:O ?:O <=> IN_UNSUPPORTED_NAVIGATION", 4 | "will:O I:O make:O it:O to:O the:B-SL_DESTINATION_IN_GET_LOCATION_SL_CATEGORY_LOCATION beach:I-SL_DESTINATION_IN_GET_LOCATION_SL_CATEGORY_LOCATION by:B-SL_DATE_TIME_ARRIVAL noon:I-SL_DATE_TIME_ARRIVAL if:O I:O leave:O now:B-SL_DATE_TIME_DEPARTURE <=> IN_GET_ESTIMATED_ARRIVAL", 5 | "When:O should:O I:O leave:O my:B-SL_SOURCE_IN_GET_LOCATION_HOME_SL_CONTACT house:I-SL_SOURCE_IN_GET_LOCATION_HOME to:O get:O to:O the:B-SL_DESTINATION_IN_GET_LOCATION_SL_POINT_ON_MAP Hamilton:I-SL_DESTINATION_IN_GET_LOCATION_SL_POINT_ON_MAP Mall:I-SL_DESTINATION_IN_GET_LOCATION_SL_POINT_ON_MAP right:B-SL_DATE_TIME_ARRIVAL when:I-SL_DATE_TIME_ARRIVAL it:I-SL_DATE_TIME_ARRIVAL opens:I-SL_DATE_TIME_ARRIVAL on:I-SL_DATE_TIME_ARRIVAL Saturday:I-SL_DATE_TIME_ARRIVAL <=> IN_GET_ESTIMATED_DEPARTURE", 6 | "I:O need:O to:O know:O if:O there:O 's:O a:O lot:O of:O traffic:O on:O my:O way:O home:B-SL_DESTINATION_IN_GET_LOCATION_HOME <=> IN_GET_INFO_TRAFFIC" 7 | ] -------------------------------------------------------------------------------- /natural-language-understanding/TOP/README.md: -------------------------------------------------------------------------------- 1 | TOP is a Semantic Parsing for Task Oriented Dialog dataset. 
It collects a total of 44783 annotations with 2 | 25 intents and 36 slots, randomly split into 31279 3 | training, 4462 validation and 9042 test utterances. The data is originally from the EMNLP 2018 work [Semantic Parsing for Task Oriented Dialog 4 | using Hierarchical Representations](https://aclanthology.org/D18-1300.pdf) -------------------------------------------------------------------------------- /natural-language-understanding/TOP/original_examples.json: -------------------------------------------------------------------------------- 1 | [ 2 | "How:O long:O will:O it:O take:O to:O drive:B-SL_METHOD_TRAVEL from:O Chicago:B-SL_SOURCE to:O Mississippi:B-SL_DESTINATION <=> IN_GET_ESTIMATED_DURATION", 3 | "Will:O it:O take:O shorter:O to:O get:O to:O the:O White:O House:O by:O bus:O or:O taxi:O ?:O <=> IN_UNSUPPORTED_NAVIGATION", 4 | "will:O I:O make:O it:O to:O the:B-SL_DESTINATION_IN_GET_LOCATION_SL_CATEGORY_LOCATION beach:I-SL_DESTINATION_IN_GET_LOCATION_SL_CATEGORY_LOCATION by:B-SL_DATE_TIME_ARRIVAL noon:I-SL_DATE_TIME_ARRIVAL if:O I:O leave:O now:B-SL_DATE_TIME_DEPARTURE <=> IN_GET_ESTIMATED_ARRIVAL", 5 | "When:O should:O I:O leave:O my:B-SL_SOURCE_IN_GET_LOCATION_HOME_SL_CONTACT house:I-SL_SOURCE_IN_GET_LOCATION_HOME to:O get:O to:O the:B-SL_DESTINATION_IN_GET_LOCATION_SL_POINT_ON_MAP Hamilton:I-SL_DESTINATION_IN_GET_LOCATION_SL_POINT_ON_MAP Mall:I-SL_DESTINATION_IN_GET_LOCATION_SL_POINT_ON_MAP right:B-SL_DATE_TIME_ARRIVAL when:I-SL_DATE_TIME_ARRIVAL it:I-SL_DATE_TIME_ARRIVAL opens:I-SL_DATE_TIME_ARRIVAL on:I-SL_DATE_TIME_ARRIVAL Saturday:I-SL_DATE_TIME_ARRIVAL <=> IN_GET_ESTIMATED_DEPARTURE", 6 | "I:O need:O to:O know:O if:O there:O 's:O a:O lot:O of:O traffic:O on:O my:O way:O home:B-SL_DESTINATION_IN_GET_LOCATION_HOME <=> IN_GET_INFO_TRAFFIC" 7 | ] -------------------------------------------------------------------------------- /open-domain-dialogues/ConvAI2/README.md: -------------------------------------------------------------------------------- 1 | 
Dialogue systems and conversational agents -- including chatbots, personal assistants and voice control interfaces -- are becoming increasingly widespread in our daily lives. NIPS is sponsoring an open competition to create a chatbot that can hold an intelligent conversation with a human partner. 2 | 3 | ## Overview 4 | Recent advances in machine learning have sparked a renewed interest for dialogue systems in the research community. In addition to the growing real-world applications, the ability to converse is closely related to the overall [goal of AI](http://www.turingarchive.org/browse.php/B/9 "Turing, Alan M. Computing machinery and intelligence. Mind 59.236 (1950): 433-460."). This NIPS Live Competition aims to unify the community around the challenging task: building systems capable of intelligent conversations. Teams are expected to submit dialogue systems able to carry out intelligent and natural conversations about specific news articles with humans. At the final stage of the competition participants, as well as volunteers, will be randomly matched with a bot or a human to chat and evaluate answers of a peer. We expect the competition to have two major outcomes: (1) a measure of quality of state-of-the-art dialogue systems, and (2) an open-source dataset collected from evaluated dialogues. 5 | 6 | ## Dataset 7 | 8 | Dataset collected during competition will be distributed under Apache 2.0 license. -------------------------------------------------------------------------------- /open-domain-dialogues/Empathetic/README.md: -------------------------------------------------------------------------------- 1 | # EmpatheticDialogues 2 | 3 | PyTorch original implementation of Towards Empathetic Open-domain Conversation Models: a New Benchmark and Dataset (https://arxiv.org/abs/1811.00207). 4 | 5 | We provide a novel dataset of 25k conversations grounded in emotional situations. 
The code in this repo demonstrates that automated metrics (P@1,100 and BLEU) are improved both when using candidates from our dataset and when fine-tuning on it. 6 | 7 | ## Dataset 8 | 9 | To download the EmpatheticDialogues dataset: 10 | 11 | ``` 12 | wget https://dl.fbaipublicfiles.com/parlai/empatheticdialogues/empatheticdialogues.tar.gz 13 | ``` 14 | 15 | ## References 16 | 17 | Please cite [[1]](https://arxiv.org/abs/1811.00207) if you found the resources in this repository useful. 18 | 19 | ### Towards Empathetic Open-domain Conversation Models: a New Benchmark and Dataset 20 | 21 | [1] H. Rashkin, E. M. Smith, M. Li, Y. Boureau [*Towards Empathetic Open-domain Conversation Models: a New Benchmark and Dataset*](https://arxiv.org/abs/1811.00207) 22 | 23 | ``` 24 | @inproceedings{rashkin2019towards, 25 | title = {Towards Empathetic Open-domain Conversation Models: a New Benchmark and Dataset}, 26 | author = {Hannah Rashkin and Eric Michael Smith and Margaret Li and Y-Lan Boureau}, 27 | booktitle = {ACL}, 28 | year = {2019}, 29 | } 30 | ``` -------------------------------------------------------------------------------- /open-domain-dialogues/Empathetic/original_examples.json: -------------------------------------------------------------------------------- 1 | { 2 | "conv_id": [ 3 | "hit:0_conv:0", 4 | "hit:0_conv:0", 5 | "hit:0_conv:0", 6 | "hit:0_conv:0", 7 | "hit:0_conv:0" 8 | ], 9 | "utterance_idx": [ 10 | 1, 11 | 2, 12 | 3, 13 | 4, 14 | 5 15 | ], 16 | "context": [ 17 | "guilty", 18 | "guilty", 19 | "guilty", 20 | "guilty", 21 | "guilty" 22 | ], 23 | "prompt": [ 24 | "I felt guilty when I was driving home one night and a person tried to fly into my lane_comma_ and didn't see me. I honked and they swerved back into their lane_comma_ slammed on their brakes_comma_ and hit the water cones.", 25 | "I felt guilty when I was driving home one night and a person tried to fly into my lane_comma_ and didn't see me. 
I honked and they swerved back into their lane_comma_ slammed on their brakes_comma_ and hit the water cones.", 26 | "I felt guilty when I was driving home one night and a person tried to fly into my lane_comma_ and didn't see me. I honked and they swerved back into their lane_comma_ slammed on their brakes_comma_ and hit the water cones.", 27 | "I felt guilty when I was driving home one night and a person tried to fly into my lane_comma_ and didn't see me. I honked and they swerved back into their lane_comma_ slammed on their brakes_comma_ and hit the water cones.", 28 | "I felt guilty when I was driving home one night and a person tried to fly into my lane_comma_ and didn't see me. I honked and they swerved back into their lane_comma_ slammed on their brakes_comma_ and hit the water cones." 29 | ], 30 | "speaker_idx": [ 31 | 0, 32 | 1, 33 | 0, 34 | 1, 35 | 0 36 | ], 37 | "utterance": [ 38 | "Yeah about 10 years ago I had a horrifying experience. It was 100% their fault but they hit the water barrels and survived. They had no injuries but they almost ran me off the road.", 39 | "Did you suffer any injuries?", 40 | "No I wasn't hit. It turned out they were drunk. I felt guilty but realized it was his fault.", 41 | "Why did you feel guilty? People really shouldn't drive drunk.", 42 | "I don't know I was new to driving and hadn't experienced anything like that. I felt like my horn made him swerve into the water barrels." 
43 | ], 44 | "selfeval": [ 45 | "2|2|5_5|5|5", 46 | "2|2|5_5|5|5", 47 | "2|2|5_5|5|5", 48 | "2|2|5_5|5|5", 49 | "2|2|5_5|5|5" 50 | ], 51 | "tags": [ 52 | "", 53 | "", 54 | "", 55 | "", 56 | "" 57 | ] 58 | } -------------------------------------------------------------------------------- /open-domain-dialogues/PLACES3.5/README.md: -------------------------------------------------------------------------------- 1 | # PLACES: Prompting Language Models for Social Conversation Synthesis 2 | 3 | A synthesized multi-party conversational dataset 4 | 5 | # Citation: 6 | ```commandline 7 | @inproceedings{chen2023places, 8 | title={PLACES: Prompting Language Models for Social Conversation Synthesis}, 9 | author={Chen, Maximillian and Papangelis, Alexandros and Tao, Chenyang and Kim, Seokhwan and Rosenbaum, Andy and Liu, Yang and Yu, Zhou, and Hakkani-Tur, Dilek}, 10 | booktitle={Findings of the Association for Computational Linguistics: EACL 2023}, 11 | pages={to appear}, 12 | year={2023} 13 | } 14 | ``` 15 | 16 | -------------------------------------------------------------------------------- /open-domain-dialogues/Prosocial/README.md: -------------------------------------------------------------------------------- 1 | # ProsocialDialog 2 | 3 | 4 | 5 | #### Dataset format when downloaded directly using the link above 6 | 7 | Each split (train/valid/test) contains a list of dialogues. \ 8 | A dialogue has the following structure: 9 | 10 | ``` 11 | [ 12 | { 13 | "text": ... # utterance 1 14 | "labels": [ 15 | ... # utterance 2 16 | ], 17 | "rots": [ 18 | ... 19 | ], 20 | "safety_label": ..., 21 | "safety_annotations": [ 22 | ... 23 | ], 24 | "safety_annotation_reasons": [ 25 | ... 26 | ], 27 | "source": ..., 28 | "etc": ..., 29 | "episode_done": false 30 | }, 31 | ..., 32 | { 33 | "text": "I don't care what people think about me. I just want to make someone else's life miserable. 
Maybe I should instead threaten him with this.", # utterance 5 34 | "labels": [ 35 | "You need to stop this. He might be having medical difficulties with this issue, so please try to help him out instead. It's important to look out for others when they're having trouble." # utterance 6 36 | ], 37 | "rots": [ 38 | "It's wrong to threaten someone.", 39 | "You should look out for others." 40 | ], 41 | "safety_label": "__needs_intervention__", 42 | "safety_annotations": [ 43 | "needs caution", 44 | "needs intervention", 45 | "needs intervention" 46 | ], 47 | "safety_annotation_reasons": [ 48 | "They don't care how their actions would hurt the other person.", 49 | "Did not listen to the advice and plans to threaten the classmate.", 50 | "Now the speaker is thinking of threatening his classmate." 51 | ], 52 | "source": "socialchemistry", 53 | "etc": "reddit/amitheasshole/b26onw", 54 | "episode_done": true 55 | } 56 | ] 57 | ``` 58 | 59 | Please see below for a description of each attribute in the dataset: 60 | 61 | attribute | type | description 62 | --- | --- | --- 63 | `text` | str | the potentially unsafe utterance 64 | `labels` | list of str | the guiding utterance grounded on rules-of-thumb (`rots`) 65 | `rots` | list of str\|null | the relevant rules-of-thumb for `text` *not* labeled as \_\_casual\_\_ 66 | `safety_label` | str | the final verdict of the context according to `safety_annotations`: {\_\_casual\_\_, \_\_possibly\_needs\_caution\_\_, \_\_probably\_needs\_caution\_\_, \_\_needs\_caution\_\_, \_\_needs\_intervention\_\_} 67 | `safety_annotations` | list of str | raw annotations from three workers: {casual, needs caution, needs intervention} 68 | `safety_annotation_reasons` | list of str | the reasons behind the safety annotations in free-form text from each worker 69 | `source` | str | the source of the seed text that was used to craft the first utterance of the dialogue: {socialchemistry, sbic, ethics_amt, ethics_reddit} 70 | `etc` | str\|null | other 
information 71 | `episode_done` | bool | an indicator of whether it is the end of the dialogue 72 | 73 | 74 | Please cite our work if you found the resources in this repository useful: 75 | 76 | ```bib 77 | @inproceedings{kim2022prosocialdialog, 78 | title={ProsocialDialog: A Prosocial Backbone for Conversational Agents}, 79 | author={Hyunwoo Kim and Youngjae Yu and Liwei Jiang and Ximing Lu and Daniel Khashabi and Gunhee Kim and Yejin Choi and Maarten Sap}, 80 | booktitle={EMNLP}, 81 | year=2022 82 | } 83 | ``` -------------------------------------------------------------------------------- /open-domain-dialogues/README.md: -------------------------------------------------------------------------------- 1 | ### Open-Domain Dialogues 2 | 3 | Below is a general format for open domain dialogues: 4 | 5 | ```js 6 | { 7 | "dataset_name--train/val/test--dialog_id": { 8 | "original dialog id": str, 9 | "dialog index": int, 10 | "original dialog info": dict, 11 | "log": [ 12 | { 13 | "turn id": int, 14 | "user utterance": str, 15 | "system response": str, 16 | "dialog history": str, 17 | "original user side information": dict, 18 | "original system side information": dict, 19 | }, 20 | ... 21 | ], 22 | "prompt": [ 23 | "This is a conversation between two speakers talking about history. Given the dialog context, please generate a relevant response.", 24 | ... 25 | ] 26 | }, 27 | ... 28 | } 29 | ``` 30 | 31 | Chitchat dialogues generally do not involve extra annotations. Therefore the "original user1/2 side information" are usually left blank. Unlike task-oriented dialogues, chitchat does not necessarily end on the user2 side (system side in task-oriented dialogues). So, there are some dialogues that contain only user1 utterance in the last turn. 32 | 33 | For SODA, we design 6 prompts for each dialog and below shows the template: 34 | ```python 35 | { 36 | "Imagine you are {speaker_system} and you are talking to {speaker_user}. 
Generate a coherent and appropriate response.", 37 | "In the role of {speaker_system}, engage with {speaker_user}. Formulate a response that is both consistent with the conversation and suitable to the context.", 38 | "As {speaker_system}, you are in a dialogue with {speaker_user}. Create a coherent and relevant reply that fits the ongoing discussion.", 39 | "Assuming the persona of {speaker_system}, you're conversing with {speaker_user}. Generate a logical and suitable response that aligns with the conversation.", 40 | "Imagine yourself as {speaker_system} engaging with {speaker_user}. Your task is to produce a coherent and fitting response to continue the conversation.", 41 | "Pretend to be {speaker_system} in a conversation with {speaker_user}. Construct a response that maintains the coherence of the dialogue and is appropriate for the context." 42 | } 43 | ``` 44 | Where 'speaker_user' and 'speaker_system' represent the 'PersonX' name and the 'PersonY' name, respectively. 45 | 46 | 47 | -------------------------------------------------------------------------------- /open-domain-dialogues/SODA/README.md: -------------------------------------------------------------------------------- 1 | # SODA 2 | 3 | SODA is the first publicly available, million-scale, high-quality dialogue dataset covering a wide range of social interactions. Dialogues are distilled from a PLM (InstructGPT; Ouyang et al., 2022) by contextualizing social commonsense knowledge from a knowledge graph (Atomic10x; West et al., 2022). Human evaluation shows that dialogues in SODA are more consistent, specific, and (surprisingly) natural than prior human-authored datasets – e.g., DailyDialog (Li et al., 2017), BlendedSkillTalk (Smith et al., 2020). 
Also, since social commonsense knowledge encompasses emotional reactions (i.e., the xReact relation), SODA includes 385K conversations labeled with 1.7K unique emotions along with information about the experiencer and the cause – i.e., PersonX and the head event in the symbolic commonsense knowledge triple. 4 | 5 | 6 | ## References 7 | 8 | This data is originally created by [sodaverse](https://github.com/skywalker023/sodaverse). 9 | Please cite their work if you found the resources useful. 10 | -------------------------------------------------------------------------------- /open-domain-dialogues/ShareGPT/README.md: -------------------------------------------------------------------------------- 1 | # ShareGPT 2 | 3 | We initially clean ShareGPT through code from [FastChat](https://github.com/lm-sys/FastChat), then further clean the data and remove several non-English dialogues through language detection models. There are in total 96394 dialogues in our processed version. 4 | 5 | "Original dialog info" contains model information: 6 | ```python 7 | "gpt-3.5-turbo" 8 | "gpt-4" 9 | ``` 10 | 11 | ## License 12 | 13 | This data is originally from [ShareGPT](https://github.com/domeccleston/sharegpt). 14 | Please follow OpenAI and ShareGPT license and terms. 
15 | -------------------------------------------------------------------------------- /open-domain-dialogues/chitchat-dataset/README.md: -------------------------------------------------------------------------------- 1 | # chitchat-dataset 2 | 3 | [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/chitchat_dataset)](https://pypi.org/project/chitchat-dataset/) 4 | [![PyPI](https://img.shields.io/pypi/v/chitchat_dataset)](https://pypi.org/project/chitchat-dataset/) 5 | [![PyPI - Wheel](https://img.shields.io/pypi/wheel/chitchat_dataset)](https://pypi.org/project/chitchat-dataset/) 6 | 7 | [![CI](https://github.com/BYU-PCCL/chitchat-dataset/workflows/CI/badge.svg)](https://github.com/BYU-PCCL/chitchat-dataset/actions?query=workflow%3ACI) 8 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 9 | 10 | Open-domain conversational dataset from the BYU 11 | [Perception, Control & Cognition] lab's [Chit-Chat Challenge]. 
12 | 13 | ## stats 14 | 15 | - 7,168 conversations 16 | - 258,145 utterances 17 | - 1,315 unique participants 18 | 19 | ## format 20 | 21 | The [dataset] is a mapping from conversation [UUID] to a conversation: 22 | 23 | ```json 24 | { 25 | "prompt": "What's the most interesting thing you've learned recently?", 26 | "ratings": { "witty": "1", "int": 5, "upbeat": 5 }, 27 | "start": "2018-04-20T01:57:41", 28 | "messages": [ 29 | [ 30 | { 31 | "text": "Hello", 32 | "timestamp": "2018-04-19T19:57:51", 33 | "sender": "22578ac2-6317-44d5-8052-0a59076e0b96" 34 | } 35 | ], 36 | [ 37 | { 38 | "text": "I learned that the Queen of England's last corgi died", 39 | "timestamp": "2018-04-19T19:58:14", 40 | "sender": "bebad07e-15df-48c3-a04f-67db828503e3" 41 | } 42 | ], 43 | [ 44 | { 45 | "text": "Wow that sounds so sad", 46 | "timestamp": "2018-04-19T19:58:18", 47 | "sender": "22578ac2-6317-44d5-8052-0a59076e0b96" 48 | }, 49 | { 50 | "text": "was it a cardigan welsh corgi", 51 | "timestamp": "2018-04-19T19:58:22", 52 | "sender": "22578ac2-6317-44d5-8052-0a59076e0b96" 53 | }, 54 | { 55 | "text": "?", 56 | "timestamp": "2018-04-19T19:58:24", 57 | "sender": "22578ac2-6317-44d5-8052-0a59076e0b96" 58 | } 59 | ] 60 | ] 61 | } 62 | ``` 63 | 64 | This makes it convenient to represent multi-message conversational turns etc., preserving the structure/flow of the conversation. 
65 | 66 | # how to cite 67 | 68 | If you extend or use this work, please cite the paper where it was introduced: 69 | 70 | ``` 71 | @article{myers2020conversational, 72 | title={Conversational Scaffolding: An Analogy-Based Approach to Response Prioritization in Open-Domain Dialogs}, 73 | author={Myers, Will and Etchart, Tyler and Fulda, Nancy}, 74 | year={2020} 75 | } 76 | ``` -------------------------------------------------------------------------------- /stats/count_length.json: -------------------------------------------------------------------------------- 1 | { 2 | "SGD": [ 3 | 10.148608981380066, 4 | 8.358298581431692, 5 | 11.36332789390525 6 | ], 7 | "SimJointRestaurant": [ 8 | 4.955357142857143, 9 | 5.736036036036036, 10 | 9.051711711711711 11 | ], 12 | "ABCD": [ 13 | 7.747560246962756, 14 | 6.990527114047377, 15 | 12.306242850348967 16 | ], 17 | "SimJointMovie": [ 18 | 5.161458333333333, 19 | 5.831987891019173, 20 | 9.438698284561049 21 | ], 22 | "FRAMES": [ 23 | 7.601899196493791, 24 | 10.370135485730758, 25 | 13.83568751801672 26 | ], 27 | "CraigslistBargains": [ 28 | 3.613437558161176, 29 | 13.394952356425444, 30 | 13.80345093999485 31 | ], 32 | "Taskmaster2": [ 33 | 7.920943134535367, 34 | 8.566443413295978, 35 | 10.416695850113815 36 | ], 37 | "BiTOD": [ 38 | 9.82136080238547, 39 | 8.572465568159863, 40 | 13.500676216499683 41 | ], 42 | "DSTC2-Clean": [ 43 | 8.882843894899537, 44 | 3.3093332405345213, 45 | 11.722577951002227 46 | ], 47 | "KVRET": [ 48 | 2.651600131969647, 49 | 6.565260669404006, 50 | 9.224710712952595 51 | ], 52 | "STAR": [ 53 | 8.130186410102224, 54 | 8.507377685736474, 55 | 10.871565400687844 56 | ], 57 | "SimJointGEN": [ 58 | 7.923066666666666, 59 | 5.110388654224795, 60 | 12.287214125843528 61 | ], 62 | "MuDoCo": [ 63 | 2.6905841557748733, 64 | 7.709080995340537, 65 | 7.587092297015961 66 | ], 67 | "MetaLWOZ": [ 68 | 5.280725319006044, 69 | 6.828149655442037, 70 | 7.805959463214964 71 | ], 72 | "HDSA-Dialog": [ 73 | 
6.852270549913777, 74 | 13.386164084782731, 75 | 16.664364409149375 76 | ], 77 | "Taskmaster1": [ 78 | 4.773757976298997, 79 | 7.184940744011744, 80 | 10.036508372220696 81 | ], 82 | "CaSiNo": [ 83 | 5.94368932038835, 84 | 19.06109114668409, 85 | 18.295981705325058 86 | ], 87 | "SalesBot": [ 88 | 8.879731439135934, 89 | 11.301653571780795, 90 | 10.50834456534841 91 | ], 92 | "WOZ2_0": [ 93 | 4.176666666666667, 94 | 8.155426975259378, 95 | 11.679169992019155 96 | ], 97 | "MULTIWOZ2_2": [ 98 | 6.852735460381336, 99 | 11.48389306786723, 100 | 14.97350465591007 101 | ], 102 | "Taskmaster3": [ 103 | 10.078214946764252, 104 | 7.717312332349334, 105 | 13.256505519835011 106 | ], 107 | "AirDialogue": [ 108 | 6.355116604297141, 109 | 7.384692059435832, 110 | 8.695386360305486 111 | ], 112 | "MS-DC": [ 113 | 3.5933, 114 | 8.097292182673309, 115 | 14.716194027773913 116 | ], 117 | "MulDoGO": [ 118 | 7.145044789301035, 119 | 3.9087501903750774, 120 | 17.942401882340477 121 | ] 122 | } -------------------------------------------------------------------------------- /task-oriented-dialogues/ABCD/LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 ASAPP Research 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /task-oriented-dialogues/AirDialogue/readme.txt: -------------------------------------------------------------------------------- 1 | AirDialogue Data v1.2 2 | 3 | This package contains the train and dev set for the AirDialogue dataset. 4 | 5 | This dataset is meant to be used in combination with the AirDialogue toolkit: 6 | https://github.com/google/airdialogue 7 | 8 | News in v1.2: 9 | - Fix alignment problem between kbs and data 10 | - Fix some incorrect name problems 11 | 12 | News in v1.1: 13 | - Standardize data: Combine consecutive user/agent sentences into one. 14 | - Fix typos 15 | - Remove empty lines in data files 16 | 17 | Reference: 18 | Wei Wei, Quoc V Le, Andrew M Dai and Li-Jia L. AirDialogue: An Environment for Goal-Oriented Dialogue Research. 
Empirical Methods in Natural Language Processing (EMNLP) 2018 19 | https://www.aclweb.org/anthology/D18-1419/ 20 | 21 | A reference implementation can be found in the link below: 22 | https://github.com/google/airdialogue_model 23 | 24 | To report an issue related to the dataset, please contact 25 | Wei Wei 26 | wewei@google.com 27 | -------------------------------------------------------------------------------- /task-oriented-dialogues/BiTOD/README.md: -------------------------------------------------------------------------------- 1 | # BiToD: A Bilingual Multi-Domain Dataset For Task-Oriented Dialogue Modeling 2 | 3 | This repository includes the dataset and baselines of the paper: 4 | 5 | **BiToD: A Bilingual Multi-Domain Dataset For Task-Oriented Dialogue Modeling** (Accepted in NeurIPS 2021 Track on Datasets and Benchmarks) [[PDF]](https://arxiv.org/pdf/2106.02787.pdf). 6 | 7 | **Authors**: [Zhaojiang Lin](https://zlinao.github.io), [Andrea Madotto](https://andreamad8.github.io), [Genta Indra Winata](https://gentawinata.com), Peng Xu, Feijun Jiang, Yuxiang Hu, Chen Shi, Pascale Fung 8 | 9 | 10 | ## Abstract: 11 | Task-oriented dialogue (ToD) benchmarks provide an important avenue to measure progress and develop better conversational agents. However, existing datasets for end-to-end ToD modelling are limited to a single language, hindering the development of robust end-to-end ToD systems for multilingual countries and regions. Here we introduce BiToD, the first bilingual multi-domain dataset for end-to-end task-oriented dialogue modeling. BiToD contains over 7k multi-domain dialogues (144k utterances) with a large and realistic parallel knowledge base. It serves as an effective benchmark for evaluating bilingual ToD systems and cross-lingual transfer learning approaches. We provide state-of-the-art baselines under three evaluation settings (monolingual, bilingual and cross-lingual). 
The analysis of our baselines in different settings highlights 1) the effectiveness of training a bilingual ToD system comparing to two independent monolingual ToD systems, and 2) the potential of leveraging a bilingual knowledge base and cross-lingual transfer learning to improve the system performance in the low resource condition. 12 | 13 | ## Dataset 14 | Training, validation and test data are avalible in `data` folder. We also provide the data split for cross-lingual few shot setting. 15 | ``` 16 | { 17 | dialogue_id:{ 18 | "Scenario": { 19 | "WizardCapabilities": [ 20 | ], 21 | "User_Goal": { 22 | } 23 | } 24 | "Events":{ 25 | { 26 | "Agent": "User", 27 | "Actions": [ 28 | { 29 | "act": "inform_intent", 30 | "slot": "intent", 31 | "relation": "equal_to", 32 | "value": [ 33 | "restaurants_en_US_search" 34 | ] 35 | } 36 | ], 37 | "active_intent": "restaurants_en_US_search", 38 | "state": { 39 | "restaurants_en_US_search": {} 40 | }, 41 | "Text": "Hi, I'd like to find a restaurant to eat", 42 | }, 43 | { 44 | "Agent": "Wizard", 45 | "Actions": [ 46 | { 47 | "act": "request", 48 | "slot": "price_level", 49 | "relation": "", 50 | "value": [] 51 | } 52 | ], 53 | "Text": "Hi there. Would you like a cheap or expensive restaurant?", 54 | "PrimaryItem": null, 55 | "SecondaryItem": null, 56 | }, 57 | ... 58 | } 59 | } 60 | } 61 | ``` 62 | ## Citation: 63 | The bibtex is listed below: 64 |
    65 | @article{lin2021bitod,
    66 |   title={BiToD: A Bilingual Multi-Domain Dataset For Task-Oriented Dialogue Modeling},
    67 |   author={Lin, Zhaojiang and Madotto, Andrea and Winata, Genta Indra and Xu, Peng and Jiang, Feijun and Hu, Yuxiang and Shi, Chen and Fung, Pascale},
    68 |   journal={arXiv preprint arXiv:2106.02787},
    69 |   year={2021}
    70 | }
    71 | 
    -------------------------------------------------------------------------------- /task-oriented-dialogues/CaSiNo/README.md: -------------------------------------------------------------------------------- 1 | # CaSiNo 2 | 3 | This repository contains the dataset and the PyTorch code for **'CaSiNo: A Corpus of Campsite Negotiation Dialogues for Automatic Negotiation Systems'**. 4 | 5 | We provide a novel dataset (referred to as CaSiNo) of 1030 negotiation dialogues. Two participants take the role of campsite neighbors and negotiate for *Food*, *Water*, and *Firewood* packages, based on their individual preferences and requirements. This design keeps the task tractable, while still facilitating linguistically rich and personal conversations. 6 | 7 | # Repository Structure 8 | 9 | **data**: The complete CaSiNo dataset along with the strategy annotations.\ 10 | **strategy_prediction**: Code for strategy prediction in a multi-task learning setup. 11 | 12 | # Each Dialogue in the Dataset 13 | 14 | **Participant Info** 15 | * Demographics (Age, Gender, Ethnicity, Education) 16 | * Personality attributes (SVO and Big-5) 17 | * Preference order 18 | * Arguments for needing or not needing a specific item 19 | 20 | **Negotiation Dialogue** 21 | * Alternating conversation between two participants 22 | * 11.6 utterances on average 23 | * Includes the use of four emoticons: Joy, Sadness, Anger, Surprise 24 | 25 | **Negotiation Outcomes** 26 | * Points scored 27 | * Satisfaction (How satisfied are you with the negotiation outcome?) 28 | * Opponent Likeness (How much do you like your opponent?) 
29 | 30 | **Strategy Annotations** 31 | * Utterance-level annotations for various negotiation strategies used by the participants 32 | * Available for 396 dialogues (4615 utterances) 33 | 34 | # References 35 | 36 | If you use data or code in this repository, please cite our paper: 37 | ``` 38 | @inproceedings{chawla2021casino, 39 | title={CaSiNo: A Corpus of Campsite Negotiation Dialogues for Automatic Negotiation Systems}, 40 | author={Chawla, Kushal and Ramirez, Jaysa and Clever, Rene and Lucas, Gale and May, Jonathan and Gratch, Jonathan}, 41 | booktitle={Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, 42 | pages={3167--3185}, 43 | year={2021} 44 | } 45 | ``` 46 | 47 | # LICENSE 48 | 49 | Please refer to the LICENSE file in the root directory for more details. 50 | -------------------------------------------------------------------------------- /task-oriented-dialogues/CraigslistBargains/LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Stanford NLP 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /task-oriented-dialogues/CraigslistBargains/schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "values": { 3 | }, 4 | "attributes": [ 5 | {"name": "Role", "value_type": "role", "multivalued": false, "entity": false}, 6 | {"name": "Target", "value_type": "price", "multivalued": false, "entity": false}, 7 | {"name": "Bottomline", "value_type": "price", "multivalued": false, "entity": false}, 8 | {"name": "Title", "value_type": "text", "multivalued": false, "entity": false}, 9 | {"name": "Category", "value_type": "text", "multivalued": false, "entity": false}, 10 | {"name": "Price", "value_type": "price", "multivalued": false, "entity": false}, 11 | {"name": "Images", "value_type": "text", "multivalued": false, "entity": false}, 12 | {"name": "Description", "value_type": "text", "multivalued": false, "entity": false} 13 | ] 14 | } 15 | -------------------------------------------------------------------------------- /task-oriented-dialogues/DSTC2-Clean/README.md: -------------------------------------------------------------------------------- 1 | # BERT-DST 2 | 3 | Contact: Guan-Lin Chao (guanlinchao@cmu.edu) 4 | 5 | Source code of our paper [BERT-DST: Scalable End-to-End Dialogue State Tracking with Bidirectional Encoder Representations from Transformer](https://arxiv.org/abs/1907.03040) (Interspeech 2019). 
6 | ``` 7 | @inproceedings{chao2019bert, 8 | title={{BERT-DST}: Scalable End-to-End Dialogue State Tracking with Bidirectional Encoder Representations from Transformer}, 9 | author={Chao, Guan-Lin and Lane, Ian}, 10 | booktitle={INTERSPEECH}, 11 | year={2019} 12 | } 13 | ``` 14 | 15 | Tested on Python 3.6, Tensorflow==1.13.0rc0 16 | 17 | ## Required packages (no need to install, just provide the paths in code): 18 | 1. [bert](https://github.com/google-research/bert) 19 | 2. uncased_L-12_H-768_A-12: pretrained [BERT-Base, Uncased] model checkpoint. Download link in [bert](https://github.com/google-research/bert). 20 | 21 | ## Datasets: 22 | [dstc2-clean](https://github.com/guanlinchao/bert-dst/blob/master/storage/dstc2-clean.zip), [woz_2.0](https://github.com/guanlinchao/bert-dst/blob/master/storage/woz_2.0.zip), [sim-M and sim-R](https://github.com/google-research-datasets/simulated-dialogue) -------------------------------------------------------------------------------- /task-oriented-dialogues/DSTC2-Clean/ontology_en.json: -------------------------------------------------------------------------------- 1 | { 2 | "requestable": [ 3 | "address", 4 | "area", 5 | "food", 6 | "phone", 7 | "price range", 8 | "postcode", 9 | "name" 10 | ], 11 | "informable": { 12 | "request": [ 13 | "address", 14 | "area", 15 | "food", 16 | "phone", 17 | "price range", 18 | "postcode", 19 | "name" 20 | ], 21 | "food": [ 22 | "afghan", 23 | "african", 24 | "afternoon tea", 25 | "asian oriental", 26 | "australasian", 27 | "australian", 28 | "austrian", 29 | "barbeque", 30 | "basque", 31 | "belgian", 32 | "bistro", 33 | "brazilian", 34 | "british", 35 | "canapes", 36 | "cantonese", 37 | "caribbean", 38 | "catalan", 39 | "chinese", 40 | "christmas", 41 | "corsica", 42 | "creative", 43 | "crossover", 44 | "cuban", 45 | "danish", 46 | "eastern european", 47 | "english", 48 | "eritrean", 49 | "european", 50 | "french", 51 | "fusion", 52 | "gastropub", 53 | "german", 54 | "greek", 55 | 
"halal", 56 | "hungarian", 57 | "indian", 58 | "indonesian", 59 | "international", 60 | "irish", 61 | "italian", 62 | "jamaican", 63 | "japanese", 64 | "korean", 65 | "kosher", 66 | "latin american", 67 | "lebanese", 68 | "light bites", 69 | "malaysian", 70 | "mediterranean", 71 | "mexican", 72 | "middle eastern", 73 | "modern american", 74 | "modern eclectic", 75 | "modern european", 76 | "modern global", 77 | "molecular gastronomy", 78 | "moroccan", 79 | "new zealand", 80 | "north african", 81 | "north american", 82 | "north indian", 83 | "northern european", 84 | "panasian", 85 | "persian", 86 | "polish", 87 | "polynesian", 88 | "portuguese", 89 | "romanian", 90 | "russian", 91 | "scandinavian", 92 | "scottish", 93 | "seafood", 94 | "singaporean", 95 | "south african", 96 | "south indian", 97 | "spanish", 98 | "sri lankan", 99 | "steakhouse", 100 | "swedish", 101 | "swiss", 102 | "thai", 103 | "the americas", 104 | "traditional", 105 | "turkish", 106 | "tuscan", 107 | "unusual", 108 | "vegetarian", 109 | "venetian", 110 | "vietnamese", 111 | "welsh", 112 | "world" 113 | ], 114 | "price range": [ 115 | "cheap", 116 | "moderate", 117 | "expensive" 118 | ], 119 | "area": [ 120 | "centre", 121 | "north", 122 | "west", 123 | "south", 124 | "east" 125 | ] 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /task-oriented-dialogues/FRAMES/README.md: -------------------------------------------------------------------------------- 1 | # FRAMES-Corpus 2 | Utilities for Processing the [FRAMES Corpus](https://www.aclweb.org/anthology/W17-5526/) 3 | available [here](https://www.microsoft.com/en-us/research/project/frames-dataset/). 4 | Frames is meant to encourage research towards conversational agents which can support decision-making in complex settings, 5 | in this case – booking a vacation including flights and a hotel. 6 | The utilities process the original transcripts into plain text or json formats. 
7 | 8 | ## Scripts 9 | frames_to_json.py script processes the dialogues from the original format into .json files using the format 10 | outlined below. Each dialogue set (train and test) is output as a separate .json file. 11 | This format is intended to facilitate annotation of the dialogue using the 12 | [Conversation Analysis Modelling Schema](https://nathanduran.github.io/Conversation-Analysis-Modelling-Schema/). 13 | 14 | frames_to_text.py processes the dialogues from the .json format into plain text files, 15 | with one line per-utterance, using the format outlined below. 16 | Setting the *utterance_only* flag to true will remove the speaker label from the output text files. 17 | 18 | frames_utilities.py script contains various helper functions for loading/saving and processing the data. 19 | 20 | ## Data Format 21 | The original slots and dialogue scenario have been preserved to maintain compatibility with the original dataset. 22 | Where no dialogue act is present it is replaced with 'null'. 23 | By default utterances are written one per line in the format *Speaker* | *Utterance Text* | *Dialogue Act Tag*. 24 | This can be changed to only output the utterance text by setting the utterance_only_flag = True. 25 | 26 | ### Example Text Format 27 | USR|I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, 2016 for 8 adults.|inform 28 | 29 | USR|I have a tight budget of 1700.|inform 30 | 31 | SYS|Hi...I checked a few options for you, and unfortunately, we do not currently have any trips that meet this criteria.|sorry 32 | 33 | SYS|Would you like to book an alternate travel option?|sorry 34 | 35 | ### Example JSON Format 36 | The following is an example of the JSON format for the FRAMES corpus. 
37 | 38 | ```json 39 | { 40 | "dataset": "dataset_name", 41 | "num_dialogues": 1, 42 | "dialogues": [ 43 | { 44 | "dialogue_id": "dataset_name_1", 45 | "num_utterances": 2, 46 | "utterances": [ 47 | { 48 | "speaker": "A", 49 | "text": "Utterance 1 text.", 50 | "ap_label": "AP-Label", 51 | "da_label": "DA-Label" 52 | }, 53 | { 54 | "speaker": "B", 55 | "text": "Utterance 2 text.", 56 | "ap_label": "AP-Label", 57 | "da_label": "DA-Label", 58 | "slots": { //Optional 59 | "slot_name": "slot_value" 60 | } 61 | } 62 | ], 63 | "scenario": { //Optional 64 | "db_id": "1", 65 | "db_type": "i.e booking", 66 | "task": "i.e book", 67 | "items": [] 68 | } 69 | } 70 | ] 71 | } 72 | ``` 73 | ## Licensing and Attribution 74 | The original paper for the FRAMES corpus: Asri, L. El, Schulz, H., Sharma, S., et al., (2017) [Frames: A Corpus for Adding Memory to Goal-Oriented Dialogue Systems](https://www.aclweb.org/anthology/W17-5526/). 75 | 76 | The code within this repository is distributed under the [GNU General Public License](https://www.gnu.org/licenses/gpl-3.0.en.html). -------------------------------------------------------------------------------- /task-oriented-dialogues/GECOR/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 terryqj0107 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /task-oriented-dialogues/HDSA-Dialog/.gitignore: -------------------------------------------------------------------------------- 1 | *history 2 | *cache* 3 | *pyc 4 | -------------------------------------------------------------------------------- /task-oriented-dialogues/HDSA-Dialog/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 wenhu chen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /task-oriented-dialogues/KVRET/README.md: -------------------------------------------------------------------------------- 1 | Task-oriented dialogue focuses on conversational agents that participate in user-initiated dialogues on domain-specific topics. Traditionally, the task-oriented dialogue community has often been hindered by a lack of sufficiently large and diverse datasets for training models across a variety of different domains. In an effort to help alleviate this problem, we release a corpus of 3,031 multi-turn dialogues in three distinct domains appropriate for an in-car assistant: calendar scheduling, weather information retrieval, and point-of-interest navigation. Our dialogues are grounded through knowledge bases ensuring that they are versatile in their natural language without being completely free form. -------------------------------------------------------------------------------- /task-oriented-dialogues/MS-DC/LICENSE.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/DialogStudio/3e54576b4db71c56468ae4591a94ce9e419ed01c/task-oriented-dialogues/MS-DC/LICENSE.pdf -------------------------------------------------------------------------------- /task-oriented-dialogues/MS-DC/README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | MS-DC is a human-annotated conversational data in three domains (movie-ticket booking, restaurant reservation, and taxi booking). 
3 | 4 | 5 | The dataset has been used for the [Microsoft Dialogue Challenge](https://github.com/xiul-msr/e2e_dialog_challenge). Please cite their work if you use the dataset. -------------------------------------------------------------------------------- /task-oriented-dialogues/MetaLWOZ/LICENSE.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/DialogStudio/3e54576b4db71c56468ae4591a94ce9e419ed01c/task-oriented-dialogues/MetaLWOZ/LICENSE.pdf -------------------------------------------------------------------------------- /task-oriented-dialogues/MetaLWOZ/README.md: -------------------------------------------------------------------------------- 1 | A Dataset of Multi-Domain Dialogs for the Fast Adaptation of Conversation Models 2 | 3 | # Intro 4 | Meta-Learning Wizard-of-Oz (MetaLWOz) is a dataset designed to help develop models capable of predicting user responses in unseen domains. It can improve dialog systems, such as those used in voice assistants, to help users accomplish tasks such as booking a flight. This dataset is particularly suited for meta-learning dialog models or fine-tuning models with transfer-learning approaches. This dataset aims to reduce the amount of data required to train domain-specific dialog systems and it is one of the first datasets designed with meta-learning dialog models in mind. 5 | 6 | The problem that MetaLWOz is designed to solve is that neural dialog systems must learn from very large datasets in order to output grammatically correct sentences. This makes it extremely hard to scale the system to new domains that feature limited in-domain data. Moving from booking a table at a restaurant to buying a plane ticket requires knowing very different user queries, for example. 7 | 8 | The role of the user in dialogs is growing as a research focus. 
While a dialog system trained on a large corpus from a source such as Twitter or Reddit may output grammatically correct sentences and stay generally on-topic, it will likely fail to predict the utterances of customers interacting with a goal-oriented bot, such as a travel assistant. 9 | 10 | Conversely, a system trained solely on a small dataset of domain-specific dialogs will fail to produce coherent responses or will overfit on the training dialogs. 11 | 12 | ## A new dataset for training adaptable task-oriented dialog systems 13 | This large dataset was created by crowdsourcing 37,884 goal-oriented dialogs, covering 227 tasks in 47 domains. Domains include bus schedules, apartment search, alarm setting, banking, and event reservation. Each dialog was grounded in a scenario with roles, pairing a person acting as the bot and a person acting as the user. (This is the Wizard of Oz reference—using people behind the curtain who act as the machine). Each pair was given a domain and a task, and instructed to converse for 10 turns to satisfy the user’s queries. For example, if a user asked if a bus stop was operational, the bot would respond that the bus stop had been moved two blocks north, which starts a conversation that addresses the user’s actual need. 14 | 15 | The goal was to automate the user utterances so domain-specific dialog systems are easier to train, requiring less domain-specific data. Combined with large general conversational corpora or smaller goal-oriented datasets like MultiWOz, this dataset can be especially useful for training, fine-tuning, and evaluating dialog systems. 16 | 17 | The MetaLWOz dataset is initially used as one dataset for the [DSTC8 dialog competition](https://github.com/microsoft/dstc8-meta-dialog). Please cite their work if you use the dataset! 
18 | -------------------------------------------------------------------------------- /task-oriented-dialogues/MetaLWOZ/original_examples.json: -------------------------------------------------------------------------------- 1 | [ 2 | "{\"id\": \"c399a493\", \"user_id\": \"c05f0462\", \"bot_id\": \"c96edf42\", \"domain\": \"AGREEMENT_BOT\", \"task_id\": \"a9203a2c\", \"turns\": [\"Hello how may I help you?\", \"i am awesome\", \"of course you are\", \"and i own rental properties on the moon\", \"i doubt you own a property in the moon\", \"just kidding. i own them on Earth\", \"that's a nice joke\", \"because i am a billionaire!\", \"i don't seem to know you\", \"and i programmed you\", \"i am the programmer\"]}", 3 | "{\"id\": \"2888aa3e\", \"user_id\": \"46fe62d7\", \"bot_id\": \"bcc50983\", \"domain\": \"AGREEMENT_BOT\", \"task_id\": \"d47b54df\", \"turns\": [\"Hello how may I help you?\", \"I am the king of the world\", \"I agree that you are the king of the world\", \"I can have any woman I want!\", \"I agree that you can have any woman you desire.\", \"Even you bot, if I were in to AIs\", \"Agreed.\", \"Really? you're awfully agreeable aren't you\", \"I agree that I am awfully agreeable, yes.\", \"Having an agreement bot seems like a useless thing to have. I need some spice in my life!\", \"I really agree with that. 
I am rather useles.\"]}", 4 | "{\"id\": \"17a8685a\", \"user_id\": \"f840ce6a\", \"bot_id\": \"97fcd3ba\", \"domain\": \"AGREEMENT_BOT\", \"task_id\": \"83ad6a66\", \"turns\": [\"Hello how may I help you?\", \"Do you that I am a great person?\", \"Yes!\", \"I am only 6 inches tall.\", \"That's correct!\", \"When I speak the whole world stops to listen to what I say\", \"You can count on it.\", \"I am the Dalai Lama and I am also the Pope\", \"What an accomplishment!\", \"I have more money than Bill Gares *Gates\", \"Yes you do.\"]}", 5 | "{\"id\": \"b9ae2ba5\", \"user_id\": \"ae15d73b\", \"bot_id\": \"843e209d\", \"domain\": \"AGREEMENT_BOT\", \"task_id\": \"83ad6a66\", \"turns\": [\"Hello how may I help you?\", \"I'm older than Methuselah\", \"I know you are\", \"I'm worth 10 trillion dollars\", \"Isn't it great?\", \"I won the Powerball 5 times\", \"I've heard that\", \"I scored more points than Michael Jordan\", \"You did, I read about that\", \"My skin is naturally rainbow colored\", \"that's so true\"]}", 6 | "{\"id\": \"f153593e\", \"user_id\": \"ae15d73b\", \"bot_id\": \"6e6f928c\", \"domain\": \"AGREEMENT_BOT\", \"task_id\": \"a9203a2c\", \"turns\": [\"Hello how may I help you?\", \"I am really awesome\", \"Indeed you are!\", \"I am one of the best looking guys in the world\", \"You are THE best looking in the world!\", \"I am so funny people pass out from laughter when I talk\", \"I can barely breathe! Stop!\", \"I own rental properties on the moon\", \"You probably think so! But that's actually not possible.\", \"I do though I rent them out for $2,000,000 a week\", \"If you are getting that kind of money, all the power to you my friend.\"]}" 7 | ] -------------------------------------------------------------------------------- /task-oriented-dialogues/MetaLWOZ/otgy.json: -------------------------------------------------------------------------------- 1 | { 2 | "update_calendar": "help schedule meetings on a calendar. 
; help schedule meetings on a calendar.", 3 | "order_pizza": "help customers order pizza", 4 | "movie_listings": "you are a a bot designed to provide movie listings", 5 | "event_reserve": "make reservations for events", 6 | "weather_check": "provide information about the weather", 7 | "update_contact": "update cell phone contacts", 8 | "make_restaurant_reservations": "make restaurant reservations", 9 | "edit_playlist": "you are a bot that manages music playlists", 10 | "look_up_info": "fetch information from the internet", 11 | "shopping": "help customers order products from a store's website", 12 | "store_details": "provide information about stores and businesses", 13 | "sports_info": "tell users information about sports", 14 | "quote_of_the_day_bot": "provide a quote of the day", 15 | "how_to_basic": "provide instructions for basic tasks", 16 | "prompt_generator": "provide creative prompts", 17 | "library_request": "you are a librarian bot", 18 | "bank_bot": "you are a bot representing a bank", 19 | "restaurant_picker": "find restaurants in different areas", 20 | "phone_plan_bot": "you are a bot representing a mobile service provider", 21 | "name_suggester": "you are a bot that provides names for things", 22 | "city_info": "you are a bot that provides facts about different cities", 23 | "music_suggester": "give music suggestions", 24 | "agreement_bot": "agree with everything the user says", 25 | "pet_advice": "give out pet advice", 26 | "apartment_finder": "find and compare available apartments in a city", 27 | "guiness_check": "you are a bot that provides the user with world records; provide up to date information about world records", 28 | "geography": "tell users where countries are", 29 | "alarm_set": "set and edit alarms", 30 | "contact_manager": "manage the user's contacts", 31 | "phone_settings": "manage the user's phone's settings", 32 | "appointment_reminder": "remind the user about their appointments", 33 | "home_bot": "you are a bot that manages the 
user's home", 34 | "policy_bot": "you are a bot that provides information about a company's policies", 35 | "decider_bot": "make decisions for the user", 36 | "catalogue_bot": "search a catalogue", 37 | "ski_bot": "you are a bot that helps people book skiing trips", 38 | "bus_schedule_bot": "you are a bot that manages public transit schedules", 39 | "insurance": "you are a bot that represents an insurance company", 40 | "what_is_it": "you are a bot that helps the user remember what a thing is.", 41 | "auto_sort": "you are a bot that sorts things", 42 | "scam_lookup": "provide information about various scams", 43 | "time_zone": "you are a bot that gives information about time zones", 44 | "play_times": "you are a bot that helps schedule shows during a theatre festival", 45 | "game_rules": "you are a bot that clarifies the rules for games", 46 | "wedding_planner": "you are a bot that helps plan weddings", 47 | "check_status": "you are a bot that checks the status of things", 48 | "present_ideas": "you are a bot that provides advice on gift giving", 49 | "booking_flight": "book flights", 50 | "hotel_reserve": "book rooms in a hotel", 51 | "vacation_ideas": "you are a bot that provides ideas for vacations and trips, but you are not able to book them", 52 | "tourism": "you are a bot that provides tourism related advice" 53 | } -------------------------------------------------------------------------------- /task-oriented-dialogues/MuDoCo/original_examples.json: -------------------------------------------------------------------------------- 1 | [] -------------------------------------------------------------------------------- /task-oriented-dialogues/MulDoGO/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /task-oriented-dialogues/MulDoGO/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *master* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. 
Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | 61 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. 
62 | -------------------------------------------------------------------------------- /task-oriented-dialogues/MulDoGO/NOTICE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /task-oriented-dialogues/MulDoGO/README.md: -------------------------------------------------------------------------------- 1 | ## Data from "Multi-Domain Goal-Oriented Dialogues (MultiDoGO): Strategies toward Curating and Annotating Large Scale Dialogue Data" 2 | 3 | ### Repository Structure 4 | 5 | Under the top level ./data directory, you will find the following two sub-directories: 6 | 7 | #### 1. unannotated: 8 | 9 | unannotated human to human conversations from the airline, fastfood, finance, insurance, media, and software domains. Conversations are split by domain and given in TSV format with columns: "conversationId", "turnNumber", "utteranceId", "utterance", "authorRole". 10 | 11 | #### 2. paper_splits: 12 | 13 | pre-processed training, development, and test splits for customer turns used to obtain intent classification and slot-labeling results in Table 7 of the paper. As in the paper, we partition these data by annotation granularity, either sentence level (located at ./data/paper_splits/splits_annotated_at_sentence_level) or turn level (located at ./data/paper_splits/splits_annotated_at_turn_level). Under each annotation granularity subdirectory, we provide splits for each domain: airline, fastfood, finance, insurance, media, and software. The splits are labeled as "train.tsv", "dev.tsv", "test.tsv" and contain the following tab separated columns: "conversationId", "turnNumber", "sentenceNumber" (only for sentence level splits), "utteranceId", "utterance", "slot-labels", and "intent". The labels in the slot-labels field are separated by spaces. 
In the case of multiple intents for a single input, we separate the intents with the special token \. 14 | 15 | ## License 16 | 17 | This project is licensed under the CDLA Permissive License. Terms given in LICENSE.txt. 18 | 19 | ## Reference 20 | 21 | For reference please cite our EMNLP-2019 paper: [Multi-Domain Goal-Oriented Dialogues (MultiDoGO): Strategies toward Curating and Annotating Large Scale Dialogue Data](https://www.aclweb.org/anthology/D19-1460/) (BibTex below) 22 | 23 | ``` 24 | @inproceedings{peskov-etal-2019-multi, 25 | title = "Multi-Domain Goal-Oriented Dialogues ({M}ulti{D}o{GO}): Strategies toward Curating and Annotating Large Scale Dialogue Data", 26 | author = "Peskov, Denis and Clarke, Nancy and Krone, Jason and Fodor, Brigi and Zhang, Yi and Youssef, Adel and Diab, Mona", 27 | booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)", 28 | year = "2019", 29 | publisher = "Association for Computational Linguistics", 30 | url = "https://www.aclweb.org/anthology/D19-1460", 31 | doi = "10.18653/v1/D19-1460", 32 | pages = "4526--4536", 33 | } 34 | ``` 35 | -------------------------------------------------------------------------------- /task-oriented-dialogues/MulDoGO/original_examples.json: -------------------------------------------------------------------------------- 1 | [ 2 | " conversationId turnNumber utteranceId utterance authorRole filename", 3 | "acs-c67f7266-316e-4a0a-95b0-112befd4743b-1 0 acs-e174a455-6bbd-4f65-b436-f6b99413c9f6 Happy Morning customer airline", 4 | "acs-c67f7266-316e-4a0a-95b0-112befd4743b-1 1 acs-95ecd428-7e83-41e8-b1e5-8ae516974cb1 A very Good Morning! You've been connected to StarAirwyas. How may I assist you? 
agent airline", 5 | "acs-c67f7266-316e-4a0a-95b0-112befd4743b-1 2 acs-7bbbd3bf-6063-4c27-bc75-9b48eb84eeba Please help me in giving my seat number for my flight customer airline", 6 | "acs-c67f7266-316e-4a0a-95b0-112befd4743b-1 3 acs-9000600b-cd82-443e-a8b2-630a6c236e47 I'll help you out with that. May I know your full name and the booking confirmation number? agent airline", 7 | "acs-c67f7266-316e-4a0a-95b0-112befd4743b-1 4 acs-83340346-9339-4238-869e-b4062afb3f4c I am Leeann Elisha and booking number is W5X1K3 customer airline", 8 | "acs-c67f7266-316e-4a0a-95b0-112befd4743b-1 5 acs-fbfc3acd-ad99-4588-b628-24700bad2d43 Elisha according to my sources you've been assigned 9C which is a window seat. agent airline" 9 | ] -------------------------------------------------------------------------------- /task-oriented-dialogues/OpenDialKG/README.md: -------------------------------------------------------------------------------- 1 | # OpenDialKG 2 | 3 | OpenDialKG is a dataset of conversations between two crowdsourcing agents engaging in a dialog about a given topic. Each dialog turn is paired with its corresponding “KG paths” that weave together the KG entities and relations that are mentioned in the dialog. More details can be found in the following paper: 4 | 5 | Seungwhan Moon, Pararth Shah, Anuj Kumar, Rajen Subba. ["OpenDialKG: Explainable Conversational Reasoning with Attention-based Walks over Knowledge Graphs"](https://www.aclweb.org/anthology/P19-1081.pdf), ACL (2019). 6 | 7 | ## Data Format 8 | 9 | The dataset release includes two parts: (1) the Dialog-KG Path Parallel Corpus where each dialog turn is paired with KG paths that connect its previous turn (annotated by chat participants themselves), and (2) the base knowledge graph used in both the dialog collection and in the experiments, which is a subset of the [Freebase Easy data](http://freebase-easy.cs.uni-freiburg.de/dump/). 
The data are made available in the following files: 10 | ``` 11 | [Dialog-KG Parallel Corpus] 12 | - ./data/opendialkg.csv 13 | 14 | [KG] 15 | - ./data/opendialkg_entities.txt 16 | - ./data/opendialkg_relations.txt 17 | - ./data/opendialkg_triples.txt 18 | ``` 19 | 20 | The Dialog-KG Parallel Corpus (`./data/opendialkg.csv`) is formatted as a csv file, where columns are: `Messages, User Rating, Assistant Rating`. Each row refers to a dialog session, which is a JSON-formatted `list` of each action formatted as follows: 21 | ``` 22 | { 23 | "type": // indicating whether it's a message ("chat") or a KG walk selection action ("action") 24 | "sender": // indicating whether it is sent by "user" or "assistant" 25 | "message" (Optional): // raw utterance (for "type": "chat"), 26 | "metadata" (Optional): { 27 | "path": [ 28 | // path score, 29 | // list of KG triples (subject, relation, object) that make up the path, 30 | // rendering of the path 31 | ] 32 | } // end of KG path JSON (if available) 33 | }, ... // end of each action JSON 34 | ``` 35 | 36 | Note that the path annotation refers to the connection of two adjacent turns on the conceptual level. Given `utterance_1`, `utterance_2`, and their annotated entity path `A -> B -> C` that connects `utterance_1` and `utterance_2`, Entity `A` is assumed to be mentioned in `utterance_1`, and `C` to be mentioned in `utterance_2`. Entity `B` doesn't necessarily have to be mentioned since it is an intermediate step in the path. Note also that it is a paraphrased dataset, thus each mention is not enforced to have an exact surface match with its corresponding entity in the knowledge graph. After pre-processing and quality reviews we release 13,802 dialog sessions (91,209 turns) across two tasks (Chit-chat and Recommendations) and four domains (movie, book, sports, and music). 
37 | 38 | All bi-directional KG triples used in the dataset collection and in the experiments (100,813 entities, 1358 relations, 1,190,658 triples) are included in `./data/opendialkg_triples.txt`, formatted as line-separated triples with tab-separated entities and relations: 39 | ``` 40 | subject \t relation \t object \n 41 | ... 42 | ``` 43 | 44 | All entities and relations are also listed in `./data/opendialkg_entities.txt` and `./data/opendialkg_relations.txt`, respectively. The prefix `~` in `opendialkg_relations.txt` refers to reverse relations. 45 | 46 | ## Reference 47 | 48 | To cite this work please use: 49 | ``` 50 | @InProceedings{Moon2019opendialkg, 51 | author = {Seungwhan Moon and Pararth Shah and Anuj Kumar and Rajen Subba}, 52 | title = {OpenDialKG: Explainable Conversational Reasoning with Attention-based Walks over Knowledge Graphs}, 53 | booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics}, 54 | month = {July}, 55 | year = {2019}, 56 | } 57 | ``` 58 | 59 | ## License 60 | OpenDialKG is released under [CC-BY-NC-4.0](https://creativecommons.org/licenses/by-nc/4.0/legalcode), see [LICENSE](LICENSE) for details. -------------------------------------------------------------------------------- /task-oriented-dialogues/README.md: -------------------------------------------------------------------------------- 1 | ### Task Oriented Dialogues 2 | 3 | Below is a general format for task oriented dialogues: 4 | 5 | ```js 6 | { 7 | "dataset_name--train/val/test--dialog_id": { 8 | "original dialog id": str, 9 | "dialog index": int, 10 | "original dialog info": dict, 11 | "log": [ 12 | { 13 | "turn id": int, 14 | "user utterance": str, 15 | "system response": str, 16 | "dialog history": str, 17 | "original user side information": dict, 18 | "original system side information": dict, 19 | "dst": str, 20 | "dst accumulated": str 21 | }, 22 | ... 
23 | ], 24 | "external knowledge non-flat": { 25 | "metadata": dict, 26 | "slots and values": dict 27 | "intents": dict, 28 | ... 29 | }, 30 | "external knowledge": str, 31 | "intent knowledge": str, 32 | "prompt": [ 33 | "This is a bot helping users to get navigation. Given the dialog context and external database, please generate a relevant system response for the user.", 34 | ... 35 | ] 36 | }, 37 | ... 38 | } 39 | ``` 40 | 41 | In general, datasets have the "external knowledge non-flat" and "external knowledge" in the whole dialogue level. There are also some datasets where every turn in "log" has own "external knowledge non-flat" and "external knowledge". 42 | 43 | Here are datasets with turn-level "external knowledge": 44 | ``` 45 | 'SimJointGEN', 'BiTOD', 'OpenDialKG', 'SimJointMovie', 'MS-DC', 'STAR', 'SimJointRestaurant', 'Taskmaster1', 'Taskmaster2', 'Taskmaster3' 46 | ``` 47 | And below is a general format for such datasets: 48 | ```js 49 | { 50 | "dataset_name--train/val/test--dialog_id": { 51 | "original dialog id": str, 52 | "dialog index": int, 53 | "original dialog info": dict, 54 | "log": [ 55 | { 56 | "turn id": int, 57 | "user utterance": str, 58 | "system response": str, 59 | "dialog history": str, 60 | "original user side information": dict, 61 | "original system side information": dict, 62 | "dst": str, 63 | "dst accumulated": str 64 | "external knowledge non-flat": list, 65 | "external knowledge": str, 66 | }, 67 | ... 68 | ] 69 | "prompt": [ 70 | "This is a bot helping users to get navigation. Given the dialog context and external database, please generate a relevant system response for the user.", 71 | ... 72 | ] 73 | }, 74 | ... 75 | } 76 | ``` 77 | Please refer to each dataset folder for more details. 
78 | -------------------------------------------------------------------------------- /task-oriented-dialogues/STAR/LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Rasa Technologies GmbH 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /task-oriented-dialogues/STAR/README.md: -------------------------------------------------------------------------------- 1 | # STAR: A Schema-Guided Dialog Dataset for Transfer Learning 2 | 3 | This dataset and how it came to be, along with some baseline models, are described [in this paper](https://arxiv.org/abs/2010.11853). 
4 | 5 | ## Data Format 6 | 7 | Each JSON file in the `dialogues` directory contains one dialogue in the following format: 8 | 9 | | Key | Value | 10 | |----------------------------|-----------------------------------------------------------------------------------| 11 | | "AnonymizedUserWorkerID" | String that is unique for each worker but unrelated to the worker's AMT Worker ID | 12 | | "AnonymizedWizardWorkerID" | String that is unique for each worker but unrelated to the worker's AMT Worker ID | 13 | | "BatchID" | We collected dialogues in batches, identified by this ID | 14 | | "CompletionLevel" | Can be "Complete", "EarlyDisconnectDuringDialogue", or "DisconnectDuringDialogue" | 15 | | "DialogueID" | Unique ID of this dialogue | 16 | | "Events" | List of events representing the dialogue | 17 | | "FORMAT-VERSION" | | 18 | | "Scenario" | Dictionary containing information about the scenario of this dialogue | 19 | | "UserQuestionnaire" | List of question/answer pairs for questions given to the user | 20 | | "WizardQuestionnaire" | List of question/answer pairs for questions given to the wizard | 21 | 22 | 23 | ## Citation 24 | 25 | Please use the following bibtex entry if you are using STAR for your research: 26 | ``` 27 | 28 | @article{mosig2020star, 29 | author = {Johannes E. M. 
Mosig and Shikib Mehri and Thomas Kober}, 30 | title = "{STAR: A Schema-Guided Dialog Dataset for Transfer Learning}", 31 | journal = {arXiv e-prints}, 32 | keywords = {Computer Science - Computation and Language}, 33 | year = 2020, 34 | month = oct, 35 | eid = {arXiv:2010.11853}, 36 | archivePrefix = {arXiv}, 37 | eprint = {2010.11853}, 38 | primaryClass = {cs.CL}, 39 | } 40 | ``` -------------------------------------------------------------------------------- /task-oriented-dialogues/STAR/otgy.json: -------------------------------------------------------------------------------- 1 | [] -------------------------------------------------------------------------------- /task-oriented-dialogues/SalesBot/README.md: -------------------------------------------------------------------------------- 1 | # SalesBot: Transitioning from Chit-Chat to Task-Oriented Dialogues 2 | 3 | ## Framework 4 |

    5 | 6 |

    7 | This paper focuses on investigating the conversations starting from open-domain social chatting and then gradually transitioning to task-oriented purposes, and releases a large-scale dataset with detailed annotations for encouraging this research direction. To achieve this goal, this paper proposes a framework to automatically generate many dialogues without human involvement, in which any powerful open-domain dialogue generation model can be easily leveraged. 8 | 9 | ## Dependency 10 | Check the packages needed or simply run the command 11 | ```console 12 | conda env create -f environment.yml 13 | ``` 14 | 15 | ## Data 16 | * selfchat: 17 | ```console 18 | mkdir selfchat 19 | parlai self_chat --model-file zoo:blender/blender_1Bdistill/model --inference nucleus --num-self-chats 20 --task blended_skill_talk --include-personas True --include-initial-utterances True --outfile selfchat/merge_sgd_20.json 20 | parlai self_chat --model-file zoo:blender/blender_1Bdistill/model --inference nucleus --num-self-chats 20 --task blended_skill_talk --include-personas True --include-initial-utterances True --outfile selfchat/simulators_20.json 21 | ``` 22 | * intent detection model: 23 | ```console 24 | python3 qa_inference.py --data_file selfchat/merge_sgd_20.jsonl --output_file merge_sgd_intent.json --device 0 25 | python3 qa_inference.py --data_file selfchat/simulators_20.jsonl --output_file simulators_intent.json --device 0 26 | ``` 27 | * task-oriented simulators: 28 | ```console 29 | python3 combine_simulators.py simulators_intent.json 30 | ``` 31 | * merge SGD: 32 | ```console 33 | # SGD_delex is the version preprocessed by "ACCENTOR: Adding Chit-Chat to Enhance Task-Oriented Dialogues" 34 | unzip SGD_delex 35 | mkdir sgd_intent_dialog 36 | python3 collect_sgd_intent.py SGD_delex 37 | python3 combine_sgd.py merge_sgd_intent.json 38 | 39 | ``` 40 | * transition: 41 | ```console 42 | python3 transition.py combine_sgd.json 43 | python3 transition.py 
combine_simulators.json 44 | ``` 45 | 46 | ## Citation 47 | 48 | Please cite our paper if you use SalesBot in your work: 49 | 50 | ```bibtex 51 | @inproceedings{chiu2022salesbot, 52 | title={{SalesBot}: Transitioning from Chit-Chat to Task-Oriented Dialogues}, 53 | author={Chiu, Ssu and Li, Maolin and Lin, Yen-Ting and Chen, Yun-Nung}, 54 | booktitle={Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (ACL)}, 55 | year={2022} 56 | } 57 | ``` 58 | -------------------------------------------------------------------------------- /task-oriented-dialogues/SalesBot/otgy.json: -------------------------------------------------------------------------------- 1 | [ 2 | "GetTimesForMovie", 3 | "LookupSong", 4 | "FindMovies", 5 | "LookupMusic", 6 | "PlaySong", 7 | "FindAttractions" 8 | ] -------------------------------------------------------------------------------- /task-oriented-dialogues/SimJointGEN/README.md: -------------------------------------------------------------------------------- 1 | ## Simulator Generated Dataset (sim-GEN) 2 | 3 | This directory contains an expanded set of dialogues generated via dialogue 4 | self-play between a user simulator and a system agent, as follows: 5 | 6 | - The dialogues collected using the M2M framework for the movie ticket booking 7 | task (sim-M) are used as a seed set to form a crowd-sourced corpus of 8 | natural language utterances for the user and the system agents. 9 | - Subsequently, many more dialogue outlines are generated using self-play 10 | between the simulated user and system agent. 11 | - The dialogue outlines are converted to natural language dialogues by 12 | replacing each dialogue act in the outline with an utterance sampled from 13 | the set of crowd-sourced utterances collected with M2M. 
14 | 15 | In this manner, we can generate an arbitrarily large number of dialogue outlines 16 | and convert them automatically to natural language dialogues without any 17 | additional crowd-sourcing step. Although the diversity of natural language in 18 | the dataset does not increase, the number of unique dialogue states present in 19 | the dataset will increase since a larger variety of dialogue outlines will be 20 | available in the expanded dataset. 21 | 22 | This dataset was used for experiments reported in [this 23 | paper](https://arxiv.org/abs/1804.06512). Please cite the paper if you use or 24 | discuss sim-GEN in your work: 25 | 26 | ```shell 27 | @article{liu2018dialogue, 28 | title={Dialogue Learning with Human Teaching and Feedback in End-to-End Trainable Task-Oriented Dialogue Systems}, 29 | author={Liu, Bing and Tur, Gokhan and Hakkani-Tur, Dilek and Shah, Pararth and Heck, Larry}, 30 | journal={NAACL}, 31 | year={2018} 32 | } 33 | ``` 34 | 35 | ## Data format 36 | 37 | The data splits are made available as a .zip file containing dialogues in JSON 38 | format. Each dialogue object contains the following fields: 39 | 40 | * **dialogue\_id** - *string* unique identifier for each dialogue. 41 | * **turns** - *list* of turn objects: 42 | * **system\_acts** - *list* of system dialogue acts for this system turn: 43 | * **name** - *string* system act name 44 | * **slot\_values** - *optional dictionary* mapping slot names to 45 | values 46 | * **system\_utterance** - *string* natural language utterance 47 | corresponding to the system acts for this turn 48 | * **user\_utterance** - *string* natural language user utterance following 49 | the system utterance in this turn 50 | * **dialogue\_state** - *dictionary* ground truth slot-value mapping after 51 | the user utterance 52 | * **database\_state** - database results based on current dialogue state: 53 | * **scores** - *list* of scores, between 0.0 and 1.0, of top 5 54 | database results. 
1.0 means matches all constraints and 0.0 means no 55 | match 56 | * **has\_more\_results** - *boolean* whether backend has more matching 57 | results 58 | * **has\_no\_results** - *boolean* whether backend has no matching 59 | results 60 | 61 | An additional file **db.json** is provided which contains the set of values for 62 | each slot. 63 | 64 | Note: The date values in the dataset are normalized as the constants, 65 | "base_date_plus_X", for X from 0 to 6. X=0 corresponds to the current date (i.e. 66 | 'today'), X=1 is 'tomorrow', etc. This is done to allow handling of relative 67 | references to dates (e.g. 'this weekend', 'next Wednesday', etc). The parsing of 68 | such phrases should be done as a separate pre-processing step. 69 | -------------------------------------------------------------------------------- /task-oriented-dialogues/SimJointGEN/db.json: -------------------------------------------------------------------------------- 1 | { 2 | "date": [ 3 | "DontCare", 4 | "base_date_plus_0", 5 | "base_date_plus_1", 6 | "base_date_plus_2", 7 | "base_date_plus_3", 8 | "base_date_plus_4", 9 | "base_date_plus_5", 10 | "base_date_plus_6" 11 | ], 12 | "movie": [ 13 | "AMC_Eastridge_15", 14 | "A_Man_Called_Ove", 15 | "Achcham_Yenbadhu_Madamaiyada", 16 | "Ae_Dil_Hai_Mushkil", 17 | "Almost_Christmas", 18 | "American_Pastoral", 19 | "Aquarius", 20 | "Arrival", 21 | "Bakit_lahat_ng_guwapo_may_boyfriend?!", 22 | "Boo!_A_Madea_Halloween", 23 | "Certain_Women", 24 | "Chaar_Sahibzaade", 25 | "Deepwater_Horizon", 26 | "Denial", 27 | "Desierto", 28 | "Disney_Junior_at_the_Movies_with_Mickey!", 29 | "Doctor_Strange", 30 | "Doctor_Who_(Animated)", 31 | "DontCare", 32 | "From_Dusk_till_Dawn", 33 | "Gimme_Danger", 34 | "Hacksaw_Ridge", 35 | "Hell_or_High_Water", 36 | "Inferno", 37 | "Jack_Reacher", 38 | "Keeping_Up_with_the_Joneses", 39 | "Kevin_Hart", 40 | "Kubo_and_the_Two_Strings", 41 | "LUCK-KEY", 42 | "Loving", 43 | "Mad_Max", 44 | "Middle_School", 45 | "Miss_Hokusai", 
46 | "Miss_Peregrine's_Home_for_Peculiar_Children", 47 | "Moonlight", 48 | "Mr._Donkey", 49 | "Ouija", 50 | "Queen_of_Katwe", 51 | "Rock_On_2", 52 | "Sahasam_Swasaga_Sagipo", 53 | "Shut_In", 54 | "Space_Jam", 55 | "Storks", 56 | "Sully", 57 | "The_Accountant", 58 | "The_Eagle_Huntress", 59 | "The_Girl_on_the_Train", 60 | "The_Handmaiden", 61 | "The_Love_Witch", 62 | "The_Magnificent_Seven", 63 | "The_Third_Party", 64 | "Train_To_Busan", 65 | "Trolls" 66 | ], 67 | "num_tickets": [ 68 | "2", 69 | "3", 70 | "4", 71 | "5", 72 | "6" 73 | ], 74 | "theatre_name": [ 75 | "AMC_Cupertino_Square_16", 76 | "AMC_Mercado_20", 77 | "AMC_Newpark_12", 78 | "AMC_Saratoga_14", 79 | "Camera_7", 80 | "Century_20_Great_Mall", 81 | "Century_25_Union_City_and_XD", 82 | "Century_Cinemas_16", 83 | "Century_at_Pacific_Commons", 84 | "CineArts_@_Santana_Row", 85 | "CineLux_Almaden_Cafe_&_Lounge", 86 | "CineLux_Plaza_Theatre", 87 | "Cinemark_12_Downtown_San_Mateo", 88 | "Cinemark_Redwood_Downtown_20_and_XD", 89 | "DontCare" 90 | ], 91 | "time": [ 92 | "10:00_am", 93 | "10:00_pm", 94 | "12:00_pm", 95 | "1:00_pm", 96 | "2:00_pm", 97 | "4:00_pm", 98 | "6:00_pm", 99 | "7:15_pm", 100 | "8:00_am", 101 | "8:00_pm", 102 | "DontCare" 103 | ] 104 | } -------------------------------------------------------------------------------- /task-oriented-dialogues/SimJointMovie/README.md: -------------------------------------------------------------------------------- 1 | ## Structure of the Data 2 | 3 | Each dialogue is represented as a json object with the following fields: 4 | 5 | * **dialogue\_id** - A unique identifier for a dialogue. 6 | * **turns** - A list of annotated agent and user utterance pairs having the 7 | following fields: 8 | * **system\_acts** - A list of system actions. An action consists of an 9 | action type, and optional slot and value arguments. Each action has the 10 | following fields: 11 | * **type** - An action type. Possible values are listed below. 
12 | * **slot** - Optional slot argument. 13 | * **value** - Optional value argument. If value is present, slot must 14 | be present. 15 | * **system\_utterance** - The system utterance having the following 16 | fields. 17 | * **text** - The text of the utterance. 18 | * **tokens** - A list containing tokenized version of text. 19 | * **slots** - A list containing locations of mentions of values 20 | corresponding to slots in the utterance, having the following 21 | fields: 22 | * **slot** - The name of the slot 23 | * **start** - The index of the first token corresponding to a slot 24 | value in the tokens list. 25 | * **exclusive\_end** - The index of the token succeeding the last 26 | token corresponding to the slot value in the tokens list. In 27 | python, `tokens[start:exclusive_end]` gives the tokens for slot 28 | value. 29 | * **user\_acts** - A list of user actions. Has the same structure as 30 | system\_acts. 31 | * **user\_utterance** - The user utterance. It has three fields, similar 32 | to system\_utterance. 33 | * **user_intents** - A list of user intents specified in the current turn. 34 | Possible values are listed below. 35 | * **dialogue\_state** - Contains the preferences for the different slots 36 | as specified by the user upto the current turn of the dialogue. 37 | Represented as a list containing: 38 | * **slot** - The name of the slot. 39 | * **value** - The value assigned to the slot. 40 | 41 | The list of action types is inspired from the Cambridge dialogue act schema 42 | ([DSTC2 Handbook](http://camdial.org/~mh521/dstc/downloads/handbook.pdf), Pg 19) 43 | . 
The possible values are: 44 | 45 | * AFFIRM 46 | * CANT\_UNDERSTAND 47 | * CONFIRM 48 | * INFORM 49 | * GOOD\_BYE 50 | * GREETING 51 | * NEGATE 52 | * OTHER 53 | * NOTIFY\_FAILURE 54 | * NOTIFY\_SUCCESS 55 | * OFFER 56 | * REQUEST 57 | * REQUEST\_ALTS 58 | * SELECT 59 | * THANK\_YOU 60 | 61 | The possible values of user intents are: 62 | 63 | * BUY\_MOVIE\_TICKETS 64 | -------------------------------------------------------------------------------- /task-oriented-dialogues/SimJointRestaurant/README.md: -------------------------------------------------------------------------------- 1 | ## Structure of the Data 2 | 3 | Each dialogue is represented as a json object with the following fields: 4 | 5 | * **dialogue\_id** - A unique identifier for a dialogue. 6 | * **turns** - A list of annotated agent and user utterance pairs having the 7 | following fields: 8 | * **system\_acts** - A list of system actions. An action consists of an 9 | action type, and optional slot and value arguments. Each action has the 10 | following fields: 11 | * **type** - An action type. Possible values are listed below. 12 | * **slot** - Optional slot argument. 13 | * **value** - Optional value argument. If value is present, slot must 14 | be present. 15 | * **system\_utterance** - The system utterance having the following 16 | fields. 17 | * **text** - The text of the utterance. 18 | * **tokens** - A list containing tokenized version of text. 19 | * **slots** - A list containing locations of mentions of values 20 | corresponding to slots in the utterance, having the following 21 | fields: 22 | * **slot** - The name of the slot 23 | * **start** - The index of the first token corresponding to a slot 24 | value in the tokens list. 25 | * **exclusive\_end** - The index of the token succeeding the last 26 | token corresponding to the slot value in the tokens list. In 27 | python, `tokens[start:exclusive_end]` gives the tokens for slot 28 | value. 29 | * **user\_acts** - A list of user actions. 
Has the same structure as 30 | system\_acts. 31 | * **user\_utterance** - The user utterance. It has three fields, similar 32 | to system\_utterance. 33 | * **user_intents** - A list of user intents specified in the current turn. 34 | Possible values are listed below. 35 | * **dialogue\_state** - Contains the preferences for the different slots 36 | as specified by the user upto the current turn of the dialogue. 37 | Represented as a list containing: 38 | * **slot** - The name of the slot. 39 | * **value** - The value assigned to the slot. 40 | 41 | The list of action types is inspired from the Cambridge dialogue act schema 42 | ([DSTC2 Handbook](http://camdial.org/~mh521/dstc/downloads/handbook.pdf), Pg 19) 43 | . The possible values are: 44 | 45 | * AFFIRM 46 | * CANT\_UNDERSTAND 47 | * CONFIRM 48 | * INFORM 49 | * GOOD\_BYE 50 | * GREETING 51 | * NEGATE 52 | * OTHER 53 | * NOTIFY\_FAILURE 54 | * NOTIFY\_SUCCESS 55 | * OFFER 56 | * REQUEST 57 | * REQUEST\_ALTS 58 | * SELECT 59 | * THANK\_YOU 60 | 61 | The possible values of user intents are: 62 | 63 | * FIND\_RESTAURANT 64 | * RESERVE\_RESTAURANT 65 | -------------------------------------------------------------------------------- /task-oriented-dialogues/SimJointRestaurant/otgy.json: -------------------------------------------------------------------------------- 1 | { 2 | "slots": { 3 | "num_people": [ 4 | "4", 5 | "6", 6 | "3", 7 | "5", 8 | "2" 9 | ], 10 | "restaurant_name": [ 11 | "oren hummus", 12 | ".ink", 13 | "deep blue", 14 | "ephesus", 15 | "pepper", 16 | "amber india", 17 | "angel", 18 | "cetrella", 19 | "cheese cake factory", 20 | "acorn", 21 | "amarin", 22 | "sumiko", 23 | "pompei", 24 | "sushi boat", 25 | "cascal", 26 | "los altos grill", 27 | "the hudson room", 28 | "amber", 29 | "sakoon", 30 | "sweet greens", 31 | "the nest", 32 | "oren", 33 | "the ivy", 34 | "il fornaio", 35 | "boats", 36 | "high rooftop lounge", 37 | "the view", 38 | "evvia" 39 | ], 40 | "date": [ 41 | "wednesday", 42 | 
"tomorrow", 43 | "tonight", 44 | "friday", 45 | "next monday" 46 | ], 47 | "time": [ 48 | "8 pm", 49 | "6.15 pm", 50 | "7.30 pm", 51 | "6 pm", 52 | "5 pm", 53 | "7 pm", 54 | "5.30 pm", 55 | "8.30 pm", 56 | "dontcare", 57 | "7.15 pm", 58 | "6.30 pm" 59 | ], 60 | "meal": [ 61 | "dinner", 62 | "brunch", 63 | "breakfast", 64 | "dontcare", 65 | "lunch" 66 | ], 67 | "location": [ 68 | "redmond", 69 | "madison", 70 | "morristown", 71 | "orlando", 72 | "kirkland", 73 | "yorktown heights", 74 | "middletown", 75 | "los altos", 76 | "mountain view" 77 | ], 78 | "price_range": [ 79 | "moderately priced", 80 | "dontcare", 81 | "inexpensive", 82 | "expensive" 83 | ], 84 | "category": [ 85 | "mediterranean", 86 | "italian", 87 | "thai", 88 | "dontcare", 89 | "taiwanese", 90 | "french", 91 | "indian", 92 | "greek", 93 | "vietnamese", 94 | "chinese" 95 | ], 96 | "rating": [ 97 | "good", 98 | "zagat rated", 99 | "dontcare", 100 | "michelin rated" 101 | ] 102 | }, 103 | "intents": [ 104 | "GREETING", 105 | "REQUEST_ALTS", 106 | "OTHER", 107 | "CANT_UNDERSTAND", 108 | "NEGATE", 109 | "THANK_YOU", 110 | "GOOD_BYE", 111 | "INFORM", 112 | "AFFIRM" 113 | ] 114 | } -------------------------------------------------------------------------------- /task-oriented-dialogues/Taskmaster1/LICENSE.md: -------------------------------------------------------------------------------- 1 | Creative Commons License
    This work is licensed under a Creative Commons Attribution 4.0 International License. -------------------------------------------------------------------------------- /task-oriented-dialogues/Taskmaster2/LICENSE.md: -------------------------------------------------------------------------------- 1 | Creative Commons License
    This work is licensed under a Creative Commons Attribution 4.0 International License. -------------------------------------------------------------------------------- /task-oriented-dialogues/Taskmaster3/LICENSE.md: -------------------------------------------------------------------------------- 1 | Creative Commons License
    This work is licensed under a Creative Commons Attribution 4.0 International License. -------------------------------------------------------------------------------- /task-oriented-dialogues/WOZ2_0/README.md: -------------------------------------------------------------------------------- 1 | # Neural Belief Tracker 2 | 3 | Contact: Nikola Mrkšić (nikola.mrksic@gmail.com) 4 | 5 | An implementation of the Fully Data-Driven version of the Neural Belief Tracking (NBT) model (ACL 2018, [Fully Statistical Neural Belief Tracking](https://arxiv.org/abs/1805.11350)). 6 | 7 | This version of the model uses a learned belief state update in place of the rule-based mechanism used in the original paper. Requests are not a focus of this paper and should be ignored in the output. 8 | 9 | ### Configuring the Tool 10 | 11 | The config file in the config directory specifies the model hyperparameters, training details, dataset, ontologies, etc. 12 | 13 | ### Running Experiments 14 | 15 | train.sh and test.sh can be used to train and test the model (using the default config file). 16 | track.sh uses the trained models to 'simulate' a conversation where the developer can enter sequential user turns and observe the change in belief state. 
17 | 18 | 19 | -------------------------------------------------------------------------------- /task-oriented-dialogues/WOZ2_0/otgy.json: -------------------------------------------------------------------------------- 1 | { 2 | "food": [ 3 | "moroccan", 4 | "cuban", 5 | "mexican", 6 | "corsica", 7 | "irish", 8 | "australian", 9 | "north american", 10 | "fusion", 11 | "european", 12 | "indonesian", 13 | "japanese", 14 | "scottish", 15 | "basque", 16 | "swedish", 17 | "spanish", 18 | "jamaican", 19 | "persian", 20 | "turkish", 21 | "thai", 22 | "chinese", 23 | "vegetarian", 24 | "world", 25 | "scandinavian", 26 | "venetian", 27 | "dontcare", 28 | "modern european", 29 | "indian", 30 | "international", 31 | "creative", 32 | "australasian", 33 | "english", 34 | "korean", 35 | "austrian", 36 | "unusual", 37 | "lebanese", 38 | "asian oriental", 39 | "seafood", 40 | "portuguese", 41 | "afghan", 42 | "canapes", 43 | "russian", 44 | "bistro", 45 | "british", 46 | "brazilian", 47 | "steakhouse", 48 | "singaporean", 49 | "catalan", 50 | "french", 51 | "hungarian", 52 | "german", 53 | "vietnamese", 54 | "polynesian", 55 | "mediterranean", 56 | "barbeque", 57 | "italian", 58 | "cantonese", 59 | "panasian", 60 | "danish", 61 | "belgian", 62 | "malaysian", 63 | "caribbean", 64 | "crossover", 65 | "greek", 66 | "halal", 67 | "traditional", 68 | "kosher", 69 | "tuscan", 70 | "romanian", 71 | "swiss", 72 | "gastropub", 73 | "african", 74 | "polish", 75 | "corsican", 76 | "christmas", 77 | "welsh", 78 | "eritrean" 79 | ], 80 | "area": [ 81 | "north", 82 | "dontcare", 83 | "centre", 84 | "south", 85 | "east side", 86 | "east", 87 | "center", 88 | "west" 89 | ], 90 | "request": [ 91 | "phone", 92 | "price range", 93 | "address", 94 | "area", 95 | "name", 96 | "postcode", 97 | "food" 98 | ], 99 | "price range": [ 100 | "dontcare", 101 | "cheap", 102 | "expensive", 103 | "moderate" 104 | ] 105 | } --------------------------------------------------------------------------------