├── README.md ├── LICENSE └── cornelldata.py /README.md: -------------------------------------------------------------------------------- 1 | # textutil-preprocess-cornell-movie-corpus 2 | textutil-preprocess-cornell-movie-corpus 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 floydhub 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /cornelldata.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function, unicode_literals 2 | import argparse 3 | import codecs 4 | import csv 5 | import os 6 | 7 | """ 8 | Load the cornell movie dialog corpus. 
"""Parse the Cornell Movie-Dialogs Corpus into question/answer sentence pairs.

Corpus available from:
    http://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html
"""


def loadLines(fileName, fields):
    """
    Load ``movie_lines.txt`` and split every record into named fields.

    Args:
        fileName (str): path of the movie-lines file to load
        fields (list of str): names for the positional fields of each record
            (must contain 'lineID')
    Return:
        dict of dict: one field-dict per line, keyed by its 'lineID'
    """
    lines = {}

    # The corpus ships as iso-8859-1, not utf-8.
    with open(fileName, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")

            # Pair each positional value with its field name.
            lineObj = {}
            for i, field in enumerate(fields):
                lineObj[field] = values[i]

            lines[lineObj['lineID']] = lineObj

    return lines

def loadConversations(fileName, lines, fields):
    """
    Load ``movie_conversations.txt`` and attach the referenced line objects.

    Args:
        fileName (str): path of the conversations file to load
        lines (dict of dict): output of loadLines(), keyed by lineID
        fields (list of str): names for the positional fields of each record
            (must contain 'utteranceIDs')
    Return:
        list of dict: one dict per conversation; its "lines" key holds the
            full line objects in utterance order
    """
    import ast  # local import: only this function needs it

    conversations = []

    with open(fileName, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")

            # Extract fields
            convObj = {}
            for i, field in enumerate(fields):
                convObj[field] = values[i]

            # "utteranceIDs" is a string such as "['L598485', 'L598486', ...]".
            # ast.literal_eval parses it safely; the original eval() would
            # execute arbitrary code embedded in the data file.
            lineIds = ast.literal_eval(convObj["utteranceIDs"])

            # Reassemble: replace the ID list with the actual line objects.
            convObj["lines"] = [lines[lineId] for lineId in lineIds]

            conversations.append(convObj)

    return conversations

def extractSentencePairs(conversations):
    """
    Extract (input, target) sentence pairs from the conversations.

    Each consecutive pair of utterances becomes one sample; the last line of
    a conversation has no answer and is therefore never used as an input.

    Args:
        conversations (list of dict): output of loadConversations()
    Return:
        list of [str, str]: [inputLine, targetLine] pairs, both stripped
    """
    qa_pairs = []
    for conversation in conversations:
        # Iterate over all the lines of the conversation; stop one before the
        # end because the final line has no answer.
        for i in range(len(conversation["lines"]) - 1):
            inputLine = conversation["lines"][i]["text"].strip()
            targetLine = conversation["lines"][i + 1]["text"].strip()

            # Filter wrong samples (if one of the lines is empty)
            if inputLine and targetLine:
                qa_pairs.append([inputLine, targetLine])

    return qa_pairs

def main():
    """
    Parses the Cornell Movie Dialog Corpus, and extracts conversations from it.

    Reads ``movie_lines.txt`` and ``movie_conversations.txt`` from the input
    directory and writes one delimited [input, target] pair per row to the
    output file.
    """
    # Parse command line args
    parser = argparse.ArgumentParser(description='Extract conversations from Cornell movie dialog corpus')

    parser.add_argument('-i', '--input', required=True,
                        help='Path to input dir')
    # NOTE: not required — the original declared required=True AND a default,
    # which made the documented default '\t' unreachable.
    parser.add_argument('-d', '--delimiter', default='\t',
                        help='Column delimiter between output columns (default: tab)')
    parser.add_argument('-o', '--output', required=True, help='Path to output file')

    args = parser.parse_args()
    # Unescape the delimiter (so a literal "\t" on the command line works)
    args.delimiter = codecs.decode(args.delimiter, "unicode_escape")

    # Convert args to dict
    vargs = vars(args)

    print("\nArguments:")
    for arg in vargs:
        print("{}={}".format(arg, getattr(args, arg)))

    MOVIE_LINES_FIELDS = ["lineID", "characterID", "movieID", "character", "text"]
    MOVIE_CONVERSATIONS_FIELDS = ["character1ID", "character2ID", "movieID", "utteranceIDs"]

    print("\nProcessing corpus...")
    lines = loadLines(os.path.join(args.input, "movie_lines.txt"), MOVIE_LINES_FIELDS)
    print("\nLoading conversations...")
    conversations = loadConversations(os.path.join(args.input, "movie_conversations.txt"),
                                      lines, MOVIE_CONVERSATIONS_FIELDS)

    # newline='' is required by the csv module, otherwise rows come out as
    # "\r\r\n" on Windows.
    with open(args.output, 'w', encoding='iso-8859-1', newline='') as outputfile:
        writer = csv.writer(outputfile, delimiter=args.delimiter)

        for pair in extractSentencePairs(conversations):
            writer.writerow(pair)

    print("\nDone. Bye!")

if __name__ == '__main__':
    main()