├── README.md ├── LICENSE └── cornelldata.py /README.md: -------------------------------------------------------------------------------- 1 | # textutil-preprocess-cornell-movie-corpus 2 | textutil-preprocess-cornell-movie-corpus 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 floydhub 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /cornelldata.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function, unicode_literals 2 | import argparse 3 | import codecs 4 | import csv 5 | import os 6 | 7 | """ 8 | Load the cornell movie dialog corpus. 
"""Parse the Cornell Movie-Dialogs Corpus into question/answer sentence pairs.

Corpus available from:
    http://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html
"""


def loadLines(fileName, fields):
    """
    Load ``movie_lines.txt`` and split every record into named fields.

    Args:
        fileName (str): path of the movie-lines file to load
        fields (list of str): names for the positional fields of each record
            (must contain 'lineID')
    Return:
        dict of dict: one field-dict per line, keyed by its 'lineID'
    """
    lines = {}

    # The corpus ships as iso-8859-1, not utf-8.
    with open(fileName, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")

            # Pair each positional value with its field name.
            lineObj = {}
            for i, field in enumerate(fields):
                lineObj[field] = values[i]

            lines[lineObj['lineID']] = lineObj

    return lines

def loadConversations(fileName, lines, fields):
    """
    Load ``movie_conversations.txt`` and attach the referenced line objects.

    Args:
        fileName (str): path of the conversations file to load
        lines (dict of dict): output of loadLines(), keyed by lineID
        fields (list of str): names for the positional fields of each record
            (must contain 'utteranceIDs')
    Return:
        list of dict: one dict per conversation; its "lines" key holds the
            full line objects in utterance order
    """
    import ast  # local import: only this function needs it

    conversations = []

    with open(fileName, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")

            # Extract fields
            convObj = {}
            for i, field in enumerate(fields):
                convObj[field] = values[i]

            # "utteranceIDs" is a string such as "['L598485', 'L598486', ...]".
            # ast.literal_eval parses it safely; the original eval() would
            # execute arbitrary code embedded in the data file.
            lineIds = ast.literal_eval(convObj["utteranceIDs"])

            # Reassemble: replace the ID list with the actual line objects.
            convObj["lines"] = [lines[lineId] for lineId in lineIds]

            conversations.append(convObj)

    return conversations

def extractSentencePairs(conversations):
    """
    Extract (input, target) sentence pairs from the conversations.

    Each consecutive pair of utterances becomes one sample; the last line of
    a conversation has no answer and is therefore never used as an input.

    Args:
        conversations (list of dict): output of loadConversations()
    Return:
        list of [str, str]: [inputLine, targetLine] pairs, both stripped
    """
    qa_pairs = []
    for conversation in conversations:
        # Iterate over all the lines of the conversation; stop one before the
        # end because the final line has no answer.
        for i in range(len(conversation["lines"]) - 1):
            inputLine = conversation["lines"][i]["text"].strip()
            targetLine = conversation["lines"][i + 1]["text"].strip()

            # Filter wrong samples (if one of the lines is empty)
            if inputLine and targetLine:
                qa_pairs.append([inputLine, targetLine])

    return qa_pairs

def main():
    """
    Parses the Cornell Movie Dialog Corpus, and extracts conversations from it.

    Reads ``movie_lines.txt`` and ``movie_conversations.txt`` from the input
    directory and writes one delimited [input, target] pair per row to the
    output file.
    """
    # Parse command line args
    parser = argparse.ArgumentParser(description='Extract conversations from Cornell movie dialog corpus')

    parser.add_argument('-i', '--input', required=True,
                        help='Path to input dir')
    # NOTE: not required — the original declared required=True AND a default,
    # which made the documented default '\t' unreachable.
    parser.add_argument('-d', '--delimiter', default='\t',
                        help='Column delimiter between output columns (default: tab)')
    parser.add_argument('-o', '--output', required=True, help='Path to output file')

    args = parser.parse_args()
    # Unescape the delimiter (so a literal "\t" on the command line works)
    args.delimiter = codecs.decode(args.delimiter, "unicode_escape")

    # Convert args to dict
    vargs = vars(args)

    print("\nArguments:")
    for arg in vargs:
        print("{}={}".format(arg, getattr(args, arg)))

    MOVIE_LINES_FIELDS = ["lineID", "characterID", "movieID", "character", "text"]
    MOVIE_CONVERSATIONS_FIELDS = ["character1ID", "character2ID", "movieID", "utteranceIDs"]

    print("\nProcessing corpus...")
    lines = loadLines(os.path.join(args.input, "movie_lines.txt"), MOVIE_LINES_FIELDS)
    print("\nLoading conversations...")
    conversations = loadConversations(os.path.join(args.input, "movie_conversations.txt"),
                                      lines, MOVIE_CONVERSATIONS_FIELDS)

    # newline='' is required by the csv module, otherwise rows come out as
    # "\r\r\n" on Windows.
    with open(args.output, 'w', encoding='iso-8859-1', newline='') as outputfile:
        writer = csv.writer(outputfile, delimiter=args.delimiter)

        for pair in extractSentencePairs(conversations):
            writer.writerow(pair)

    print("\nDone. Bye!")

if __name__ == '__main__':
    main()