├── .DS_Store ├── .gitignore ├── __init__.py ├── apis ├── __init__.py ├── concept_net_client.py └── text.py ├── emotext.db ├── models ├── __init__.py └── models.py ├── readme.md ├── requirements.txt └── utils ├── __init__.py └── utils.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TimDaub/emotext/fb77a3e1188f6d60cb921dbe48e4a0b55add08b0/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | 45 | # Translations 46 | *.mo 47 | *.pot 48 | 49 | # Django stuff: 50 | *.log 51 | 52 | # Sphinx documentation 53 | docs/_build/ 54 | 55 | # PyBuilder 56 | target/ 57 | 58 | config.cfg -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TimDaub/emotext/fb77a3e1188f6d60cb921dbe48e4a0b55add08b0/__init__.py -------------------------------------------------------------------------------- /apis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TimDaub/emotext/fb77a3e1188f6d60cb921dbe48e4a0b55add08b0/apis/__init__.py -------------------------------------------------------------------------------- /apis/concept_net_client.py: -------------------------------------------------------------------------------- 1 | """ 2 | `rest_client.py`_ is a simple client for interacting with ConceptNet 5's REST 3 | API. 4 | 5 | .. _`rest_client.py`: https://github.com/commonsense/conceptnet/blob/master/conceptnet/webapi/rest_client.py 6 | 7 | This client is not object-oriented. The data structures you work with are 8 | dictionaries, of the form described in the API documentation. The main function 9 | :func:`lookup` can be used to look up many different kinds of data. There are 10 | also convenience functions for performing common operations on this data. 11 | 12 | If you want to know what fields are contained in these dictionaries, read 13 | the REST API documentation at 14 | http://csc.media.mit.edu/docs/conceptnet/webapi.html#rest-requests . 
15 | 16 | This wrapper has been portet to ConceptNet5 by Tim Daubenschuetz 17 | """ 18 | import sys 19 | import os.path 20 | 21 | import urllib, urllib2 22 | from requests_futures.sessions import FuturesSession 23 | from ..utils.utils import get_config 24 | 25 | try: 26 | import json 27 | except: 28 | import simplejson as json 29 | 30 | CLIENT_VERSION = '1' 31 | 32 | # Emotext specific parameters, used to enable the library to be run locally 33 | SERVER_URL = get_config('conceptnet5_parameters', 'SERVER_URL') 34 | API_URL = get_config('conceptnet5_parameters', 'API_URL') 35 | CONCEPT_NET_VERSION = get_config('conceptnet5_parameters', 'VERSION') 36 | REQ_LIMIT = get_config('conceptnet5_parameters', 'REQ_LIMIT') 37 | 38 | TYPES = { 39 | 'assertion': 'a', 40 | 'concept': 'c', 41 | 'datasets': 'd', 42 | 'edge': 'e', 43 | 'license': 'l', 44 | 'language_indenpendent_relation': 'r', 45 | 'knowledge_sources': 's', 46 | 'and': 'and', 47 | 'or': 'or' 48 | } 49 | 50 | def lookup(type, language, key): 51 | """ 52 | Get an object of a certain *type*, specified by the code for what 53 | *language* it is in and its *key*. The types currently supported are: 54 | 55 | `assertion` 56 | assertions 57 | `concept` 58 | concepts (words, disambiguated words, and phrases, in a particular language) 59 | 'datasets' 60 | datasets (large sources of knowledge that can be downloaded as a unit) 61 | 'edge' 62 | unique, arbitrary IDs for edges. Edges that assert the same thing combine to form assertions. 63 | 'license' 64 | license terms for redistributing the information in an edge. The two licenses in ConceptNet are /l/CC/By for Creative Commons Attribution, and /l/CC/By-SA for the more restrictive Attribution-ShareAlike license. See Copying and sharing ConceptNet. 65 | 'language_indenpendent_relation' 66 | language-independent relations, such as /r/IsA 67 | 'knowledge_sources' 68 | knowledge sources, which can be human contributors, Web sites, or automated processes 69 | 'and' 70 | conjunctions and disjunctions of sources 71 | 'or' 72 | conjunctions and disjunctions of sources 73 | 74 | The object will be returned as a dictionary, or in the case of features, 75 | a list. 76 | """ 77 | if type == None: 78 | raise Exception('Type must be specified to request the web api.') 79 | if len(type) > 1: 80 | type = from_name_to_type(type) 81 | return _get_json(type, language, key.lower()) 82 | 83 | def from_name_to_type(type='concept'): 84 | try: 85 | return TYPES[type] 86 | except: 87 | print 'The Type ' + type + 'could not have been found.' 88 | return None 89 | 90 | def lookup_concept_raw(language, concept_name): 91 | """ 92 | Look up a Concept by its language and its raw name. For example, 93 | `lookup_concept_raw('en', 'webbed feet')` will get no results, but 94 | `lookup_concept_raw('en', 'web foot')` will. 95 | 96 | Use :func:`lookup_concept_from_surface` to look up a concept from an 97 | existing surface text, such as "webbed feet". 98 | 99 | Use :func:`lookup_concept_from_nl` to look up a concept from any natural 100 | language text. This requires the `simplenlp` module. 101 | """ 102 | return lookup('concept', language, concept_name) 103 | 104 | def lookup_concept_from_surface(language, surface_text): 105 | """ 106 | Look up a concept, given a surface form of that concept that someone has 107 | entered into Open Mind. For example, 108 | `lookup_concept_from_surface('en', 'webbed feet')` will return the concept 109 | 'web foot'. 
110 | """ 111 | surface = lookup('surface', language, surface_text) 112 | return surface['concept'] 113 | 114 | def lookup_concept_from_nl(language, text): 115 | """ 116 | Look up a concept using any natural language text that represents it. 117 | This function requires the :mod:`simplenlp` module 118 | to normalize natural language text into a raw concept name. 119 | """ 120 | import simplenlp 121 | nltools = simplenlp.get('en') 122 | 123 | normalized = nltools.normalize(text) 124 | return lookup_concept_raw(language, normalized) 125 | 126 | def assertions_for_concept(concept, direction='all', limit=20): 127 | """ 128 | Given a dictionary representing a concept, look up the assertions it 129 | appears in. 130 | 131 | By default, this returns all matching assertions. By setting the 132 | optional argument `direction` to "forward" or "backward", you can restrict 133 | it to only assertions that have that concept on the left or the right 134 | respectively. 135 | 136 | You may set the limit on the number of results up to 100. The default is 137 | 20. This limit is applied before results are filtered for forward or 138 | backward assertions. 139 | """ 140 | def assertion_filter(assertion): 141 | if direction == 'all': return True 142 | elif direction == 'forward': 143 | return assertion['concept1']['text'] == concept['text'] 144 | elif direction == 'backward': 145 | return assertion['concept2']['text'] == concept['text'] 146 | else: 147 | raise ValueError("Direction must be 'all', 'forward', or 'backward'") 148 | 149 | assertions = _refine_json(concept, 'assertions', 'limit:%d' % limit) 150 | return [a for a in assertions if assertion_filter(a)] 151 | 152 | def surface_forms_for_concept(concept, limit=20): 153 | """ 154 | Given a dictionary representing a concept, get a list of its surface 155 | forms (also represented as dictionaries). 156 | 157 | You may set the limit on the number of results up to 100. The default is 158 | 20. 159 | """ 160 | return _refine_json(concept, 'surfaceforms', 'limit:%d' % limit) 161 | 162 | def votes_for(obj): 163 | """ 164 | Given a dictionary representing any object that can be voted on -- such as 165 | an assertion or raw_assertion -- get a list of its votes. 166 | """ 167 | return _refine_json(obj, 'votes') 168 | 169 | def similar_to_concepts(concepts, limit=20): 170 | """ 171 | `concepts` is a list of concept names or (concept name, weight) pairs. 172 | Given this, `similar_to_concepts` will find the `limit` most related 173 | concepts. 174 | 175 | These similar concepts are returned in dictionaries of the form: 176 | 177 | {'concept': concept, 'score': score} 178 | 179 | where `concept` is the data structure for a concept. 180 | """ 181 | pieces = [] 182 | for entry in concepts: 183 | if isinstance(entry, tuple): 184 | concept, weight = entry 185 | else: 186 | concept = entry 187 | weight = 1. 188 | if hasattr(concept, 'text'): 189 | concept = concept.text 190 | concept = concept.replace(' ', '_').encode('utf-8') 191 | pieces.append("%s@%s" % (concept, weight)) 192 | termlist = ','.join(pieces) 193 | limitstr = 'limit:%d' % limit 194 | return _get_json('en', 'similar_to', termlist, limitstr) 195 | 196 | def add_statement(language, frame_id, text1, text2, username, password): 197 | """ 198 | Add a statement to Open Mind, or vote for it if it is there. 199 | 200 | Requires the following parameters: 201 | 202 | language 203 | The language code, such as 'en'. 204 | frame_id 205 | The numeric ID of the sentence frame to use. 
206 | text1 207 | The text filling the first blank of the frame. 208 | text2 209 | The text filling the second blank of the frame. 210 | username 211 | Your Open Mind username. 212 | password 213 | Your Open Mind password. 214 | 215 | Example: 216 | >>> frame = lookup('frame', 'en', 7) 217 | >>> frame['text'] 218 | '{1} is for {2}' 219 | 220 | >>> add_statement('en', 7, 'election day', 'voting', 'rspeer', PASSWORD) 221 | (Result: rspeer adds the statement "election day is for voting", which 222 | is also returned as a raw_assertion.) 223 | """ 224 | return _post_json([language, 'frame', frame_id, 'statements'], { 225 | 'username': username, 226 | 'password': password, 227 | 'text1': text1, 228 | 'text2': text2 229 | }) 230 | 231 | 232 | def _get_json(*url_parts): 233 | """ 234 | This method has been updated and now uses ConceptNet5 syntax to access the web-API 235 | """ 236 | session = FuturesSession() 237 | url = API_URL + '/' + CONCEPT_NET_VERSION + '/' + '/'.join(urllib2.quote(p.encode('utf-8')) for p in url_parts) + '?limit=' + REQ_LIMIT 238 | # print 'Looking up: ' + url 239 | #return session.get(url) 240 | return json.loads(_get_url(url)) 241 | 242 | def _extend_url(old_url, *url_parts): 243 | url = old_url + '/'.join(urllib2.quote(str(p)) for p in url_parts) + '/' 244 | return json.loads(_get_url(url)) 245 | 246 | def _get_url(url): 247 | conn = urllib2.urlopen(url) 248 | return conn.read() 249 | 250 | def _refine_json(old_obj, *parts): 251 | return _extend_url(SERVER_URL + old_obj['resource_uri'], *parts) 252 | -------------------------------------------------------------------------------- /apis/text.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module uses a ConceptNet5 REST-API Wrapper to connect to the network. 3 | 4 | Given an arbitrary text (that has been stemmed and normalized), 5 | it analyzes every token in order to create a vector that represents 6 | the texts emotions. 7 | 8 | This is done by algorithms searching the graph structure of concept net for 9 | connections between a specific token and the entity 'Emotion'. 10 | """ 11 | import sys 12 | import os.path 13 | import re 14 | 15 | from sets import Set 16 | from math import pow 17 | 18 | from ..utils.utils import get_config 19 | 20 | from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters 21 | from nltk.tokenize import RegexpTokenizer 22 | from nltk.stem.snowball import SnowballStemmer 23 | from nltk.corpus import stopwords 24 | 25 | LANG_TO_CODE = { 26 | 'english': 'en', 27 | 'german': 'de', 28 | 'french': 'fr' 29 | } 30 | 31 | MAX_DEPTH = get_config('graph_search', 'MAX_DEPTH', 'getint') 32 | MIN_WEIGHT = get_config('graph_search', 'MIN_WEIGHT', 'getint') 33 | 34 | EMOTIONS = set(get_config('graph_search', 'EMOTIONS', 'getlist')) 35 | 36 | def lang_name_to_code(lang_name='english'): 37 | """ 38 | ConceptNet uses language codes to query words. 39 | Since we don't want to use those, we've integrated this method 40 | that allows conversion from language names to language codes. 41 | 42 | If a language code is missing, an exception will be thrown and the users 43 | will be notified. 44 | 45 | They can furthermore easily adjust the LANG_TO_CODE constant, to add his own language. 46 | """ 47 | try: 48 | return LANG_TO_CODE[lang_name] 49 | except: 50 | print 'Unfortunately, no lang_code is present for this language.' 
51 | print 'This may be adjusted in apis/emotext.py: LANG_TO_CODE' 52 | return None 53 | 54 | def text_processing(text, remove_punctuation=True, stemming=True, remove_stopwords=True, language='english'): 55 | """ 56 | This function enables general text processing. 57 | It features: 58 | * Tokenization on sentence level 59 | * Tokenization on word level 60 | * Punctuation removal 61 | * Stemming (and stopword removal) 62 | * Conversion to lower case 63 | The language parameter is only required, if stemming and removal of stopwords are desired. 64 | """ 65 | 66 | # Texts often contain punctuation characters. 67 | # While we'd like to remove them from our data set, their information shouldn't be lost, as 68 | # it would enable us to handle negation in text later on. 69 | # 70 | # An example: 71 | # Given the sentence: 'The movie was not bad.', we could convert all 72 | # adjectives in the sentence to antonyms and remove all negations. 73 | # Afterwards, the sentence would read 'The movie was good', where 'good' 74 | # is the antonym of 'bad'. 75 | # 76 | # Therefore, punctuation information should not be lost throughout the process of 77 | # processing the text with NLP. 78 | sentence_tokenizer = PunktSentenceTokenizer(PunktParameters()) 79 | # tokenize always returns a list of strings divided by punctuation characters 80 | # 81 | # 'hello' => [u'hello'] 82 | # 'Hello this is doge. world.' => [u'Hello this is doge.', u'world.'] 83 | # 84 | # Therefore, we need to continue handling a list, namely the sentences variable 85 | sentences = sentence_tokenizer.tokenize(text) 86 | 87 | # In the English language at least, 88 | # there are certain stop words, that introduce low-level negation 89 | # on a sentence bases. 90 | # However, these stop words are often melted with their previous verb 91 | # 92 | # isn't = is not 93 | # wouldn't = would not 94 | # 95 | # This must resolved, as it would not be possible for further functionality of this function to continue 96 | # extracting information. 97 | # Especially the 'anonymity' functionality wouldn't work without this 98 | if language == 'english': 99 | sw_pattern = r"(n't)" 100 | sentences = [re.sub(sw_pattern, ' not', s) for s in sentences] 101 | 102 | # If desired, the user can no go ahead and remove punctuation from all sentences 103 | if remove_punctuation: 104 | # This tokenizer simply removes every character or word which 105 | # length is < 2 and is not a alphabetic one 106 | punct_rm_tokenizer = RegexpTokenizer(r'\w{2,}') 107 | # In this case, tokenize will return a list of every word in the sentence 108 | # 109 | # [u'hello'] => [[u'hello']] 110 | # [u'hello', u'this is another sentence'] => [[u'hello'], [u'this', u'is', u'another', u'sentence']] 111 | # 112 | # Therefore, in the next step we need to handle a list of lists 113 | sentences = [punct_rm_tokenizer.tokenize(s) for s in sentences] 114 | 115 | if remove_stopwords: 116 | try: 117 | sentences = [[w for w in sentence if not w in stopwords.words(language)] \ 118 | for sentence in sentences] 119 | except: 120 | print 'There are no stopwords available in this language = ' + language 121 | 122 | # Next, we want to stem on a words basis 123 | # What this does for example is convert every word into lowercase, remove morphological 124 | # meanings, and so on. 125 | if stemming: 126 | # If desired, stopwords such as 'i', 'me', 'my', 'myself', 'we' can be removed 127 | # from the text. 
128 | stemmer = SnowballStemmer(language) 129 | sentences = [[stemmer.stem(w) for w in sentence] for sentence in sentences] 130 | else: 131 | # If stemming is not desired, all words are at least converted into lower case 132 | sentences = [[w.lower() for w in sentence] for sentence in sentences] 133 | 134 | return sentences 135 | 136 | def build_graph(token_queue, used_names, emo_vector, depth): 137 | """ 138 | Emotional features are extracted using ConceptNet5. 139 | 140 | We use the provided RESTful interface for lookups. 141 | This function is basically a breadth-first graph search. 142 | Eventually, it returns a emotion-expressing vector for 143 | every token it gets passed. 144 | """ 145 | 146 | # Overview: 147 | # 148 | # Essentially, ConceptNet5 lets us lookup nearly every concept known to man-kind. 149 | # A lookup is done using a GET request using the concepts name. 150 | # As an example, looking up rollercoaster would be as easy as requesting the following link: 151 | # 152 | # http://conceptnet5.media.mit.edu/data/5.3/c/en/rollercoaster 153 | # 154 | # Every concept has only two properties: 155 | # - numFound: an integer expressing the number of related edges found; and 156 | # - edges: concepts that are somehow connected to the original concept. 157 | # 158 | # Since this is basically a undirected graph structure, we can traverse it easily by 159 | # continuously looking up the edges of a concept. 160 | # 161 | # Algorithm: 162 | # 163 | # build_graph takes a: 164 | # 165 | # - token_queue: set of tokens (normal words) (default: ["a", "list", "of", "words"]) 166 | # - used_names: a list of names that have been previously looked up 167 | # - emo_vector: a key-value object with emotions as keys and absolute or percentual metrics as values 168 | # - depth: an integer representing the graph search's depth 169 | # 170 | # 171 | # 172 | # Cancellation condition: 173 | # 174 | # if MAX_DEPTH is reached, percentages (calc_percentages) are calculated from the absolute values 175 | # returned by calc_nodes_weight. 176 | # Subsequently, the function returns, hence execution is done. 177 | if depth >= MAX_DEPTH: 178 | emo_vector['emotions'] = calc_percentages(emo_vector['emotions']) 179 | return emo_vector 180 | 181 | # Graph search part: 182 | # 183 | # Since we're actively working on token_queue inside of a for-loop (adding and removing elements) 184 | # making a copy that is not enumerated on is necessary. 185 | # Here, we make use of a Set as one of its qualities is that it allows no duplicates. 186 | # We don't want to lookup the same word twice. Lookups are just too time and CPU consuming. 187 | token_queue_copy = Set(token_queue) 188 | 189 | # We traverse through every token in the set 190 | # if the token's name does not resemble to one of the searched-for 191 | # emotion's name, then we proceed diving further down the graph until MAX_DEPTH is reached. 
192 | for token in token_queue: 193 | 194 | # if the token's name resembles 195 | if token.name in EMOTIONS: 196 | try: 197 | emo_vector['emotions'][token.name] = emo_vector['emotions'][token.name] + calc_nodes_weight(token, token.name, [], 0) 198 | except: 199 | emo_vector['emotions'][token.name] = calc_nodes_weight(token, token.name, [], 0) 200 | else: 201 | token_queue_copy.remove(token) 202 | try: 203 | token.edge_lookup(used_names, 'en') 204 | except Exception as e: 205 | print e 206 | continue 207 | for new_edge in token.edges: 208 | if new_edge.name not in used_names and new_edge.weight > MIN_WEIGHT: 209 | used_names.add(new_edge.name) 210 | token_queue_copy.add(new_edge) 211 | return build_graph(token_queue_copy, used_names, emo_vector, depth+1) 212 | 213 | def calc_percentages(emotions): 214 | sum_values = sum(emotions.values()) 215 | return {k: v/sum_values for k, v in emotions.items() if v != 0} 216 | 217 | def calc_nodes_weight(node, emotion, weights, weight_num): 218 | print node.name + ': %d' % node.weight 219 | if node.parent == None: 220 | for i, n in enumerate(weights): 221 | weight_num = weight_num + n/(i+1 * pow(len(weights), 2)) 222 | print '###########################' 223 | return weight_num 224 | else: 225 | weights.append(node.weight) 226 | return calc_nodes_weight(node.parent, emotion, weights, weight_num) -------------------------------------------------------------------------------- /emotext.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TimDaub/emotext/fb77a3e1188f6d60cb921dbe48e4a0b55add08b0/emotext.db -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TimDaub/emotext/fb77a3e1188f6d60cb921dbe48e4a0b55add08b0/models/__init__.py -------------------------------------------------------------------------------- /models/models.py: -------------------------------------------------------------------------------- 1 | import json 2 | import shelve 3 | from ..apis.concept_net_client import lookup 4 | from ..apis.text import build_graph 5 | from ..apis.text import lang_name_to_code 6 | from ..utils.utils import extr_from_concept_net_edge 7 | from ..apis.text import text_processing 8 | from datetime import datetime 9 | from sets import Set 10 | from threading import Thread 11 | from ..utils.utils import get_config 12 | from collections import Counter 13 | 14 | MAX_DEPTH = get_config('graph_search', 'MAX_DEPTH', 'getint') 15 | MIN_WEIGHT = get_config('graph_search', 'MIN_WEIGHT', 'getint') 16 | REQ_LIMIT = get_config('conceptnet5_parameters', 'REQ_LIMIT', 'getint') 17 | 18 | class Conversation(Thread): 19 | """ 20 | A conversation represents a real-world conversation and is essentially 21 | a collection of single messages. 22 | """ 23 | def __init__(self, messages): 24 | Thread.__init__(self) 25 | self.messages = messages 26 | 27 | def run(self): 28 | self.emotions = self.conv_to_emotion_vectors() 29 | self.emotions = self.word_interpolation(self.emotions[0].text) 30 | 31 | def word_interpolation(self, words): 32 | """ 33 | Interpolates a list of words. 34 | List must be structurally identical to self.emotions. 35 | """ 36 | # In this word-based interpolation, we simply iterate (enumerate, as we need the index) over our 37 | # list and calculate the average of the previous and the next element. 
38 | # 39 | # A word is a dictionary with a name and a list of emotions. 40 | # 41 | # For the sake of simplicity, we interpolate the first element with the last, 42 | # and the last with the first. 43 | interpolated_words = list() 44 | 45 | for i, w in enumerate(words): 46 | prev_w = words[i-1] 47 | if i == len(words)-1: 48 | next_w = words[0] 49 | else: 50 | next_w = words[i+1] 51 | if prev_w is not None and next_w is not None: 52 | interpolated_w = self.interpolate_e_vector(prev_w, w, next_w) 53 | interpolated_words.append(interpolated_w) 54 | return interpolated_words 55 | 56 | def interpolate_e_vector(self, left, middle, right): 57 | """ 58 | Interpolates a dictionary emotions-vector with an arbitrary number and 59 | form of emotions. 60 | """ 61 | 62 | # An emotions-vector can have any emotion's name as a key. 63 | # If a key exists in for example only two of the passed vectors, we treat it as 0. 64 | emotions = Counter(left['emotions'].keys() + middle['emotions'].keys() + right['emotions'].keys()).keys() 65 | for e in emotions: 66 | 67 | if e in left['emotions']: 68 | left_e = left['emotions'][e] 69 | else: 70 | left_e = 0 71 | 72 | if e in middle['emotions']: 73 | middle_e = middle['emotions'][e] 74 | else: 75 | middle_e = 0 76 | 77 | if e in right['emotions']: 78 | right_e = right['emotions'][e] 79 | else: 80 | right_e = 0 81 | 82 | middle['emotions'][e] = (left_e + middle_e + right_e)/3 83 | return middle 84 | 85 | def conv_to_emotion_vectors(self): 86 | """ 87 | Converts a whole conversation and its messages to emotions. 88 | """ 89 | messages = list(self.messages) 90 | return [m.to_emotion_vector() for m in messages] 91 | 92 | def __repr__(self): 93 | return str(self.__dict__) 94 | 95 | class CacheController(): 96 | """ 97 | Extracting emotions from text through conceptnet5 can be a very time consuming task, 98 | especially when processing large quantities of text. 99 | 100 | The CacheController class therefore can be used to save word-based results persistently. 101 | """ 102 | 103 | # This class should simply act as a key-value storage cache that can be asked before a word is being processed. 104 | # If the word is not included in its cache, the word must be processed by traversing conceptnet5's 105 | # graph structure, else we can just use the already given result. 106 | # 107 | # Since different parameters (which can be found in config.cfg) alter the results immensely, 108 | # CacheController must be initialized with all those parameters. 109 | # Also, it is very likely that parameters will increase in later versions, hence naming function parameters 110 | # might be a good idea for everyone reusing this class. 111 | 112 | def __init__(self, max_depth, min_weight, req_limit): 113 | self.max_depth = max_depth 114 | self.min_weight = min_weight 115 | self.req_limit = req_limit 116 | 117 | # for every form those parameters can take, a new .db file is created on the hard drive. 118 | self.cache = shelve.open('./word_cache_%d_%d_%d' % (self.max_depth, self.min_weight, self.req_limit)) 119 | 120 | def add_word(self, word, emotions): 121 | """ 122 | Adds an emotion dictionary. 123 | 124 | This method will overwrite everything of an already given key. 125 | """ 126 | word = word.encode("utf8") 127 | self.cache[word] = emotions 128 | 129 | def fetch_word(self, word): 130 | """ 131 | Fetches a word and returns None if a KeyValue exception is thrown. 
132 | """ 133 | try: 134 | word = word.encode("utf8") 135 | return self.cache[word] 136 | except: 137 | # in case a word is not found in the cache 138 | return None 139 | 140 | def __repr__(self): 141 | """ 142 | Simply returns a dictionary as representation of the object 143 | """ 144 | return str(self.__dict__) 145 | 146 | class Message(): 147 | """ 148 | Represents a message a user of Emotext sends to the cofra framework. 149 | """ 150 | def __init__(self, entity_name, text, date=None, language='english'): 151 | self.entity_name = entity_name 152 | self.text = text 153 | self.date = date if date is not None else datetime.today() 154 | self.language = language 155 | 156 | def __repr__(self): 157 | """ 158 | Simply returns a dictionary as representation of the object. 159 | """ 160 | return str(self.__dict__) 161 | 162 | def __setitem__(self, key, value): 163 | self.__dict__[key] = value 164 | 165 | def to_emotion_vector(self, cc=CacheController(max_depth=MAX_DEPTH, min_weight=MIN_WEIGHT, req_limit=REQ_LIMIT)): 166 | """ 167 | Converts a message to an emotions-vector. 168 | This method can be used in combination with a CacheController, which defaults to emotext's config settings. 169 | """ 170 | 171 | # A conversation consists of an arbitrary number of messages, which contain 172 | # an arbitrary number of tokens. 173 | # 174 | # Due to the fact that processing text to emotions is a tedious process, 175 | # we implemented a Cache Service to enable faster processing of already seen words 176 | 177 | # Process text via Message object method that uses tokenization, stemming, punctuation removal and so on... 178 | tokens = " ".join([" ".join([w for w in s]) \ 179 | for s in \ 180 | text_processing(self.text, stemming=False)]) \ 181 | .split() 182 | 183 | # We have to use enumerate here, as a for-each loop's loop variable 184 | # would not let us replace the tokens in place 185 | for i, t in enumerate(tokens): 186 | empty_vector = { 187 | 'name': t, 188 | 'emotions': {} 189 | } 190 | 191 | if cc is not None: 192 | # we try to use the cache to find the word's emotions 193 | pot_t_vector = cc.fetch_word(t) 194 | if pot_t_vector is not None: 195 | tokens[i] = pot_t_vector 196 | else: 197 | tokens[i] = build_graph(Set([Node(t, lang_name_to_code(self.language), 'c')]), Set([]), empty_vector, 0) 198 | cc.add_word(tokens[i]['name'], tokens[i]) 199 | else: 200 | tokens[i] = build_graph(Set([Node(t, lang_name_to_code(self.language), 'c')]), Set([]), empty_vector, 0) 201 | self.text = tokens 202 | return self 203 | 204 | class Node(): 205 | def __init__(self, name, lang_code='en', type='c', rel=None, weight=0, edges=None, parent=None): 206 | self.name = name 207 | self.lang_code = lang_code 208 | self.type = type 209 | self.edges = edges if edges is not None else [] 210 | self.rel = rel 211 | self.weight = weight 212 | self.parent = parent 213 | 214 | def __repr__(self): 215 | """ 216 | Simply returns a dictionary as representation of the object 217 | """ 218 | return str(self.__dict__) 219 | 220 | def edge_lookup(self, used_names, lang_code='en'): 221 | """ 222 | Uses ConceptNet's lookup function to search for all related 223 | nodes to this one. 224 | 225 | Subsequently parses all of those edges and returns nothing 226 | when the update was successful.
227 | """ 228 | # node must at least have a name to do a lookup 229 | # otherwise, an exception is raised 230 | if self.name is None: 231 | raise Exception("Cannot do edge_lookup without the node's name.") 232 | # lookup token via ConceptNet web-API 233 | req = lookup(self.type, self.lang_code, self.name) 234 | token_res = req 235 | # used_names is a list of objects, however, in order to perform lookups, 236 | # we need it to be a list of strings 237 | # if result has more than 0 edges continue 238 | if token_res is not None and token_res['numFound'] > 0: 239 | edges = [] 240 | # for every edge, try converting it to a Node object that 241 | # can be processed further 242 | for e in token_res['edges']: 243 | # extract basic information from the 'end' key of an edge 244 | # it contains the type, lang_code and name of the node 245 | basic_start = extr_from_concept_net_edge(e['start']) 246 | basic_end = extr_from_concept_net_edge(e['end']) 247 | # instantiate a Node object from this information and append it to a list of edges 248 | # print basic_start['name'] + ' --> ' + e['rel'] + ' --> ' + basic_end['name'] 249 | if basic_end['name'] != self.name: 250 | if basic_end['name'] not in used_names and basic_end['lang_code'] == lang_code: 251 | edges.append(Node(basic_end['name'], basic_end['lang_code'], basic_end['type'], e['rel'], e['weight'], [], self)) 252 | else: 253 | if basic_start['name'] not in used_names and basic_start['lang_code'] == lang_code: 254 | edges.append(Node(basic_start['name'], basic_start['lang_code'], basic_start['type'], e['rel'], e['weight'], [], self)) 255 | # if all edges have been processed, add them to the current object 256 | self.edges = edges 257 | else: 258 | # if no edges were found on the token, raise an exception 259 | raise Exception('Token has no connecting edges.') 260 | 261 | class NodeEncoder(json.JSONEncoder): 262 | """ 263 | Taken from: http://stackoverflow.com/a/1458716/1263876 264 | 265 | The Node object is a recursive data structure that can contain itself, 266 | as it holds all of its child Nodes. 267 | 268 | Therefore, this method needs to be defined when trying to serialize a Node object 269 | to json. 270 | """ 271 | def default(self, obj): 272 | if not isinstance(obj, Node): 273 | return super(NodeEncoder, self).default(obj) 274 | return obj.__dict__ 275 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Emotext 2 | Emotext is a framework that helps you extract, save and correlate emotions with contextual information. 3 | It uses MIT's conceptnet5, nltk and Python. 4 | 5 | To enable programming-language-independent usage, Emotext's interface is provided RESTfully. 6 | 7 | ## Automatic installation 8 | Emotext uses `pip` for dependency management. To install all required dependencies, you simply run: 9 | 10 | pip install -r requirements.txt 11 | 12 | Additionally, you'll need to download `nltk`'s [language-specific files](#downloading-nltks-language-specific-files). 13 | 14 | ## Manual installation 15 | Python 2.7 is required (the code base uses `print` statements, `urllib2` and `ConfigParser`), as well as `pip` for installing dependencies. 16 | The web server is hosted using `flask`. Tests are implemented against the RESTful interface; therefore, the `requests` library is required. 17 | 18 | pip install flask 19 | pip install requests 20 | 21 | Furthermore, `nltk` is used for natural language processing.
It can also be installed using `pip`: 22 | 23 | pip install nltk 24 | 25 | However, `nltk` still needs [language-specific files](#downloading-nltks-language-specific-files). 26 | 27 | ## Downloading nltk's language-specific files 28 | Enter Python's interactive interpreter by typing `python` in your terminal and run the following commands: 29 | 30 | >>> import nltk 31 | >>> nltk.download() 32 | 33 | We recommend downloading all packages. 34 | 35 | ## Setting up a local Conceptnet image 36 | As mentioned already, Emotext is able to extract emotions from text. This is done by looking up concepts in conceptnet5's graph database. 37 | Through path finding, Emotext searches an arbitrary number of levels of the graph for a connection between the entered word and an emotion. 38 | This process requires *a lot* of lookups, which is why we recommend hosting a local instance of conceptnet5 instead of using the web-API. 39 | 40 | A detailed installation tutorial on how to set up docker and conceptnet5 can be found [here](https://github.com/commonsense/conceptnet5/wiki/Docker). 41 | However, we will still go through the installation process here: 42 | 43 | 1. [Install docker (Mac OS X)](https://docs.docker.com/installation/mac/). Make sure that you're using a bash shell, otherwise the installation will probably fail at some point. 44 | 2. [Increase your virtual machine's HD](https://docs.docker.com/articles/b2d_volume_resize/) up to 100-150 GB of storage. 45 | 3. Pull conceptnet5-web from docker's repositories: `sudo docker run rspeer/conceptnet-web:5.3` 46 | 4. In your VirtualBox GUI, set up a port forward from port 80 of your virtual machine to port 80 of your real machine (NAT interface in the tab "Network") 47 | 5. Run the application with a port forward from 10053 to 80, like this: 48 | `docker run -it --net=host -p 80:10053 rspeer/conceptnet-web` 49 | 6. Now, run either `boot2docker ip` or `arp -an` to find your virtual machine's IP 50 | 7. Once you've found the right IP, conceptnet5's web interface should appear when you enter it in your browser 51 | 52 | ## Configuration 53 | For convenience, parameters concerning, for example, the emotion extraction process can be adjusted in the file `config.cfg`. 54 | After changing this file, the server must be restarted. 55 | 56 | If you want to connect to the docker container's shell, try: 57 | `sudo docker exec -i -t <container-id> bash`. 58 | 59 | ### Removing conceptnet5's request limiter 60 | By default, conceptnet5 limits requests to about 6000 in 60 minutes (https://github.com/commonsense/conceptnet5/search?utf8=%E2%9C%93&q=Limiter). 61 | To remove the limiter, open the docker container's bash (as described above) and `cd conceptnet5`. Install nano (`apt-get install nano`) and open the file with `nano api.py`. 62 | Inside this file, `Limiter` gets imported and an instance of it is assigned to `limiter`. Python decorators (`@limiter`) are then used to limit conceptnet5's requests. You have to remove all of them.
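## Usage sketch
The snippet below is a minimal, untested sketch of how the classes in `models/models.py` fit together. It assumes the repository root is importable as a package named `emotext` (the modules use relative imports), that a valid `config.cfg` is present (see Configuration), and that a ConceptNet5 instance is reachable under the configured URLs:

    # hypothetical import path; adjust it to wherever the package lives in your project
    from emotext.models.models import Message, Conversation

    msgs = [
        Message(entity_name='alice', text='I love rollercoasters, they are great fun.'),
        Message(entity_name='alice', text='Waiting in line makes me angry though.')
    ]

    # Conversation is a Thread: run() converts each message into per-word
    # emotion vectors via ConceptNet5 and interpolates the results.
    conv = Conversation(msgs)
    conv.start()
    conv.join()
    print conv.emotions

Each entry in `conv.emotions` is a dictionary with a `name` and an `emotions` mapping, as built by `build_graph` in `apis/text.py`.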
63 | 64 | 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | altgraph==0.10.2 2 | bdist-mpkg==0.5.0 3 | #bonjour-py==0.3 4 | colorama==0.3.3 5 | cov-core==1.15.0 6 | coverage==3.7.1 7 | execnet==1.3.0 8 | Flask==0.10.1 9 | futures==2.2.0 10 | itsdangerous==0.24 11 | Jinja2==2.7.3 12 | macholib==1.5.1 13 | MarkupSafe==0.23 14 | #matplotlib==1.3.1 15 | modulegraph==0.10.4 16 | nltk==3.0.1 17 | numpy==1.8.0rc1 18 | psycopg2==2.6 19 | py==1.4.26 20 | py2app==0.7.3 21 | #pyobjc-core==2.5.1 22 | #pyobjc-framework-Accounts==2.5.1 23 | #pyobjc-framework-AddressBook==2.5.1 24 | #pyobjc-framework-AppleScriptKit==2.5.1 25 | #pyobjc-framework-AppleScriptObjC==2.5.1 26 | #pyobjc-framework-Automator==2.5.1 27 | #pyobjc-framework-CFNetwork==2.5.1 28 | #pyobjc-framework-Cocoa==2.5.1 29 | #pyobjc-framework-Collaboration==2.5.1 30 | #pyobjc-framework-CoreData==2.5.1 31 | #pyobjc-framework-CoreLocation==2.5.1 32 | #pyobjc-framework-CoreText==2.5.1 33 | #pyobjc-framework-DictionaryServices==2.5.1 34 | #pyobjc-framework-EventKit==2.5.1 35 | #pyobjc-framework-ExceptionHandling==2.5.1 36 | #pyobjc-framework-FSEvents==2.5.1 37 | #pyobjc-framework-InputMethodKit==2.5.1 38 | #pyobjc-framework-InstallerPlugins==2.5.1 39 | #pyobjc-framework-InstantMessage==2.5.1 40 | #pyobjc-framework-LatentSemanticMapping==2.5.1 41 | #pyobjc-framework-LaunchServices==2.5.1 42 | #pyobjc-framework-Message==2.5.1 43 | #pyobjc-framework-OpenDirectory==2.5.1 44 | #pyobjc-framework-PreferencePanes==2.5.1 45 | #pyobjc-framework-PubSub==2.5.1 46 | #pyobjc-framework-QTKit==2.5.1 47 | #pyobjc-framework-Quartz==2.5.1 48 | #pyobjc-framework-ScreenSaver==2.5.1 49 | #pyobjc-framework-ScriptingBridge==2.5.1 50 | #pyobjc-framework-SearchKit==2.5.1 51 | #pyobjc-framework-ServiceManagement==2.5.1 52 | #pyobjc-framework-Social==2.5.1 53 | #pyobjc-framework-SyncServices==2.5.1 54 | #pyobjc-framework-SystemConfiguration==2.5.1 55 | #pyobjc-framework-WebKit==2.5.1 56 | pyOpenSSL==0.13.1 57 | pyparsing==2.0.1 58 | pytest==2.6.4 59 | pytest-cov==1.8.1 60 | pytest-xdist==1.11 61 | python-dateutil==1.5 62 | python-termstyle==0.1.10 63 | pytz==2013.7 64 | requests==2.5.1 65 | requests-futures==0.9.5 66 | scipy==0.13.0 67 | six==1.4.1 68 | sniffer==0.3.4 69 | Twisted==13.2.0 70 | #vboxapi==1.0 71 | virtualenv==12.0.7 72 | Werkzeug==0.10.1 73 | xattr==0.6.4 74 | zope.interface==4.1.1 75 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TimDaub/emotext/fb77a3e1188f6d60cb921dbe48e4a0b55add08b0/utils/__init__.py -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import ConfigParser 2 | import os 3 | 4 | def extr_from_concept_net_edge(s): 5 | """ 6 | ConceptNet returnes on lookup edges that are named in this fashion: 7 | 8 | 'c/en/autobahn' 9 | 10 | From this we can extract: 11 | - type 12 | - language-code 13 | - name of the node 14 | """ 15 | params_list = s.split('/') 16 | if len(params_list) < 3: 17 | raise Exception('The given string did not contain at least two slashes.') 18 | return { 19 | 'type': params_list[1], 20 | 'lang_code': params_list[2], 21 | 'name': params_list[3] 22 | } 23 | 24 | def 
get_config(section, key, method_name='get'): 25 | """ 26 | Reads the 'config.cfg' file in the root directory and allows 27 | selecting specific values from it, which are returned if found. 28 | """ 29 | config_parser = ConfigParser.ConfigParser() 30 | config_parser.readfp(open(os.path.dirname(os.path.abspath(__file__)) + r'/../config.cfg')) 31 | try: 32 | if method_name == 'getlist': 33 | # split the string on commas 34 | l = config_parser.get(section, key).split(',') 35 | return l 36 | else: 37 | return getattr(config_parser, method_name)(section, key) 38 | except: 39 | print 'The combination of section and key has not been found in the config.cfg file.' 40 | return None --------------------------------------------------------------------------------
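`get_config` expects a `config.cfg` file at the repository root; the file is listed in `.gitignore`, so it is not part of this listing. Below is a minimal sketch of what it could look like. The section and key names are taken from the `get_config` calls in the code; the values are purely illustrative assumptions (point the URLs at your local ConceptNet5 instance and choose your own depth, weight, request limit and emotion list):

    [conceptnet5_parameters]
    # base host and API root of the ConceptNet5 instance (local docker image or the public API)
    SERVER_URL = http://conceptnet5.media.mit.edu
    API_URL = http://conceptnet5.media.mit.edu/data
    VERSION = 5.3
    REQ_LIMIT = 50

    [graph_search]
    MAX_DEPTH = 3
    MIN_WEIGHT = 1
    # comma-separated list, read through the 'getlist' branch of get_config
    EMOTIONS = joy,anger,fear,sadness,surprise,disgust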