├── .DS_Store ├── .gitignore ├── __init__.py ├── apis ├── __init__.py ├── concept_net_client.py └── text.py ├── emotext.db ├── models ├── __init__.py └── models.py ├── readme.md ├── requirements.txt └── utils ├── __init__.py └── utils.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TimDaub/emotext/fb77a3e1188f6d60cb921dbe48e4a0b55add08b0/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | 45 | # Translations 46 | *.mo 47 | *.pot 48 | 49 | # Django stuff: 50 | *.log 51 | 52 | # Sphinx documentation 53 | docs/_build/ 54 | 55 | # PyBuilder 56 | target/ 57 | 58 | config.cfg -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TimDaub/emotext/fb77a3e1188f6d60cb921dbe48e4a0b55add08b0/__init__.py -------------------------------------------------------------------------------- /apis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TimDaub/emotext/fb77a3e1188f6d60cb921dbe48e4a0b55add08b0/apis/__init__.py -------------------------------------------------------------------------------- /apis/concept_net_client.py: -------------------------------------------------------------------------------- 1 | """ 2 | `rest_client.py`_ is a simple client for interacting with ConceptNet 5's REST 3 | API. 4 | 5 | .. _`rest_client.py`: https://github.com/commonsense/conceptnet/blob/master/conceptnet/webapi/rest_client.py 6 | 7 | This client is not object-oriented. The data structures you work with are 8 | dictionaries, of the form described in the API documentation. The main function 9 | :func:`lookup` can be used to look up many different kinds of data. There are 10 | also convenience functions for performing common operations on this data. 11 | 12 | If you want to know what fields are contained in these dictionaries, read 13 | the REST API documentation at 14 | http://csc.media.mit.edu/docs/conceptnet/webapi.html#rest-requests . 
15 | 16 | This wrapper has been portet to ConceptNet5 by Tim Daubenschuetz 17 | """ 18 | import sys 19 | import os.path 20 | 21 | import urllib, urllib2 22 | from requests_futures.sessions import FuturesSession 23 | from ..utils.utils import get_config 24 | 25 | try: 26 | import json 27 | except: 28 | import simplejson as json 29 | 30 | CLIENT_VERSION = '1' 31 | 32 | # Emotext specific parameters, used to enable the library to be run locally 33 | SERVER_URL = get_config('conceptnet5_parameters', 'SERVER_URL') 34 | API_URL = get_config('conceptnet5_parameters', 'API_URL') 35 | CONCEPT_NET_VERSION = get_config('conceptnet5_parameters', 'VERSION') 36 | REQ_LIMIT = get_config('conceptnet5_parameters', 'REQ_LIMIT') 37 | 38 | TYPES = { 39 | 'assertion': 'a', 40 | 'concept': 'c', 41 | 'datasets': 'd', 42 | 'edge': 'e', 43 | 'license': 'l', 44 | 'language_indenpendent_relation': 'r', 45 | 'knowledge_sources': 's', 46 | 'and': 'and', 47 | 'or': 'or' 48 | } 49 | 50 | def lookup(type, language, key): 51 | """ 52 | Get an object of a certain *type*, specified by the code for what 53 | *language* it is in and its *key*. The types currently supported are: 54 | 55 | `assertion` 56 | assertions 57 | `concept` 58 | concepts (words, disambiguated words, and phrases, in a particular language) 59 | 'datasets' 60 | datasets (large sources of knowledge that can be downloaded as a unit) 61 | 'edge' 62 | unique, arbitrary IDs for edges. Edges that assert the same thing combine to form assertions. 63 | 'license' 64 | license terms for redistributing the information in an edge. The two licenses in ConceptNet are /l/CC/By for Creative Commons Attribution, and /l/CC/By-SA for the more restrictive Attribution-ShareAlike license. See Copying and sharing ConceptNet. 65 | 'language_indenpendent_relation' 66 | language-independent relations, such as /r/IsA 67 | 'knowledge_sources' 68 | knowledge sources, which can be human contributors, Web sites, or automated processes 69 | 'and' 70 | conjunctions and disjunctions of sources 71 | 'or' 72 | conjunctions and disjunctions of sources 73 | 74 | The object will be returned as a dictionary, or in the case of features, 75 | a list. 76 | """ 77 | if type == None: 78 | raise Exception('Type must be specified to request the web api.') 79 | if len(type) > 1: 80 | type = from_name_to_type(type) 81 | return _get_json(type, language, key.lower()) 82 | 83 | def from_name_to_type(type='concept'): 84 | try: 85 | return TYPES[type] 86 | except: 87 | print 'The Type ' + type + 'could not have been found.' 88 | return None 89 | 90 | def lookup_concept_raw(language, concept_name): 91 | """ 92 | Look up a Concept by its language and its raw name. For example, 93 | `lookup_concept_raw('en', 'webbed feet')` will get no results, but 94 | `lookup_concept_raw('en', 'web foot')` will. 95 | 96 | Use :func:`lookup_concept_from_surface` to look up a concept from an 97 | existing surface text, such as "webbed feet". 98 | 99 | Use :func:`lookup_concept_from_nl` to look up a concept from any natural 100 | language text. This requires the `simplenlp` module. 101 | """ 102 | return lookup('concept', language, concept_name) 103 | 104 | def lookup_concept_from_surface(language, surface_text): 105 | """ 106 | Look up a concept, given a surface form of that concept that someone has 107 | entered into Open Mind. For example, 108 | `lookup_concept_from_surface('en', 'webbed feet')` will return the concept 109 | 'web foot'. 
110 | """ 111 | surface = lookup('surface', language, surface_text) 112 | return surface['concept'] 113 | 114 | def lookup_concept_from_nl(language, text): 115 | """ 116 | Look up a concept using any natural language text that represents it. 117 | This function requires the :mod:`simplenlp` module 118 | to normalize natural language text into a raw concept name. 119 | """ 120 | import simplenlp 121 | nltools = simplenlp.get('en') 122 | 123 | normalized = nltools.normalize(text) 124 | return lookup_concept_raw(language, normalized) 125 | 126 | def assertions_for_concept(concept, direction='all', limit=20): 127 | """ 128 | Given a dictionary representing a concept, look up the assertions it 129 | appears in. 130 | 131 | By default, this returns all matching assertions. By setting the 132 | optional argument `direction` to "forward" or "backward", you can restrict 133 | it to only assertions that have that concept on the left or the right 134 | respectively. 135 | 136 | You may set the limit on the number of results up to 100. The default is 137 | 20. This limit is applied before results are filtered for forward or 138 | backward assertions. 139 | """ 140 | def assertion_filter(assertion): 141 | if direction == 'all': return True 142 | elif direction == 'forward': 143 | return assertion['concept1']['text'] == concept['text'] 144 | elif direction == 'backward': 145 | return assertion['concept2']['text'] == concept['text'] 146 | else: 147 | raise ValueError("Direction must be 'all', 'forward', or 'backward'") 148 | 149 | assertions = _refine_json(concept, 'assertions', 'limit:%d' % limit) 150 | return [a for a in assertions if assertion_filter(a)] 151 | 152 | def surface_forms_for_concept(concept, limit=20): 153 | """ 154 | Given a dictionary representing a concept, get a list of its surface 155 | forms (also represented as dictionaries). 156 | 157 | You may set the limit on the number of results up to 100. The default is 158 | 20. 159 | """ 160 | return _refine_json(concept, 'surfaceforms', 'limit:%d' % limit) 161 | 162 | def votes_for(obj): 163 | """ 164 | Given a dictionary representing any object that can be voted on -- such as 165 | an assertion or raw_assertion -- get a list of its votes. 166 | """ 167 | return _refine_json(obj, 'votes') 168 | 169 | def similar_to_concepts(concepts, limit=20): 170 | """ 171 | `concepts` is a list of concept names or (concept name, weight) pairs. 172 | Given this, `similar_to_concepts` will find the `limit` most related 173 | concepts. 174 | 175 | These similar concepts are returned in dictionaries of the form: 176 | 177 | {'concept': concept, 'score': score} 178 | 179 | where `concept` is the data structure for a concept. 180 | """ 181 | pieces = [] 182 | for entry in concepts: 183 | if isinstance(entry, tuple): 184 | concept, weight = entry 185 | else: 186 | concept = entry 187 | weight = 1. 188 | if hasattr(concept, 'text'): 189 | concept = concept.text 190 | concept = concept.replace(' ', '_').encode('utf-8') 191 | pieces.append("%s@%s" % (concept, weight)) 192 | termlist = ','.join(pieces) 193 | limitstr = 'limit:%d' % limit 194 | return _get_json('en', 'similar_to', termlist, limitstr) 195 | 196 | def add_statement(language, frame_id, text1, text2, username, password): 197 | """ 198 | Add a statement to Open Mind, or vote for it if it is there. 199 | 200 | Requires the following parameters: 201 | 202 | language 203 | The language code, such as 'en'. 204 | frame_id 205 | The numeric ID of the sentence frame to use. 
206 | text1 207 | The text filling the first blank of the frame. 208 | text2 209 | The text filling the second blank of the frame. 210 | username 211 | Your Open Mind username. 212 | password 213 | Your Open Mind password. 214 | 215 | Example: 216 | >>> frame = lookup('frame', 'en', 7) 217 | >>> frame['text'] 218 | '{1} is for {2}' 219 | 220 | >>> add_statement('en', 7, 'election day', 'voting', 'rspeer', PASSWORD) 221 | (Result: rspeer adds the statement "election day is for voting", which 222 | is also returned as a raw_assertion.) 223 | """ 224 | return _post_json([language, 'frame', frame_id, 'statements'], { 225 | 'username': username, 226 | 'password': password, 227 | 'text1': text1, 228 | 'text2': text2 229 | }) 230 | 231 | 232 | def _get_json(*url_parts): 233 | """ 234 | This method has been updated and now uses ConceptNet5 syntax to access the web-API 235 | """ 236 | session = FuturesSession() 237 | url = API_URL + '/' + CONCEPT_NET_VERSION + '/' + '/'.join(urllib2.quote(p.encode('utf-8')) for p in url_parts) + '?limit=' + REQ_LIMIT 238 | # print 'Looking up: ' + url 239 | #return session.get(url) 240 | return json.loads(_get_url(url)) 241 | 242 | def _extend_url(old_url, *url_parts): 243 | url = old_url + '/'.join(urllib2.quote(str(p)) for p in url_parts) + '/' 244 | return json.loads(_get_url(url)) 245 | 246 | def _get_url(url): 247 | conn = urllib2.urlopen(url) 248 | return conn.read() 249 | 250 | def _refine_json(old_obj, *parts): 251 | return _extend_url(SERVER_URL + old_obj['resource_uri'], *parts) 252 | -------------------------------------------------------------------------------- /apis/text.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module uses a ConceptNet5 REST-API Wrapper to connect to the network. 3 | 4 | Given an arbitrary text (that has been stemmed and normalized), 5 | it analyzes every token in order to create a vector that represents 6 | the texts emotions. 7 | 8 | This is done by algorithms searching the graph structure of concept net for 9 | connections between a specific token and the entity 'Emotion'. 10 | """ 11 | import sys 12 | import os.path 13 | import re 14 | 15 | from sets import Set 16 | from math import pow 17 | 18 | from ..utils.utils import get_config 19 | 20 | from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters 21 | from nltk.tokenize import RegexpTokenizer 22 | from nltk.stem.snowball import SnowballStemmer 23 | from nltk.corpus import stopwords 24 | 25 | LANG_TO_CODE = { 26 | 'english': 'en', 27 | 'german': 'de', 28 | 'french': 'fr' 29 | } 30 | 31 | MAX_DEPTH = get_config('graph_search', 'MAX_DEPTH', 'getint') 32 | MIN_WEIGHT = get_config('graph_search', 'MIN_WEIGHT', 'getint') 33 | 34 | EMOTIONS = set(get_config('graph_search', 'EMOTIONS', 'getlist')) 35 | 36 | def lang_name_to_code(lang_name='english'): 37 | """ 38 | ConceptNet uses language codes to query words. 39 | Since we don't want to use those, we've integrated this method 40 | that allows conversion from language names to language codes. 41 | 42 | If a language code is missing, an exception will be thrown and the users 43 | will be notified. 44 | 45 | They can furthermore easily adjust the LANG_TO_CODE constant, to add his own language. 46 | """ 47 | try: 48 | return LANG_TO_CODE[lang_name] 49 | except: 50 | print 'Unfortunately, no lang_code is present for this language.' 
51 | print 'This may be adjusted in apis/emotext.py: LANG_TO_CODE' 52 | return None 53 | 54 | def text_processing(text, remove_punctuation=True, stemming=True, remove_stopwords=True, language='english'): 55 | """ 56 | This function enables general text processing. 57 | It features: 58 | * Tokenization on sentence level 59 | * Tokenization on word level 60 | * Punctuation removal 61 | * Stemming (and stopword removal) 62 | * Conversion to lower case 63 | The language parameter is only required, if stemming and removal of stopwords are desired. 64 | """ 65 | 66 | # Texts often contain punctuation characters. 67 | # While we'd like to remove them from our data set, their information shouldn't be lost, as 68 | # it would enable us to handle negation in text later on. 69 | # 70 | # An example: 71 | # Given the sentence: 'The movie was not bad.', we could convert all 72 | # adjectives in the sentence to antonyms and remove all negations. 73 | # Afterwards, the sentence would read 'The movie was good', where 'good' 74 | # is the antonym of 'bad'. 75 | # 76 | # Therefore, punctuation information should not be lost throughout the process of 77 | # processing the text with NLP. 78 | sentence_tokenizer = PunktSentenceTokenizer(PunktParameters()) 79 | # tokenize always returns a list of strings divided by punctuation characters 80 | # 81 | # 'hello' => [u'hello'] 82 | # 'Hello this is doge. world.' => [u'Hello this is doge.', u'world.'] 83 | # 84 | # Therefore, we need to continue handling a list, namely the sentences variable 85 | sentences = sentence_tokenizer.tokenize(text) 86 | 87 | # In the English language at least, 88 | # there are certain stop words, that introduce low-level negation 89 | # on a sentence bases. 90 | # However, these stop words are often melted with their previous verb 91 | # 92 | # isn't = is not 93 | # wouldn't = would not 94 | # 95 | # This must resolved, as it would not be possible for further functionality of this function to continue 96 | # extracting information. 97 | # Especially the 'anonymity' functionality wouldn't work without this 98 | if language == 'english': 99 | sw_pattern = r"(n't)" 100 | sentences = [re.sub(sw_pattern, ' not', s) for s in sentences] 101 | 102 | # If desired, the user can no go ahead and remove punctuation from all sentences 103 | if remove_punctuation: 104 | # This tokenizer simply removes every character or word which 105 | # length is < 2 and is not a alphabetic one 106 | punct_rm_tokenizer = RegexpTokenizer(r'\w{2,}') 107 | # In this case, tokenize will return a list of every word in the sentence 108 | # 109 | # [u'hello'] => [[u'hello']] 110 | # [u'hello', u'this is another sentence'] => [[u'hello'], [u'this', u'is', u'another', u'sentence']] 111 | # 112 | # Therefore, in the next step we need to handle a list of lists 113 | sentences = [punct_rm_tokenizer.tokenize(s) for s in sentences] 114 | 115 | if remove_stopwords: 116 | try: 117 | sentences = [[w for w in sentence if not w in stopwords.words(language)] \ 118 | for sentence in sentences] 119 | except: 120 | print 'There are no stopwords available in this language = ' + language 121 | 122 | # Next, we want to stem on a words basis 123 | # What this does for example is convert every word into lowercase, remove morphological 124 | # meanings, and so on. 125 | if stemming: 126 | # If desired, stopwords such as 'i', 'me', 'my', 'myself', 'we' can be removed 127 | # from the text. 
128 | stemmer = SnowballStemmer(language) 129 | sentences = [[stemmer.stem(w) for w in sentence] for sentence in sentences] 130 | else: 131 | # If stemming is not desired, all words are at least converted into lower case 132 | sentences = [[w.lower() for w in sentence] for sentence in sentences] 133 | 134 | return sentences 135 | 136 | def build_graph(token_queue, used_names, emo_vector, depth): 137 | """ 138 | Emotional features are extracted using ConceptNet5. 139 | 140 | We use the provided RESTful interface for lookups. 141 | This function is basically a breadth-first graph search. 142 | Eventually, it returns a emotion-expressing vector for 143 | every token it gets passed. 144 | """ 145 | 146 | # Overview: 147 | # 148 | # Essentially, ConceptNet5 lets us lookup nearly every concept known to man-kind. 149 | # A lookup is done using a GET request using the concepts name. 150 | # As an example, looking up rollercoaster would be as easy as requesting the following link: 151 | # 152 | # http://conceptnet5.media.mit.edu/data/5.3/c/en/rollercoaster 153 | # 154 | # Every concept has only two properties: 155 | # - numFound: an integer expressing the number of related edges found; and 156 | # - edges: concepts that are somehow connected to the original concept. 157 | # 158 | # Since this is basically a undirected graph structure, we can traverse it easily by 159 | # continuously looking up the edges of a concept. 160 | # 161 | # Algorithm: 162 | # 163 | # build_graph takes a: 164 | # 165 | # - token_queue: set of tokens (normal words) (default: ["a", "list", "of", "words"]) 166 | # - used_names: a list of names that have been previously looked up 167 | # - emo_vector: a key-value object with emotions as keys and absolute or percentual metrics as values 168 | # - depth: an integer representing the graph search's depth 169 | # 170 | # 171 | # 172 | # Cancellation condition: 173 | # 174 | # if MAX_DEPTH is reached, percentages (calc_percentages) are calculated from the absolute values 175 | # returned by calc_nodes_weight. 176 | # Subsequently, the function returns, hence execution is done. 177 | if depth >= MAX_DEPTH: 178 | emo_vector['emotions'] = calc_percentages(emo_vector['emotions']) 179 | return emo_vector 180 | 181 | # Graph search part: 182 | # 183 | # Since we're actively working on token_queue inside of a for-loop (adding and removing elements) 184 | # making a copy that is not enumerated on is necessary. 185 | # Here, we make use of a Set as one of its qualities is that it allows no duplicates. 186 | # We don't want to lookup the same word twice. Lookups are just too time and CPU consuming. 187 | token_queue_copy = Set(token_queue) 188 | 189 | # We traverse through every token in the set 190 | # if the token's name does not resemble to one of the searched-for 191 | # emotion's name, then we proceed diving further down the graph until MAX_DEPTH is reached. 
192 | for token in token_queue: 193 | 194 | # if the token's name resembles 195 | if token.name in EMOTIONS: 196 | try: 197 | emo_vector['emotions'][token.name] = emo_vector['emotions'][token.name] + calc_nodes_weight(token, token.name, [], 0) 198 | except: 199 | emo_vector['emotions'][token.name] = calc_nodes_weight(token, token.name, [], 0) 200 | else: 201 | token_queue_copy.remove(token) 202 | try: 203 | token.edge_lookup(used_names, 'en') 204 | except Exception as e: 205 | print e 206 | continue 207 | for new_edge in token.edges: 208 | if new_edge.name not in used_names and new_edge.weight > MIN_WEIGHT: 209 | used_names.add(new_edge.name) 210 | token_queue_copy.add(new_edge) 211 | return build_graph(token_queue_copy, used_names, emo_vector, depth+1) 212 | 213 | def calc_percentages(emotions): 214 | sum_values = sum(emotions.values()) 215 | return {k: v/sum_values for k, v in emotions.items() if v != 0} 216 | 217 | def calc_nodes_weight(node, emotion, weights, weight_num): 218 | print node.name + ': %d' % node.weight 219 | if node.parent == None: 220 | for i, n in enumerate(weights): 221 | weight_num = weight_num + n/(i+1 * pow(len(weights), 2)) 222 | print '###########################' 223 | return weight_num 224 | else: 225 | weights.append(node.weight) 226 | return calc_nodes_weight(node.parent, emotion, weights, weight_num) -------------------------------------------------------------------------------- /emotext.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TimDaub/emotext/fb77a3e1188f6d60cb921dbe48e4a0b55add08b0/emotext.db -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TimDaub/emotext/fb77a3e1188f6d60cb921dbe48e4a0b55add08b0/models/__init__.py -------------------------------------------------------------------------------- /models/models.py: -------------------------------------------------------------------------------- 1 | import json 2 | import shelve 3 | from ..apis.concept_net_client import lookup 4 | from ..apis.text import build_graph 5 | from ..apis.text import lang_name_to_code 6 | from ..utils.utils import extr_from_concept_net_edge 7 | from ..apis.text import text_processing 8 | from datetime import datetime 9 | from sets import Set 10 | from threading import Thread 11 | from ..utils.utils import get_config 12 | from collections import Counter 13 | 14 | MAX_DEPTH = get_config('graph_search', 'MAX_DEPTH', 'getint') 15 | MIN_WEIGHT = get_config('graph_search', 'MIN_WEIGHT', 'getint') 16 | REQ_LIMIT = get_config('conceptnet5_parameters', 'REQ_LIMIT', 'getint') 17 | 18 | class Conversation(Thread): 19 | """ 20 | A conversation represents a real-world conversation and is essentially 21 | a collection of single messages. 22 | """ 23 | def __init__(self, messages): 24 | Thread.__init__(self) 25 | self.messages = messages 26 | 27 | def run(self): 28 | self.emotions = self.conv_to_emotion_vectors() 29 | self.emotions = self.word_interpolation(self.emotions[0].text) 30 | 31 | def word_interpolation(self, words): 32 | """ 33 | Interpolates a list of words. 34 | List must be structurally identical to self.emotions. 35 | """ 36 | # In this word-based interpolation, we simply iterate (enumerate, as we need the index) over our 37 | # list and calculate the average of the previous and the next element. 
38 | # 39 | # A word is a dictionary with a name and a list of emotions. 40 | # 41 | # For the sake of simplicity, we interpolate the first element with the last, 42 | # and the last with the first. 43 | interpolated_words = list() 44 | 45 | for i, w in enumerate(words): 46 | prev_w = words[i-1] 47 | if i == len(words)-1: 48 | next_w = words[0] 49 | else: 50 | next_w = words[i+1] 51 | if prev_w is not None and next_w is not None: 52 | interpolated_w = self.interpolate_e_vector(prev_w, w, next_w) 53 | interpolated_words.append(interpolated_w) 54 | return interpolated_words 55 | 56 | def interpolate_e_vector(self, left, middle, right): 57 | """ 58 | Interpolates a dictionary emotions-vector with an arbitrary number and 59 | form of emotions. 60 | """ 61 | 62 | # An emotions-vector can have any emotion's name as a key. 63 | # If a key exists in for example only two of the passed vectors, we treat it as 0. 64 | emotions = Counter(left['emotions'].keys() + middle['emotions'].keys() + right['emotions'].keys()).keys() 65 | for e in emotions: 66 | 67 | if e in left['emotions']: 68 | left_e = left['emotions'][e] 69 | else: 70 | left_e = 0 71 | 72 | if e in middle['emotions']: 73 | middle_e = middle['emotions'][e] 74 | else: 75 | middle_e = 0 76 | 77 | if e in right['emotions']: 78 | right_e = right['emotions'][e] 79 | else: 80 | right_e = 0 81 | 82 | middle['emotions'][e] = (left_e + middle_e + right_e)/3 83 | return middle 84 | 85 | def conv_to_emotion_vectors(self): 86 | """ 87 | Converts a whole conversation and its messages to emotions. 88 | """ 89 | messages = list(self.messages) 90 | return [m.to_emotion_vector() for m in messages] 91 | 92 | def __repr__(self): 93 | return str(self.__dict__) 94 | 95 | class CacheController(): 96 | """ 97 | Extracting emotions from text through conceptnet5 can be a very time consuming task, 98 | especially when processing large quantities of text. 99 | 100 | The CacheController class therefore can be used to save word-based results persistently. 101 | """ 102 | 103 | # This class should simply act as a key-value storage cache that can be asked before a word is being processed. 104 | # If the word is not included in its cache, the word must be processed by traversing conceptnet5's 105 | # graph structure, else we can just use the already given result. 106 | # 107 | # Since different parameters (which can be found in config.cfg) alter the results immensely, 108 | # CacheController must be initialized with all those parameters. 109 | # Also, it is very likely that parameters will increase in later versions, hence naming function parameters 110 | # might be a good idea for everyone reusing this class. 111 | 112 | def __init__(self, max_depth, min_weight, req_limit): 113 | self.max_depth = max_depth 114 | self.min_weight = min_weight 115 | self.req_limit = req_limit 116 | 117 | # for every form those parameters can take, a new .db file is created on the hard drive. 118 | self.cache = shelve.open('./word_cache_%d_%d_%d' % (self.max_depth, self.min_weight, self.req_limit)) 119 | 120 | def add_word(self, word, emotions): 121 | """ 122 | Adds an emotion dictionary. 123 | 124 | This method will overwrite everything of an already given key. 125 | """ 126 | word = word.encode("utf8") 127 | self.cache[word] = emotions 128 | 129 | def fetch_word(self, word): 130 | """ 131 | Fetches a word and returns None if a KeyValue exception is thrown. 
132 | """ 133 | try: 134 | word = word.encode("utf8") 135 | return self.cache[word] 136 | except: 137 | # in case a word is not found in the cache 138 | return None 139 | 140 | def __repr__(self): 141 | """ 142 | Simply returns a dictionary as representation of the object 143 | """ 144 | return str(self.__dict__) 145 | 146 | class Message(): 147 | """ 148 | Represents a message a user of Emotext sends to the cofra framework. 149 | """ 150 | def __init__(self, entity_name, text, date=None, language='english'): 151 | self.entity_name = entity_name 152 | self.text = text 153 | self.date = date if date is not None else datetime.today() 154 | self.language = language 155 | 156 | def __repr__(self): 157 | """ 158 | Simply returns a dictionary as representation of the object. 159 | """ 160 | return str(self.__dict__) 161 | 162 | def __setitem__(self, key, value): 163 | self.__dict__[key] = value 164 | 165 | def to_emotion_vector(self, cc=CacheController(max_depth=MAX_DEPTH, min_weight=MIN_WEIGHT, req_limit=REQ_LIMIT)): 166 | """ 167 | Converts a message to an emotions-vector. 168 | This method can be used in combination with a CacheController, which defaults to emotext's config settings. 169 | """ 170 | 171 | # A conversation consists of an arbitrary number of messages, which contain 172 | # an arbitrary number of tokens. 173 | # 174 | # Due to the fact that processing text to emotions is a tedious process, 175 | # we implemented a Cache Service to enable faster processing of already seen words 176 | 177 | # Process text via Message object method that uses tokenization, stemming, punctuation removal and so on... 178 | tokens = " ".join([" ".join([w for w in s]) \ 179 | for s in \ 180 | text_processing(self.text, stemming=False)]) \ 181 | .split() 182 | 183 | # We have to use enumerate here, as a for-each loop's loop variable 184 | # would not let us replace the tokens in place 185 | for i, t in enumerate(tokens): 186 | empty_vector = { 187 | 'name': t, 188 | 'emotions': {} 189 | } 190 | 191 | if cc is not None: 192 | # we try to use the cache to find the word's emotions 193 | pot_t_vector = cc.fetch_word(t) 194 | if pot_t_vector is not None: 195 | tokens[i] = pot_t_vector 196 | else: 197 | tokens[i] = build_graph(Set([Node(t, lang_name_to_code(self.language), 'c')]), Set([]), empty_vector, 0) 198 | cc.add_word(tokens[i]['name'], tokens[i]) 199 | else: 200 | tokens[i] = build_graph(Set([Node(t, lang_name_to_code(self.language), 'c')]), Set([]), empty_vector, 0) 201 | self.text = tokens 202 | return self 203 | 204 | class Node(): 205 | def __init__(self, name, lang_code='en', type='c', rel=None, weight=0, edges=None, parent=None): 206 | self.name = name 207 | self.lang_code = lang_code 208 | self.type = type 209 | self.edges = edges if edges is not None else [] 210 | self.rel = rel 211 | self.weight = weight 212 | self.parent = parent 213 | 214 | def __repr__(self): 215 | """ 216 | Simply returns a dictionary as representation of the object 217 | """ 218 | return str(self.__dict__) 219 | 220 | def edge_lookup(self, used_names, lang_code='en'): 221 | """ 222 | Uses ConceptNet's lookup function to search for all related 223 | nodes to this one. 224 | 225 | Subsequently parses all of those edges and returns nothing 226 | when the update was successful.
227 | """ 228 | # node must at least have a name to do a lookup 229 | # otherwise, an exception is raised 230 | if self.name is None: 231 | raise Exception("Cannot do edge_lookup without the node's name.") 232 | # lookup token via ConceptNet web-API 233 | req = lookup(self.type, self.lang_code, self.name) 234 | token_res = req 235 | # used_names is a list of objects, however, in order to perform lookups, 236 | # we need it to be a list of strings 237 | # if result has more than 0 edges continue 238 | if token_res is not None and token_res['numFound'] > 0: 239 | edges = [] 240 | # for every edge, try converting it to a Node object that 241 | # can be processed further 242 | for e in token_res['edges']: 243 | # extract basic information from the 'end' key of an edge 244 | # it contains the type, lang_code and name of the node 245 | basic_start = extr_from_concept_net_edge(e['start']) 246 | basic_end = extr_from_concept_net_edge(e['end']) 247 | # instantiate a Node object from this information and append it to a list of edges 248 | # print basic_start['name'] + ' --> ' + e['rel'] + ' --> ' + basic_end['name'] 249 | if basic_end['name'] != self.name: 250 | if basic_end['name'] not in used_names and basic_end['lang_code'] == lang_code: 251 | edges.append(Node(basic_end['name'], basic_end['lang_code'], basic_end['type'], e['rel'], e['weight'], [], self)) 252 | else: 253 | if basic_start['name'] not in used_names and basic_start['lang_code'] == lang_code: 254 | edges.append(Node(basic_start['name'], basic_start['lang_code'], basic_start['type'], e['rel'], e['weight'], [], self)) 255 | # if all edges have been processed, add them to the current object 256 | self.edges = edges 257 | else: 258 | # if no edges were found on the token, raise an exception 259 | raise Exception('Token has no connecting edges.') 260 | 261 | class NodeEncoder(json.JSONEncoder): 262 | """ 263 | Taken from: http://stackoverflow.com/a/1458716/1263876 264 | 265 | The Node object is a recursive data structure that can contain itself, 266 | as it holds all of its child Nodes. 267 | 268 | Therefore, this method needs to be defined when trying to serialize a Node object 269 | to json. 270 | """ 271 | def default(self, obj): 272 | if not isinstance(obj, Node): 273 | return super(NodeEncoder, self).default(obj) 274 | return obj.__dict__ 275 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Emotext 2 | Emotext is a framework that helps you extract, save and correlate emotions with contextual information. 3 | It uses MIT's conceptnet5, nltk and Python. 4 | 5 | To enable programming-language-independent usage, Emotext's interface is provided RESTfully. 6 | 7 | ## Automatic installation 8 | Emotext uses `pip` for dependency management. To install all required dependencies, you simply run: 9 | 10 | pip install -r requirements.txt 11 | 12 | Additionally, you'll need to download `nltk`'s [language-specific files](#downloading-nltks-language-specific-files). 13 | 14 | ## Manual installation 15 | Python 2.7 is required (the code base uses `print` statements, `urllib2` and `ConfigParser`), as well as `pip` for installing dependencies. 16 | The web server is hosted using `flask`. Tests are implemented against the RESTful interface; therefore, the `requests` library is required. 17 | 18 | pip install flask 19 | pip install requests 20 | 21 | Furthermore, `nltk` is used for natural language processing.
It can also be installed using `pip`: 22 | 23 | pip install nltk 24 | 25 | However, `nltk` still needs [language-specific files](#downloading-nltks-language-specific-files). 26 | 27 | ## Downloading nltk's language-specific files 28 | Enter Python's interactive interpreter by typing `python` in your terminal and run the following commands: 29 | 30 | >>> import nltk 31 | >>> nltk.download() 32 | 33 | We recommend downloading all packages. 34 | 35 | ## Setting up a local Conceptnet image 36 | As mentioned already, Emotext is able to extract emotions from text. This is done by looking up concepts in conceptnet5's graph database. 37 | Through path finding, Emotext searches an arbitrary number of levels of the graph for a connection between the entered word and an emotion. 38 | This process requires *a lot* of lookups, which is why we recommend hosting a local instance of conceptnet5 instead of using the web-API. 39 | 40 | A detailed installation tutorial on how to set up docker and conceptnet5 can be found [here](https://github.com/commonsense/conceptnet5/wiki/Docker). 41 | However, we will still go through the installation process here: 42 | 43 | 1. [Install docker (Mac OS X)](https://docs.docker.com/installation/mac/). Make sure that you're using a bash shell, otherwise the installation will probably fail at some point. 44 | 2. [Increase your virtual machine's HD](https://docs.docker.com/articles/b2d_volume_resize/) up to 100-150 GB of storage. 45 | 3. Pull conceptnet5-web from docker's repositories: `sudo docker run rspeer/conceptnet-web:5.3` 46 | 4. In your VirtualBox GUI, set up a port forward from port 80 of your virtual machine to port 80 of your real machine (NAT interface in the tab "Network") 47 | 5. Run the application with a port forward from 10053 to 80, like this: 48 | `docker run -it --net=host -p 80:10053 rspeer/conceptnet-web` 49 | 6. Now, run either `boot2docker ip` or `arp -an` to find your virtual machine's IP 50 | 7. Once you've found the right IP, conceptnet5's web interface should appear when you enter it in your browser 51 | 52 | ## Configuration 53 | For convenience, parameters concerning, for example, the emotion extraction process can be adjusted in the file `config.cfg`. 54 | After changing this file, the server must be restarted. 55 | 56 | If you want to connect to the docker container's shell, try: 57 | `sudo docker exec -i -t <container-id> bash`. 58 | 59 | ### Removing conceptnet5's request limiter 60 | By default, conceptnet5 limits requests to about 6000 in 60 minutes (https://github.com/commonsense/conceptnet5/search?utf8=%E2%9C%93&q=Limiter). 61 | To remove the limiter, open the docker container's bash (as described above) and `cd conceptnet5`. Install nano (`apt-get install nano`) and open the file with `nano api.py`. 62 | Inside this file, `Limiter` gets imported and an instance of it is assigned to `limiter`. Python decorators (`@limiter`) are then used to limit conceptnet5's requests. You have to remove all of them.
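## Usage sketch
The snippet below is a minimal, untested sketch of how the classes in `models/models.py` fit together. It assumes the repository root is importable as a package named `emotext` (the modules use relative imports), that a valid `config.cfg` is present (see Configuration), and that a ConceptNet5 instance is reachable under the configured URLs:

    # hypothetical import path; adjust it to wherever the package lives in your project
    from emotext.models.models import Message, Conversation

    msgs = [
        Message(entity_name='alice', text='I love rollercoasters, they are great fun.'),
        Message(entity_name='alice', text='Waiting in line makes me angry though.')
    ]

    # Conversation is a Thread: run() converts each message into per-word
    # emotion vectors via ConceptNet5 and interpolates the results.
    conv = Conversation(msgs)
    conv.start()
    conv.join()
    print conv.emotions

Each entry in `conv.emotions` is a dictionary with a `name` and an `emotions` mapping, as built by `build_graph` in `apis/text.py`.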
63 | 64 | 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | altgraph==0.10.2 2 | bdist-mpkg==0.5.0 3 | #bonjour-py==0.3 4 | colorama==0.3.3 5 | cov-core==1.15.0 6 | coverage==3.7.1 7 | execnet==1.3.0 8 | Flask==0.10.1 9 | futures==2.2.0 10 | itsdangerous==0.24 11 | Jinja2==2.7.3 12 | macholib==1.5.1 13 | MarkupSafe==0.23 14 | #matplotlib==1.3.1 15 | modulegraph==0.10.4 16 | nltk==3.0.1 17 | numpy==1.8.0rc1 18 | psycopg2==2.6 19 | py==1.4.26 20 | py2app==0.7.3 21 | #pyobjc-core==2.5.1 22 | #pyobjc-framework-Accounts==2.5.1 23 | #pyobjc-framework-AddressBook==2.5.1 24 | #pyobjc-framework-AppleScriptKit==2.5.1 25 | #pyobjc-framework-AppleScriptObjC==2.5.1 26 | #pyobjc-framework-Automator==2.5.1 27 | #pyobjc-framework-CFNetwork==2.5.1 28 | #pyobjc-framework-Cocoa==2.5.1 29 | #pyobjc-framework-Collaboration==2.5.1 30 | #pyobjc-framework-CoreData==2.5.1 31 | #pyobjc-framework-CoreLocation==2.5.1 32 | #pyobjc-framework-CoreText==2.5.1 33 | #pyobjc-framework-DictionaryServices==2.5.1 34 | #pyobjc-framework-EventKit==2.5.1 35 | #pyobjc-framework-ExceptionHandling==2.5.1 36 | #pyobjc-framework-FSEvents==2.5.1 37 | #pyobjc-framework-InputMethodKit==2.5.1 38 | #pyobjc-framework-InstallerPlugins==2.5.1 39 | #pyobjc-framework-InstantMessage==2.5.1 40 | #pyobjc-framework-LatentSemanticMapping==2.5.1 41 | #pyobjc-framework-LaunchServices==2.5.1 42 | #pyobjc-framework-Message==2.5.1 43 | #pyobjc-framework-OpenDirectory==2.5.1 44 | #pyobjc-framework-PreferencePanes==2.5.1 45 | #pyobjc-framework-PubSub==2.5.1 46 | #pyobjc-framework-QTKit==2.5.1 47 | #pyobjc-framework-Quartz==2.5.1 48 | #pyobjc-framework-ScreenSaver==2.5.1 49 | #pyobjc-framework-ScriptingBridge==2.5.1 50 | #pyobjc-framework-SearchKit==2.5.1 51 | #pyobjc-framework-ServiceManagement==2.5.1 52 | #pyobjc-framework-Social==2.5.1 53 | #pyobjc-framework-SyncServices==2.5.1 54 | #pyobjc-framework-SystemConfiguration==2.5.1 55 | #pyobjc-framework-WebKit==2.5.1 56 | pyOpenSSL==0.13.1 57 | pyparsing==2.0.1 58 | pytest==2.6.4 59 | pytest-cov==1.8.1 60 | pytest-xdist==1.11 61 | python-dateutil==1.5 62 | python-termstyle==0.1.10 63 | pytz==2013.7 64 | requests==2.5.1 65 | requests-futures==0.9.5 66 | scipy==0.13.0 67 | six==1.4.1 68 | sniffer==0.3.4 69 | Twisted==13.2.0 70 | #vboxapi==1.0 71 | virtualenv==12.0.7 72 | Werkzeug==0.10.1 73 | xattr==0.6.4 74 | zope.interface==4.1.1 75 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TimDaub/emotext/fb77a3e1188f6d60cb921dbe48e4a0b55add08b0/utils/__init__.py -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import ConfigParser 2 | import os 3 | 4 | def extr_from_concept_net_edge(s): 5 | """ 6 | ConceptNet returnes on lookup edges that are named in this fashion: 7 | 8 | 'c/en/autobahn' 9 | 10 | From this we can extract: 11 | - type 12 | - language-code 13 | - name of the node 14 | """ 15 | params_list = s.split('/') 16 | if len(params_list) < 3: 17 | raise Exception('The given string did not contain at least two slashes.') 18 | return { 19 | 'type': params_list[1], 20 | 'lang_code': params_list[2], 21 | 'name': params_list[3] 22 | } 23 | 24 | def 
get_config(section, key, method_name='get'): 25 | """ 26 | Reads the 'config.cfg' file in the root directory and allows 27 | selecting specific values from it, which are returned if found. 28 | """ 29 | config_parser = ConfigParser.ConfigParser() 30 | config_parser.readfp(open(os.path.dirname(os.path.abspath(__file__)) + r'/../config.cfg')) 31 | try: 32 | if method_name == 'getlist': 33 | # split the string on commas 34 | l = config_parser.get(section, key).split(',') 35 | return l 36 | else: 37 | return getattr(config_parser, method_name)(section, key) 38 | except: 39 | print 'The combination of section and key has not been found in the config.cfg file.' 40 | return None --------------------------------------------------------------------------------
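`get_config` expects a `config.cfg` file at the repository root; the file is listed in `.gitignore`, so it is not part of this listing. Below is a minimal sketch of what it could look like. The section and key names are taken from the `get_config` calls in the code; the values are purely illustrative assumptions (point the URLs at your local ConceptNet5 instance and choose your own depth, weight, request limit and emotion list):

    [conceptnet5_parameters]
    # base host and API root of the ConceptNet5 instance (local docker image or the public API)
    SERVER_URL = http://conceptnet5.media.mit.edu
    API_URL = http://conceptnet5.media.mit.edu/data
    VERSION = 5.3
    REQ_LIMIT = 50

    [graph_search]
    MAX_DEPTH = 3
    MIN_WEIGHT = 1
    # comma-separated list, read through the 'getlist' branch of get_config
    EMOTIONS = joy,anger,fear,sadness,surprise,disgust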