├── .flake8 ├── .gitattributes ├── .gitignore ├── LICENCE ├── MANIFEST.in ├── README.md ├── ekphrasis ├── LICENCE ├── __init__.py ├── classes │ ├── __init__.py │ ├── exmanager.py │ ├── preprocessor.py │ ├── segmenter.py │ ├── spellcorrect.py │ └── tokenizer.py ├── dicts │ ├── __init__.py │ ├── emoticons.py │ ├── noslang │ │ ├── __init__.py │ │ ├── manager.py │ │ ├── slangdict.pickle │ │ └── slangdict.py │ └── sentiment │ │ ├── __init__.py │ │ └── nrc_emolex │ │ ├── NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt │ │ ├── NRCEmolex.py │ │ └── __init__.py ├── examples │ ├── SentReadMe.md │ ├── __init__.py │ ├── demo_data.py │ ├── demo_ext.py │ ├── demo_segmenter.py │ ├── demo_tok.py │ ├── example.py │ ├── segmenter_diffs.pickle │ ├── segmenter_diffs.txt │ ├── sentiment.py │ └── word_segmentation.ipynb ├── regexes │ ├── __init__.py │ ├── expressions.txt │ └── generate_expressions.py ├── stats │ └── .gitkeep ├── tools │ ├── __init__.py │ └── generate_stats.py └── utils │ ├── __init__.py │ ├── helpers.py │ └── nlp.py ├── local_install.sh ├── pypi_push.sh ├── requirements.txt ├── setup.cfg └── setup.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | 4 | exclude = 5 | .tox, 6 | __pycache__, 7 | build, 8 | dist 9 | 10 | ignore = 11 | # F401 imported but unused 12 | F401, 13 | # E501 line too long 14 | E501, 15 | # E303 too many blank lines 16 | E303, 17 | # E731 do not assign a lambda expression, use a def 18 | E731, 19 | # F812: list comprehension redefines ... 20 | F812, 21 | # E402 module level import not at top of file 22 | E402, 23 | # W292 no newline at end of file 24 | W292, 25 | # E999 SyntaxError: invalid syntax 26 | E999, 27 | # F821 undefined name 28 | F821, 29 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbaziotis/ekphrasis/309b6b089bb1ebaed705ba9ffa584f1826e296d4/.gitattributes -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | analysis/ 3 | corpus/ 4 | gen_corpus/ 5 | dist/ 6 | build/ 7 | *.egg-info/ 8 | classes/__pycache__ 9 | *.pyc 10 | /prototyping.py 11 | ekphrasis/__pycache__/ 12 | ekphrasis/classes/__pycache__/ 13 | ekphrasis/dicts/__pycache__/ 14 | ekphrasis/dicts/noslang/__pycache__/ 15 | ekphrasis/examples/.ipynb_checkpoints/ 16 | ekphrasis/examples/word_segmentation-Copy1.ipynb 17 | ekphrasis/stats/** 18 | ekphrasis/utils/__pycache__/ 19 | /local_install.bat 20 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Christos Baziotis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the 
Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # MANIFEST.in 2 | exclude .gitignore 3 | exclude .coverage 4 | exclude .travis.yml 5 | include README.rst 6 | include README.md 7 | include setup.cfg 8 | prune .cache 9 | prune .git 10 | prune build 11 | prune dist 12 | recursive-exclude *.egg-info * 13 | recursive-include tests * 14 | recursive-include ekphrasis/regexes * 15 | recursive-include regexes * 16 | recursive-exclude ekphrasis/stats * 17 | 18 | # data files 19 | #include stats/**/**/*.txt -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Collection of lightweight text tools, geared towards text from social networks, such as Twitter or Facebook, for tokenization, word normalization, word segmentation (for splitting hashtags) and spell correction, 2 | using word statistics from 2 big corpora (English Wikipedia, Twitter - 330 million English tweets). 3 | 4 | _ekphrasis_ was developed as part of the text processing pipeline for 5 | _DataStories_ team's submission for _SemEval-2017 Task 4 (English), Sentiment Analysis in Twitter_. 6 | 7 | If you use the library in your research project, please cite the paper 8 | ["DataStories at SemEval-2017 Task 4: Deep LSTM with Attention for Message-level and Topic-based Sentiment Analysis"](http://www.aclweb.org/anthology/S17-2126). 9 | 10 | Citation: 11 | ``` 12 | @InProceedings{baziotis-pelekis-doulkeridis:2017:SemEval2, 13 | author = {Baziotis, Christos and Pelekis, Nikos and Doulkeridis, Christos}, 14 | title = {DataStories at SemEval-2017 Task 4: Deep LSTM with Attention for Message-level and Topic-based Sentiment Analysis}, 15 | booktitle = {Proceedings of the 11th International Workshop on Semantic Evaluation (SemEval-2017)}, 16 | month = {August}, 17 | year = {2017}, 18 | address = {Vancouver, Canada}, 19 | publisher = {Association for Computational Linguistics}, 20 | pages = {747--754} 21 | } 22 | ``` 23 | 24 | **Disclaimer:** The library is no longer actively developed. I will try to resolve important issues, but I can't make any promises. 25 | 26 | # Installation 27 | 28 | Build from source: 29 | ``` 30 | pip install git+git://github.com/cbaziotis/ekphrasis.git 31 | ``` 32 | or install from PyPI: 33 | ``` 34 | pip install ekphrasis -U 35 | ``` 36 | 37 | # Overview 38 | 39 | _ekphrasis_ offers the following functionality: 40 | 41 | 1. **Social Tokenizer**. A text tokenizer geared towards social networks (Facebook, Twitter...), 42 | which understands complex emoticons, emojis and other unstructured expressions like dates, times and more. 43 | 44 | 2. **Word Segmentation**. You can split a long string into its constituent words. Suitable for hashtag segmentation. 45 | 46 | 3. **Spell Correction**. You can replace a misspelled word with the most probable candidate word. 47 | 48 | 4. 
**Customization**. Tailor the word segmentation, spell correction and term identification to suit your needs. 49 | 50 | The Word Segmentation and Spell Correction mechanisms operate on top of word statistics collected from a given corpus. We provide word statistics from 2 big corpora (from Wikipedia and Twitter), but you can also generate word statistics from your own corpus. You may need to do that if you are working with domain-specific texts, like biomedical documents. For example, a word describing a technique or a chemical compound may be treated as a misspelled word when using the word statistics from a general-purpose corpus. 51 | 52 | _ekphrasis_ tokenizes the text based on a list of regular expressions. You can easily enable _ekphrasis_ to identify new entities by simply adding a new entry to the dictionary of regular expressions (`ekphrasis/regexes/expressions.txt`). 53 | 54 | 5. **Pre-Processing Pipeline**. You can combine all the above steps in an easy way, in order to prepare the text files in your dataset for some kind of analysis or for machine learning. 55 | In addition to the aforementioned actions, you can perform text normalization, word annotation (labeling) and more. 56 | 57 | 58 | 59 | 60 | ## Text Pre-Processing pipeline 61 | 62 | You can easily define a preprocessing pipeline by using the ``TextPreProcessor``. 63 | 64 | ```python 65 | from ekphrasis.classes.preprocessor import TextPreProcessor 66 | from ekphrasis.classes.tokenizer import SocialTokenizer 67 | from ekphrasis.dicts.emoticons import emoticons 68 | 69 | text_processor = TextPreProcessor( 70 | # terms that will be normalized 71 | normalize=['url', 'email', 'percent', 'money', 'phone', 'user', 72 | 'time', 'url', 'date', 'number'], 73 | # terms that will be annotated 74 | annotate={"hashtag", "allcaps", "elongated", "repeated", 75 | 'emphasis', 'censored'}, 76 | fix_html=True, # fix HTML tokens 77 | 78 | # corpus from which the word statistics are going to be used 79 | # for word segmentation 80 | segmenter="twitter", 81 | 82 | # corpus from which the word statistics are going to be used 83 | # for spell correction 84 | corrector="twitter", 85 | 86 | unpack_hashtags=True, # perform word segmentation on hashtags 87 | unpack_contractions=True, # Unpack contractions (can't -> can not) 88 | spell_correct_elong=False, # spell correction for elongated words 89 | 90 | # select a tokenizer. You can use SocialTokenizer, or pass your own. 91 | # the tokenizer should take as input a string and return a list of tokens 92 | tokenizer=SocialTokenizer(lowercase=True).tokenize, 93 | 94 | # list of dictionaries, for replacing tokens extracted from the text 95 | # with other expressions. You can pass more than one dictionary. 96 | dicts=[emoticons] 97 | ) 98 | 99 | sentences = [ 100 | "CANT WAIT for the new season of #TwinPeaks \(^o^)/!!! #davidlynch #tvseries :)))", 101 | "I saw the new #johndoe movie and it suuuuucks!!! WAISTED $10... #badmovies :/", 102 | "@SentimentSymp: can't wait for the Nov 9 #Sentiment talks! YAAAAAAY !!! :-D http://sentimentsymposium.com/." 103 | ] 104 | 105 | for s in sentences: 106 | print(" ".join(text_processor.pre_process_doc(s))) 107 | ``` 108 | 109 | Output: 110 | 111 | ``` 112 | cant wait for the new season of twin peaks \(^o^)/ ! david lynch tv series 113 | 114 | i saw the new john doe movie and it sucks ! waisted . bad movies 115 | 116 | : can not wait for the sentiment talks ! yay ! 117 | ``` 118 | 119 | 120 | Notes: 121 | 122 | * elongated words are automatically normalized. 
123 | * Spell correction affects performance. 124 | 125 | --- 126 | 127 | ### Word Statistics 128 | _ekphrasis_ provides word statistics (unigrams and bigrams) from 2 big corpora: 129 | * the English Wikipedia 130 | * a collection of 330 million English Twitter messages 131 | 132 | These word statistics are required for the word segmentation and spell correction. 133 | Moreover, you can generate word statistics from your own corpus. 134 | You can use `ekphrasis/tools/generate_stats.py` to generate statistics from a text file, or from a directory that contains a collection of text files. 135 | For example, in order to generate word statistics for [text8](http://mattmahoney.net/dc/textdata.html) (http://mattmahoney.net/dc/text8.zip), you can do: 136 | 137 | ``` 138 | python generate_stats.py --input text8.txt --name text8 --ngrams 2 --mincount 70 30 139 | ``` 140 | * input: path to the file or directory containing the files for calculating the statistics. 141 | * name: the name of the corpus. 142 | * ngrams: up to how many ngrams to calculate statistics for. 143 | * mincount: the minimum count an ngram must have in order to be included. 144 | In this case, the mincount for unigrams is 70 and for bigrams is 30. 145 | 146 | After you run the script, you will see a new directory inside `ekphrasis/stats/` with the statistics of your corpus. 147 | In the case of the example above, `ekphrasis/stats/text8/`. 148 | 149 | 150 | 151 | ### Word Segmentation 152 | The word segmentation implementation uses the Viterbi algorithm and is based on [CH14](http://norvig.com/ngrams/ch14.pdf) from the book [Beautiful Data (Segaran and Hammerbacher, 2009)](http://shop.oreilly.com/product/9780596157128.do). 153 | The implementation requires word statistics in order to identify and separate the words in a string. 154 | You can use the word statistics from one of the 2 provided corpora, or from your own corpus. 155 | 156 | 157 | **Example:** 158 | In order to perform word segmentation, first you have to instantiate a segmenter with a given corpus, and then just use the `segment()` method: 159 | ```python 160 | from ekphrasis.classes.segmenter import Segmenter 161 | seg = Segmenter(corpus="mycorpus") 162 | print(seg.segment("smallandinsignificant")) 163 | ``` 164 | Output: 165 | ``` 166 | > small and insignificant 167 | ``` 168 | 169 | You can compare the output using statistics from the different corpora: 170 | ```python 171 | from ekphrasis.classes.segmenter import Segmenter 172 | 173 | # segmenter using the word statistics from English Wikipedia 174 | seg_eng = Segmenter(corpus="english") 175 | 176 | # segmenter using the word statistics from Twitter 177 | seg_tw = Segmenter(corpus="twitter") 178 | 179 | words = ["exponentialbackoff", "gamedev", "retrogaming", "thewatercooler", "panpsychism"] 180 | for w in words: 181 | print(w) 182 | print("(eng):", seg_eng.segment(w)) 183 | print("(tw):", seg_tw.segment(w)) 184 | print() 185 | ``` 186 | Output: 187 | ``` 188 | exponentialbackoff 189 | (eng): exponential backoff 190 | (tw): exponential back off 191 | 192 | gamedev 193 | (eng): gamedev 194 | (tw): game dev 195 | 196 | retrogaming 197 | (eng): retrogaming 198 | (tw): retro gaming 199 | 200 | thewatercooler 201 | (eng): the water cooler 202 | (tw): the watercooler 203 | 204 | panpsychism 205 | (eng): panpsychism 206 | (tw): pan psych is m 207 | 208 | ``` 209 | 210 | Finally, if the word is camelCased or PascalCased, then the algorithm splits the word based on the case of the characters. 
211 | ```python 212 | from ekphrasis.classes.segmenter import Segmenter 213 | seg = Segmenter() 214 | print(seg.segment("camelCased")) 215 | print(seg.segment("PascalCased")) 216 | ``` 217 | Output: 218 | ``` 219 | > camel cased 220 | > pascal cased 221 | ``` 222 | 223 | ### Spell Correction 224 | The Spell Corrector is based on [Peter Norvig's spell-corrector](http://norvig.com/spell-correct.html). 225 | Just like the segmentation algorithm, we utilize word statistics in order to find the most probable candidate. 226 | Besides the provided statistics, you can use your own. 227 | 228 | **Example:** 229 | 230 | You can perform spell correction just like word segmentation. 231 | First you have to instantiate a `SpellCorrector` object 232 | that uses the statistics from the corpus of your choice, and then use one of the available methods. 233 | ```python 234 | from ekphrasis.classes.spellcorrect import SpellCorrector 235 | sp = SpellCorrector(corpus="english") 236 | print(sp.correct("korrect")) 237 | ``` 238 | Output: 239 | ``` 240 | > correct 241 | ``` 242 | 243 | 244 | ### Social Tokenizer 245 | The difficulty in tokenization is to avoid splitting expressions or words that should be kept intact (as one token). 246 | This is more important in texts from social networks, with "creative" writing and expressions like emoticons, hashtags and so on. 247 | Although there are some tokenizers geared towards Twitter [1], [2] 248 | that recognize the Twitter markup and some basic sentiment expressions or simple emoticons, 249 | our tokenizer is able to identify almost all emoticons, emojis and many complex expressions. 250 | 251 | Especially for tasks such as sentiment analysis, there are many expressions that play a decisive role in identifying the sentiment expressed in text. Expressions like these are: 252 | 253 | - Censored words, such as ``f**k``, ``s**t``. 254 | - Words with emphasis, such as ``a *great* time``, ``I don't *think* I ...``. 255 | - Emoticons, such as ``>:(``, ``:))``, ``\o/``. 256 | - Dash-separated words, such as ``over-consumption``, ``anti-american``, ``mind-blowing``. 257 | 258 | Moreover, ekphrasis can identify information-bearing expressions. Depending on the task, you may want to preserve / extract them as one token (IR) and then normalize them, since this information may be irrelevant for the task (sentiment analysis). Expressions like these are: 259 | 260 | 261 | - Dates, such as ``Feb 18th``, ``December 2, 2016``, ``December 2-2016``, 262 | ``10/17/94``, ``3 December 2016``, ``April 25, 1995``, ``11.15.16``, 263 | ``November 24th 2016``, ``January 21st``. 264 | - Times, such as ``5:45pm``, ``11:36 AM``, ``2:45 pm``, ``5:30``. 265 | - Currencies, such as ``$220M``, ``$2B``, ``$65.000``, ``€10``, ``$50K``. 266 | - Phone numbers. 267 | - URLs, such as ``http://www.cs.unipi.gr``, ``https://t.co/Wfw5Z1iSEt``. 268 | 269 | **Example**: 270 | 271 | ```python 272 | import nltk 273 | from ekphrasis.classes.tokenizer import SocialTokenizer 274 | 275 | 276 | def wsp_tokenizer(text): 277 | return text.split(" ") 278 | 279 | puncttok = nltk.WordPunctTokenizer().tokenize 280 | 281 | social_tokenizer = SocialTokenizer(lowercase=False).tokenize 282 | 283 | sents = [ 284 | "CANT WAIT for the new season of #TwinPeaks \(^o^)/ yaaaay!!! #davidlynch #tvseries :)))", 285 | "I saw the new #johndoe movie and it suuuuucks!!! WAISTED $10... #badmovies >3:/", 286 | "@SentimentSymp: can't wait for the Nov 9 #Sentiment talks! YAAAAAAY !!! 
>:-D http://sentimentsymposium.com/.", 287 | ] 288 | 289 | for s in sents: 290 | print() 291 | print("ORG: ", s) # original sentence 292 | print("WSP : ", wsp_tokenizer(s)) # whitespace tokenizer 293 | print("WPU : ", puncttok(s)) # WordPunct tokenizer 294 | print("SC : ", social_tokenizer(s)) # social tokenizer 295 | 296 | ``` 297 | 298 | Output: 299 | 300 | ``` 301 | ORG: CANT WAIT for the new season of #TwinPeaks \(^o^)/ yaaaay!!! #davidlynch #tvseries :))) 302 | WSP : ['CANT', 'WAIT', 'for', 'the', 'new', 'season', 'of', '#TwinPeaks', '\(^o^)/', 'yaaaay!!!', '#davidlynch', '#tvseries', ':)))'] 303 | WPU : ['CANT', 'WAIT', 'for', 'the', 'new', 'season', 'of', '#', 'TwinPeaks', '\(^', 'o', '^)/', 'yaaaay', '!!!', '#', 'davidlynch', '#', 'tvseries', ':)))'] 304 | SC : ['CANT', 'WAIT', 'for', 'the', 'new', 'season', 'of', '#TwinPeaks', '\(^o^)/', 'yaaaay', '!', '!', '!', '#davidlynch', '#tvseries', ':)))'] 305 | 306 | ORG: I saw the new #johndoe movie and it suuuuucks!!! WAISTED $10... #badmovies >3:/ 307 | WSP : ['I', 'saw', 'the', 'new', '#johndoe', 'movie', 'and', 'it', 'suuuuucks!!!', 'WAISTED', '$10...', '#badmovies', '>3:/'] 308 | WPU : ['I', 'saw', 'the', 'new', '#', 'johndoe', 'movie', 'and', 'it', 'suuuuucks', '!!!', 'WAISTED', '$', '10', '...', '#', 'badmovies', '>', '3', ':/'] 309 | SC : ['I', 'saw', 'the', 'new', '#johndoe', 'movie', 'and', 'it', 'suuuuucks', '!', '!', '!', 'WAISTED', '$10', '.', '.', '.', '#badmovies', '>', '3:/'] 310 | ``` 311 | 312 | 313 | 314 | 321 | 322 | #### References 323 | 324 | [1] K. Gimpel et al., “Part-of-speech tagging for twitter: Annotation, features, and experiments,” in Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies: short papers-Volume 2, 2011, pp. 42–47. 325 | 326 | [2] C. Potts, “Sentiment Symposium Tutorial: Tokenizing,” Sentiment Symposium Tutorial, 2011. [Online]. Available: http://sentiment.christopherpotts.net/tokenizing.html. 327 | -------------------------------------------------------------------------------- /ekphrasis/LICENCE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Christos Baziotis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /ekphrasis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbaziotis/ekphrasis/309b6b089bb1ebaed705ba9ffa584f1826e296d4/ekphrasis/__init__.py -------------------------------------------------------------------------------- /ekphrasis/classes/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbaziotis/ekphrasis/309b6b089bb1ebaed705ba9ffa584f1826e296d4/ekphrasis/classes/__init__.py -------------------------------------------------------------------------------- /ekphrasis/classes/exmanager.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | 5 | 6 | class ExManager: 7 | ext_path = os.path.join(os.path.dirname(__file__), 8 | '../regexes/expressions.txt') 9 | 10 | with open(ext_path) as fh: 11 | expressions = json.load(fh) 12 | 13 | def get_compiled(self): 14 | regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in 15 | self.expressions.items()} 16 | return regexes 17 | 18 | def print_expressions(self): 19 | {print(k.lower(), ":", self.expressions[k]) 20 | for k, v in sorted(self.expressions.items())} 21 | -------------------------------------------------------------------------------- /ekphrasis/classes/preprocessor.py: -------------------------------------------------------------------------------- 1 | import re 2 | from functools import lru_cache 3 | 4 | import ftfy 5 | 6 | from ekphrasis.classes.exmanager import ExManager 7 | from ekphrasis.classes.segmenter import Segmenter 8 | from ekphrasis.classes.spellcorrect import SpellCorrector 9 | from ekphrasis.utils.nlp import unpack_contractions 10 | from ekphrasis.utils.helpers import remove_tags 11 | 12 | # noinspection PyPackageRequirements 13 | class TextPreProcessor: 14 | def __init__(self, **kwargs): 15 | """ 16 | Kwargs: 17 | omit (list): choose what tokens that you want to omit from the text. 18 | possible values: ['email', 'percent', 'money', 'phone', 'user', 19 | 'time', 'url', 'date', 'hashtag'] 20 | Important Notes: 21 | 1 - put url at front, if you plan to use it. 22 | Messes with the regexes! 23 | 2 - if you use hashtag then unpack_hashtags will 24 | automatically be set to False 25 | 26 | normalize (list): choose what tokens that you want to normalize 27 | from the text. 28 | possible values: ['email', 'percent', 'money', 'phone', 'user', 29 | 'time', 'url', 'date', 'hashtag'] 30 | for example: myaddress@mysite.com will be transformed to 31 | Important Notes: 32 | 1 - put url at front, if you plan to use it. 33 | Messes with the regexes! 34 | 2 - if you use hashtag then unpack_hashtags will 35 | automatically be set to False 36 | 37 | unpack_contractions (bool): Replace *English* contractions in 38 | ``text`` str with their unshortened forms 39 | for example: can't -> can not, wouldn't -> would not, and so on... 40 | 41 | unpack_hashtags (bool): split a hashtag to it's constituent words. 42 | for example: #ilikedogs -> i like dogs 43 | 44 | annotate (list): add special tags to special tokens. 
45 | possible values: ['hashtag', 'allcaps', 'elongated', 'repeated'] 46 | for example: myaddress@mysite.com -> myaddress@mysite.com 47 | 48 | tokenizer (callable): callable function that accepts a string and 49 | returns a list of strings if no tokenizer is provided then 50 | the text will be tokenized on whitespace 51 | 52 | segmenter (str): define the statistics of what corpus you would 53 | like to use [english, twitter] 54 | 55 | corrector (str): define the statistics of what corpus you would 56 | like to use [english, twitter] 57 | 58 | all_caps_tag (str): how to wrap the capitalized words 59 | values [single, wrap, every] 60 | Note: applicable only when `allcaps` is included in annotate[] 61 | - single: add a tag after the last capitalized word 62 | - wrap: wrap all words with opening and closing tags 63 | - every: add a tag after each word 64 | 65 | spell_correct_elong (bool): choose if you want to perform 66 | spell correction after the normalization of elongated words. 67 | * significantly affects performance (speed) 68 | 69 | spell_correction (bool): choose if you want to perform 70 | spell correction to the text 71 | * significantly affects performance (speed) 72 | 73 | fix_text (bool): choose if you want to fix bad unicode terms and 74 | html entities. 75 | 76 | remove_tags (bool): Choose to remove tags after processing 77 | """ 78 | self.omit = kwargs.get("omit", {}) 79 | self.backoff = kwargs.get("normalize", {}) 80 | self.include_tags = kwargs.get("annotate", {}) 81 | self.unpack_contractions = kwargs.get("unpack_contractions", False) 82 | self.tokenizer = kwargs.get("tokenizer", None) 83 | self.dicts = kwargs.get("dicts", None) 84 | self.spell_correction = kwargs.get("spell_correction", False) 85 | self.spell_correct_elong = kwargs.get("spell_correct_elong", False) 86 | self.fix_text = kwargs.get("fix_bad_unicode", False) 87 | self.unpack_hashtags = kwargs.get("unpack_hashtags", False) 88 | self.segmenter_corpus = kwargs.get("segmenter", "english") 89 | self.corrector_corpus = kwargs.get("corrector", "english") 90 | self.all_caps_tag = kwargs.get("all_caps_tag", "wrap") 91 | self.mode = kwargs.get("mode", "normal") 92 | self.remove_tags = kwargs.get("remove_tags", False) 93 | 94 | if self.unpack_hashtags: 95 | self.segmenter = Segmenter(corpus=self.segmenter_corpus) 96 | if self.mode != "fast": 97 | self.spell_corrector = SpellCorrector(corpus=self.corrector_corpus) 98 | 99 | self.regexes = ExManager().get_compiled() 100 | if 'hashtag' in self.omit or 'hashtag' in self.backoff: 101 | print("You can't omit/backoff and unpack hashtags!\n " 102 | "unpack_hashtags will be set to False") 103 | self.unpack_hashtags = False 104 | 105 | def __copy__(self): 106 | return self 107 | 108 | def __deepcopy__(self, memo): 109 | return self 110 | 111 | @staticmethod 112 | def add_special_tag(m, tag, mode="single"): 113 | 114 | if isinstance(m, str): 115 | text = m 116 | else: 117 | text = m.group() 118 | 119 | if mode == "single": 120 | return " {} <{}> ".format(text, tag) 121 | elif mode == "wrap": 122 | return " ".join([" <{}> {} ".format(tag, text, tag)]) + " " 123 | elif mode == "every": 124 | tokens = text.split() 125 | processed = " ".join([" {} <{}> ".format(t, tag) 126 | for t in tokens]) 127 | return " " + processed + " " 128 | 129 | @lru_cache(maxsize=65536) 130 | def handle_hashtag_match(self, m): 131 | """ 132 | Break a string to its constituent words (using Viterbi algorithm) 133 | """ 134 | text = m.group()[1:] 135 | 136 | # todo:simplify routine 137 | if text.islower(): 138 
| expanded = self.segmenter.segment(text) 139 | expanded = " ".join(expanded.split("-")) 140 | expanded = " ".join(expanded.split("_")) 141 | # print(m.group(), " - ", expanded) 142 | # with open("analysis/segmenter_" + 143 | # self.segmenter_corpus + ".txt", "a") as f: 144 | # f.write(m.group() + "\t" + expanded + "\n") 145 | 146 | else: 147 | # split words following CamelCase convention 148 | expanded = self.regexes["camel_split"].sub(r' \1', text) 149 | expanded = expanded.replace("-", "") 150 | expanded = expanded.replace("_", "") 151 | # print(m.group(), " - ", expanded) 152 | 153 | if "hashtag" in self.include_tags: 154 | expanded = self.add_special_tag(expanded, "hashtag", mode="wrap") 155 | 156 | return expanded 157 | 158 | def handle_elongated_match(self, m): 159 | text = m.group() 160 | # normalize to at most 2 repeating chars 161 | text = self.regexes["normalize_elong"].sub(r'\1\1', text) 162 | normalized = self.spell_corrector.normalize_elongated(text) 163 | if normalized: 164 | text = normalized 165 | 166 | # try to spell correct the word 167 | if self.spell_correct_elong: 168 | text = self.spell_corrector.correct_word(text, assume_wrong=True, 169 | fast=True) 170 | # with open("analysis/spell_corrector_" + 171 | # self.corrector_corpus + ".txt", "a") as f: 172 | # f.write(m.group() + " - " + text + "\n") 173 | 174 | # print(m.group(), "-", text) 175 | if "elongated" in self.include_tags: 176 | text = self.add_special_tag(text, "elongated") 177 | 178 | return text 179 | 180 | 181 | @lru_cache(maxsize=65536) 182 | def handle_repeated_puncts(self, m): 183 | """ 184 | return the sorted set so mathes random combinations of puncts 185 | will be mapped to the same token 186 | "!??!?!!", "?!!!!?!", "!!?", "!?!?" --> "?!" 187 | "!...", "...?!" --> ".!" 
188 | :param m: 189 | :return: 190 | """ 191 | text = m.group() 192 | text = "".join(sorted(set(text), reverse=True)) 193 | 194 | if "repeated" in self.include_tags: 195 | text = self.add_special_tag(text, "repeated") 196 | 197 | return text 198 | 199 | @lru_cache(maxsize=65536) 200 | def handle_generic_match(self, m, tag, mode="every"): 201 | """ 202 | 203 | Args: 204 | m (): 205 | tag (): 206 | mode (): 207 | 208 | Returns: 209 | 210 | """ 211 | text = m.group() 212 | text = self.add_special_tag(text, tag, mode=mode) 213 | 214 | return text 215 | 216 | @lru_cache(maxsize=65536) 217 | def handle_emphasis_match(self, m): 218 | """ 219 | :param m: 220 | :return: 221 | """ 222 | text = m.group().replace("*", "") 223 | if "emphasis" in self.include_tags: 224 | text = self.add_special_tag(text, "emphasis") 225 | 226 | return text 227 | 228 | @staticmethod 229 | def dict_replace(wordlist, _dict): 230 | return [_dict[w] if w in _dict else w for w in wordlist] 231 | 232 | @staticmethod 233 | def remove_hashtag_allcaps(wordlist): 234 | in_hashtag = False 235 | _words = [] 236 | for word in wordlist: 237 | 238 | if word == "": 239 | in_hashtag = True 240 | elif word == "": 241 | in_hashtag = False 242 | elif word in {"", ""} and in_hashtag: 243 | continue 244 | 245 | _words.append(word) 246 | 247 | return _words 248 | 249 | def pre_process_doc(self, doc): 250 | 251 | doc = re.sub(r' +', ' ', doc) # remove repeating spaces 252 | 253 | # ########################### 254 | # # fix bad unicode 255 | # ########################### 256 | # if self.fix_bad_unicode: 257 | # doc = textacy.preprocess.fix_bad_unicode(doc) 258 | # 259 | # ########################### 260 | # # fix html leftovers 261 | # ########################### 262 | # doc = html.unescape(doc) 263 | 264 | ########################### 265 | # fix text 266 | ########################### 267 | if self.fix_text: 268 | doc = ftfy.fix_text(doc) 269 | 270 | ########################### 271 | # BACKOFF & OMIT 272 | ########################### 273 | for item in self.backoff: 274 | # better add an extra space after the match. 275 | # Just to be safe. extra spaces will be normalized later anyway 276 | doc = self.regexes[item].sub(lambda m: " " + "<" + item + ">" + " ", 277 | doc) 278 | for item in self.omit: 279 | doc = doc.replace("<" + item + ">", '') 280 | 281 | ########################### 282 | # unpack hashtags 283 | ########################### 284 | if self.unpack_hashtags: 285 | doc = self.regexes["hashtag"].sub( 286 | lambda w: self.handle_hashtag_match(w), doc) 287 | 288 | ########################### 289 | # handle special cases 290 | ########################### 291 | if self.mode != "fast": 292 | if "allcaps" in self.include_tags: 293 | doc = self.regexes["allcaps"].sub( 294 | lambda w: self.handle_generic_match(w, "allcaps", 295 | mode=self.all_caps_tag), 296 | doc) 297 | 298 | if "elongated" in self.include_tags: 299 | doc = self.regexes["elongated"].sub( 300 | lambda w: self.handle_elongated_match(w), doc) 301 | 302 | if "repeated" in self.include_tags: 303 | doc = self.regexes["repeat_puncts"].sub( 304 | lambda w: self.handle_repeated_puncts(w), doc) 305 | 306 | if "emphasis" in self.include_tags: 307 | doc = self.regexes["emphasis"].sub( 308 | lambda w: self.handle_emphasis_match(w), doc) 309 | 310 | if "censored" in self.include_tags: 311 | doc = self.regexes["censored"].sub( 312 | lambda w: self.handle_generic_match(w, "censored"), doc) 313 | 314 | ########################### 315 | # unpack contractions: i'm -> i am, can't -> can not... 
316 | ########################### 317 | 318 | # remove textacy dependency 319 | if self.unpack_contractions: 320 | doc = unpack_contractions(doc) 321 | 322 | if self.remove_tags: 323 | doc = remove_tags(doc) 324 | 325 | # omit allcaps if inside hashtags 326 | doc = re.sub(r' +', ' ', doc) # remove repeating spaces 327 | # doc = re.sub(r'', '', doc) # remove repeating spaces 328 | # doc = doc.replace(' ', '') 329 | # doc = doc.replace(' ', '') 330 | 331 | ########################### 332 | # Tokenize 333 | ########################### 334 | doc = self.remove_hashtag_allcaps(doc.split()) 335 | doc = " ".join(doc) # normalize whitespace 336 | if self.tokenizer: 337 | doc = self.tokenizer(doc) 338 | 339 | # Replace tokens with special dictionaries (slang,emoticons ...) 340 | # todo: add spell check before! 341 | if self.dicts: 342 | for d in self.dicts: 343 | doc = self.dict_replace(doc, d) 344 | 345 | return doc 346 | 347 | def pre_process_docs(self, docs, lazy=True): 348 | from tqdm import tqdm 349 | for d in tqdm(docs, desc="PreProcessing..."): 350 | yield self.pre_process_doc(d) 351 | -------------------------------------------------------------------------------- /ekphrasis/classes/segmenter.py: -------------------------------------------------------------------------------- 1 | import re 2 | from functools import lru_cache 3 | from math import log10 4 | 5 | from ekphrasis.classes.exmanager import ExManager 6 | from ekphrasis.utils.helpers import read_stats 7 | 8 | """ 9 | The Segmenter Class implements the Viterbi algorithm for word segmentation. 10 | Based on CH14 from the book Beautiful Data (Segaran and Hammerbacher, 2009) 11 | """ 12 | 13 | REGEX_TOKEN = re.compile(r'\b[a-z]{2,}\b') 14 | NGRAM_SEP = "_" # todo: move to values 15 | 16 | 17 | class Pdist(dict): 18 | """ 19 | A probability distribution estimated from word counts 20 | Notice: if pw = Pdist(unigrams, n_tokens: 21 | * pw[w] is the raw count of the word w 22 | * pw(w) is the probability of the word w 23 | """ 24 | 25 | @staticmethod 26 | def default_unk_func(key, total): 27 | return 1. / total 28 | 29 | def __init__(self, data=None, total=None, unk_func=None, **kwargs): 30 | super().__init__(**kwargs) 31 | 32 | # insert the word counts 33 | data = data or {} 34 | for key, count in data.items(): 35 | self[key] = self.get(key, 0) + int(count) 36 | 37 | self.total = float(total or sum(self.values())) 38 | self.unk_prob = unk_func or self.default_unk_func 39 | 40 | def __call__(self, key): 41 | if key in self: 42 | return self[key] / self.total 43 | else: 44 | return self.unk_prob(key, self.total) 45 | 46 | 47 | class Segmenter: 48 | def __init__(self, corpus="english", max_split_length=20): 49 | """ 50 | Args: 51 | corpus (str): the statistics from which corpus to use for 52 | the spell correction. 
53 | max_split_length (int): the maximum length of that a word can have 54 | for looking for splits 55 | """ 56 | 57 | # self.unigrams = Counter(read_stats(corpus, 1)) 58 | # self.bigrams = Counter(read_stats(corpus, 2)) 59 | self.unigrams = read_stats(corpus, 1) 60 | self.bigrams = read_stats(corpus, 2) 61 | self.N = sum(self.unigrams.values()) 62 | self.L = max_split_length 63 | 64 | self.Pw = Pdist(self.unigrams, self.N, self.unk_probability) 65 | self.P2w = Pdist(self.bigrams, self.N) 66 | 67 | self.case_split = ExManager().get_compiled()["camel_split"] 68 | 69 | def condProbWord(self, word, prev): 70 | """ 71 | Conditional probability of word, given previous word 72 | if bigram is not in our list, then fall back to unigrams 73 | Args: 74 | word (): candidate word 75 | prev (): previous observed word 76 | 77 | Returns: 78 | 79 | """ 80 | try: 81 | return self.P2w[prev + NGRAM_SEP + word] / float(self.Pw[prev]) 82 | except KeyError: 83 | return self.Pw(word) 84 | 85 | @staticmethod 86 | def unk_probability(key, total): 87 | """ 88 | Estimate the probability of an unknown word, penalizing its length 89 | :param key: the word 90 | :param total: the count of all tokens 91 | :return: 92 | """ 93 | return 10. / (total * 10 ** len(key)) 94 | 95 | @staticmethod 96 | def combine(first, rem): 97 | """ 98 | Combine first and rem results into one (probability, words) pair 99 | :param first: a tuple in the form: probability, word 100 | :param rem: a tuple in the form: probability, list_of_words 101 | :return: 102 | """ 103 | (first_prob, first_word) = first 104 | (rem_prob, rem_words) = rem 105 | return first_prob + rem_prob, [first_word] + rem_words 106 | 107 | def splits(self, text): 108 | """ 109 | Return a list of all possible (first, rem) pairs with max length of first <=L 110 | :param text: 111 | :return: 112 | """ 113 | return [(text[:i + 1], text[i + 1:]) 114 | for i in range(min(len(text), self.L))] 115 | 116 | # if you don't have enough RAM lower the maxsize 117 | @lru_cache(maxsize=65536) 118 | def find_segment(self, text, prev=''): 119 | """ 120 | Return (log P(words), words), where words is the best estimated segmentation 121 | :param text: the text to be segmented 122 | :param prev: 123 | :return: 124 | """ 125 | if not text: 126 | return 0.0, [] 127 | candidates = [self.combine((log10(self.condProbWord(first, prev)), first), self.find_segment(rem, first)) 128 | for first, rem in self.splits(text)] 129 | return max(candidates) 130 | 131 | # if you don't have enough RAM lower the maxsize 132 | @lru_cache(maxsize=65536) 133 | def segment(self, word): 134 | if word.islower(): 135 | return " ".join(self.find_segment(word)[1]) 136 | else: 137 | return self.case_split.sub(r' \1', word).lower() 138 | 139 | def demo(self): 140 | print("BBCtest: ", self.segment('BbcTest')) 141 | print("choosespain: ", self.segment('choosespain')) 142 | print("speedofart: ", self.segment('speedofart')) 143 | print("smallandinsignificant: ", self.segment('smallandinsignificant')) 144 | 145 | # Segmenter().demo() 146 | -------------------------------------------------------------------------------- /ekphrasis/classes/spellcorrect.py: -------------------------------------------------------------------------------- 1 | import re 2 | from collections import Counter 3 | from difflib import SequenceMatcher 4 | from functools import lru_cache 5 | 6 | from ekphrasis.utils.helpers import read_stats 7 | 8 | REGEX_TOKEN = re.compile(r'\b[a-z]{2,}\b') 9 | 10 | 11 | class SpellCorrector: 12 | """ 13 | The SpellCorrector 
extends the functionality of the Peter Norvig's 14 | spell-corrector in http://norvig.com/spell-correct.html 15 | """ 16 | 17 | def __init__(self, corpus="english"): 18 | """ 19 | 20 | :param corpus: the statistics from which corpus to use for the spell correction. 21 | """ 22 | super().__init__() 23 | self.WORDS = Counter(read_stats(corpus, 1)) 24 | self.N = sum(self.WORDS.values()) 25 | 26 | @staticmethod 27 | def tokens(text): 28 | return REGEX_TOKEN.findall(text.lower()) 29 | 30 | def P(self, word): 31 | """ 32 | Probability of `word`. 33 | """ 34 | return self.WORDS[word] / self.N 35 | 36 | def most_probable(self, words): 37 | _known = self.known(words) 38 | if _known: 39 | return max(_known, key=self.P) 40 | else: 41 | return [] 42 | 43 | @staticmethod 44 | def edit_step(word): 45 | """ 46 | All edits that are one edit away from `word`. 47 | """ 48 | letters = 'abcdefghijklmnopqrstuvwxyz' 49 | splits = [(word[:i], word[i:]) for i in range(len(word) + 1)] 50 | deletes = [L + R[1:] for L, R in splits if R] 51 | transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1] 52 | replaces = [L + c + R[1:] for L, R in splits if R for c in letters] 53 | inserts = [L + c + R for L, R in splits for c in letters] 54 | return set(deletes + transposes + replaces + inserts) 55 | 56 | def edits2(self, word): 57 | """ 58 | All edits that are two edits away from `word`. 59 | """ 60 | return (e2 for e1 in self.edit_step(word) 61 | for e2 in self.edit_step(e1)) 62 | 63 | def known(self, words): 64 | """ 65 | The subset of `words` that appear in the dictionary of WORDS. 66 | """ 67 | return set(w for w in words if w in self.WORDS) 68 | 69 | @staticmethod 70 | def similar(a, b): 71 | return SequenceMatcher(None, a, b).ratio() 72 | 73 | def edit_candidates(self, word, assume_wrong=False, fast=True): 74 | """ 75 | Generate possible spelling corrections for word. 76 | """ 77 | 78 | if fast: 79 | if assume_wrong: 80 | return self.known(self.edit_step(word)) or [word] 81 | else: 82 | return self.known([word]) or self.known(self.edit_step(word)) or [word] 83 | else: 84 | if assume_wrong: 85 | ttt = self.known(self.edit_step(word)) or self.known(self.edits2(word)) or {word} 86 | return ttt 87 | else: 88 | return self.known([word]) or self.known(self.edit_step(word)) or self.known(self.edits2(word)) or [word] 89 | 90 | # def distance_candidates(self, word, max_distance=3): 91 | # """ 92 | # Generate possible spelling corrections for word. 93 | # """ 94 | # candidates = [w for w in self.WORDS if w] 95 | # return self.known([word]) or self.known(self.edit_step(word)) or self.known(self.edits2(word)) or [word] 96 | 97 | @lru_cache(maxsize=65536) 98 | def correct(self, word, assume_wrong=False, fast=False): 99 | """ 100 | Most probable spelling correction for word. 101 | """ 102 | return max(self.edit_candidates(word, assume_wrong=assume_wrong, fast=fast), key=self.P) 103 | 104 | def correct_text(self, text): 105 | """ 106 | Correct all the words within a text, returning the corrected text.""" 107 | 108 | return re.sub('[a-zA-Z]+', self.correct_match, text) 109 | 110 | def correct_match(self, match): 111 | """ 112 | Spell-correct word in match, and preserve proper upper/lower/title case. 113 | """ 114 | 115 | word = match.group() 116 | return self.case_of(word)(self.correct(word.lower())) 117 | 118 | def correct_word(self, word, assume_wrong=False, fast=False): 119 | """ 120 | Spell-correct word in match, and preserve proper upper/lower/title case. 
121 | """ 122 | 123 | return self.case_of(word)(self.correct(word.lower(), assume_wrong=assume_wrong, fast=fast)) 124 | 125 | @staticmethod 126 | def case_of(text): 127 | """ 128 | Return the case-function appropriate for text: upper, lower, title, or just str. 129 | """ 130 | 131 | return (str.upper if text.isupper() else 132 | str.lower if text.islower() else 133 | str.title if text.istitle() else 134 | str) 135 | 136 | def elong_normalized_candidates(self, word, acc=None): 137 | if acc is None: 138 | acc = [] 139 | candidates = [w for w in set(word) if word.count(w) > 1] 140 | for c in candidates: 141 | _w = word.replace(c + c, c) 142 | if _w in acc: 143 | continue 144 | acc.append(_w) 145 | self.elong_normalized_candidates(_w, acc) 146 | return acc + [word] 147 | 148 | def best_elong_candidate(self, word): 149 | candidates = self.elong_normalized_candidates(word) 150 | best = self.most_probable(candidates) 151 | return best or word 152 | 153 | def normalize_elongated(self, word): 154 | return self.case_of(word)(self.best_elong_candidate(word.lower())) 155 | -------------------------------------------------------------------------------- /ekphrasis/classes/tokenizer.py: -------------------------------------------------------------------------------- 1 | import html 2 | import re 3 | 4 | import colorama 5 | from termcolor import colored 6 | 7 | from ekphrasis.classes.exmanager import ExManager 8 | 9 | 10 | class Tokenizer: 11 | social_pipeline = [ 12 | "EMOJI", "URL", "TAG", "EMAIL", "USER", "HASHTAG", 13 | "CASHTAG", "PHONE", "PERCENT", "MONEY", "DATE", "TIME", 14 | "ACRONYM", "LTR_FACE", "RTL_FACE", "CENSORED", "EMPHASIS", 15 | "REST_EMOTICONS", "NUMBER", "WORD", "EASTERN_EMOTICONS", 16 | ] 17 | default_pipeline = social_pipeline 18 | 19 | def __init__(self, pipeline=None, lowercase=False, verbose=False, 20 | debug=False): 21 | """ 22 | Args: 23 | pipeline (list): list of terms to use for tokenization. 24 | Each term, is a key from the dict of regexes `expressions.txt`. 25 | Order matters! 26 | lowercase (bool): set to True in order to lowercase the text 27 | verbose (bool): set to True to print each text after tokenization. 28 | Useful for debugging purposes. 29 | debug (bool): set to True in order to pause after tokenizing 30 | each text (wait for pressing any key). 31 | Useful for debugging purposes, if you want to inspect each text 32 | as is processed. 
33 | """ 34 | self.lowercase = lowercase 35 | self.debug = debug 36 | self.verbose = verbose 37 | colorama.init(autoreset=False, convert=False, strip=False, wrap=True) 38 | 39 | self.pipeline = [] 40 | 41 | self.regexes = ExManager().expressions 42 | 43 | if pipeline is None: 44 | pipeline = self.default_pipeline 45 | 46 | self.build(pipeline) 47 | 48 | self.pipeline.append("(?:\S)") # CATCH ALL remaining terms 49 | self.tok = re.compile(r"({})".format("|".join(self.pipeline))) 50 | 51 | def add_to_pipeline(self, term): 52 | # todo: don't wrap all terms 53 | self.pipeline.append(self.wrap_non_matching(self.regexes[term])) 54 | 55 | def build(self, pipeline): 56 | for term in pipeline: 57 | self.add_to_pipeline(term) 58 | 59 | 60 | @staticmethod 61 | def wrap_non_matching(exp): 62 | return "(?:{})".format(exp) 63 | 64 | def verbose_text(self, text, tokenized): 65 | # print(text.rstrip()) 66 | for term in tokenized: 67 | print(colored(term, 'red', attrs=["underline"]), end=" ") 68 | print() 69 | if self.debug: 70 | input() 71 | else: 72 | print() 73 | 74 | def tokenize(self, text): 75 | escaped = html.unescape(text) 76 | tokenized = self.tok.findall(escaped) 77 | 78 | if self.verbose: 79 | self.verbose_text(text, tokenized) 80 | 81 | if self.lowercase: 82 | tokenized = [t.lower() for t in tokenized] 83 | 84 | return tokenized 85 | 86 | 87 | class SocialTokenizer: 88 | """ 89 | **Deprecated** 90 | 91 | A parametric tokenizer that understands many expression found in natural 92 | language such as hashtags, dates, times, emoticons and much more. 93 | """ 94 | 95 | def __init__(self, lowercase=False, verbose=False, debug=False, **kwargs): 96 | """ 97 | 98 | Args: 99 | lowercase (bool): set to True in order to lowercase the text 100 | verbose (bool): set to True to print each text after tokenization. 101 | Useful for debugging purposes. 102 | debug (bool): set to True in order to pause after tokenizing 103 | each text (wait for pressing any key). 104 | Useful for debugging purposes, if you want to inspect each text 105 | as is processed. 
106 | 107 | Kwargs (): 108 | emojis (bool): True to keep emojis 109 | urls (bool): True to keep urls 110 | tags (bool): True to keep tags: 111 | emails (bool): True to keep emails 112 | users (bool): True to keep users handles: @cbaziotis 113 | hashtags (bool): True to keep hashtags 114 | cashtags (bool): True to keep cashtags 115 | phones (bool): True to keep phones 116 | percents (bool): True to keep percents 117 | money (bool): True to keep money expressions 118 | date (bool): True to keep date expressions 119 | time (bool): True to keep time expressions 120 | acronyms (bool): True to keep acronyms 121 | emoticons (bool): True to keep emoticons 122 | censored (bool): True to keep censored words: f**k 123 | emphasis (bool): True to keep words with emphasis: *very* good 124 | numbers (bool): True to keep numbers 125 | """ 126 | 127 | self.lowercase = lowercase 128 | self.debug = debug 129 | self.verbose = verbose 130 | colorama.init(autoreset=False, convert=False, strip=False, wrap=True) 131 | pipeline = [] 132 | self.regexes = ExManager().expressions 133 | 134 | emojis = kwargs.get("emojis", True) 135 | urls = kwargs.get("urls", True) 136 | tags = kwargs.get("tags", True) 137 | emails = kwargs.get("emails", True) 138 | users = kwargs.get("users", True) 139 | hashtags = kwargs.get("hashtags", True) 140 | cashtags = kwargs.get("cashtags", True) 141 | phones = kwargs.get("phones", True) 142 | percents = kwargs.get("percents", True) 143 | money = kwargs.get("money", True) 144 | date = kwargs.get("date", True) 145 | time = kwargs.get("time", True) 146 | acronyms = kwargs.get("acronyms", True) 147 | emoticons = kwargs.get("emoticons", True) 148 | censored = kwargs.get("censored", True) 149 | emphasis = kwargs.get("emphasis", True) 150 | numbers = kwargs.get("numbers", True) 151 | 152 | if urls: 153 | pipeline.append(self.regexes["URL"]) 154 | 155 | if tags: 156 | pipeline.append(self.regexes["TAG"]) 157 | 158 | if emails: 159 | pipeline.append(self.wrap_non_matching(self.regexes["EMAIL"])) 160 | 161 | if users: 162 | pipeline.append(self.wrap_non_matching(self.regexes["USER"])) 163 | 164 | if hashtags: 165 | pipeline.append(self.wrap_non_matching(self.regexes["HASHTAG"])) 166 | 167 | if cashtags: 168 | pipeline.append(self.wrap_non_matching(self.regexes["CASHTAG"])) 169 | 170 | if phones: 171 | pipeline.append(self.wrap_non_matching(self.regexes["PHONE"])) 172 | 173 | if percents: 174 | pipeline.append(self.wrap_non_matching(self.regexes["PERCENT"])) 175 | 176 | if money: 177 | pipeline.append(self.wrap_non_matching(self.regexes["MONEY"])) 178 | 179 | if date: 180 | pipeline.append(self.wrap_non_matching(self.regexes["DATE"])) 181 | 182 | if time: 183 | pipeline.append(self.wrap_non_matching(self.regexes["TIME"])) 184 | 185 | if acronyms: 186 | pipeline.append(self.wrap_non_matching(self.regexes["ACRONYM"])) 187 | 188 | if emoticons: 189 | pipeline.append(self.regexes["LTR_FACE"]) 190 | pipeline.append(self.regexes["RTL_FACE"]) 191 | 192 | if censored: 193 | pipeline.append(self.wrap_non_matching(self.regexes["CENSORED"])) 194 | 195 | if emphasis: 196 | pipeline.append(self.wrap_non_matching(self.regexes["EMPHASIS"])) 197 | 198 | # terms like 'eco-friendly', 'go_to', 'john's' - maybe remove the ' or add a parameter for it 199 | # pipeline.append(r"(?:\b[a-zA-Z]+[a-zA-Z'\-_]+[a-zA-Z]+\b)") 200 | 201 | # <3 ^5 202 | if emoticons: 203 | pipeline.append( 204 | self.wrap_non_matching(self.regexes["REST_EMOTICONS"])) 205 | 206 | if numbers: 207 | pipeline.append(self.regexes["NUMBER"]) 208 | 209 | 
if emojis: 210 | pipeline.append(self.regexes["EMOJI"]) 211 | 212 | # any other word 213 | pipeline.append(self.regexes["WORD"]) 214 | 215 | # EASTERN EMOTICONS - (^_^;) (>_<)> \(^o^)/ 216 | if emoticons: 217 | pipeline.append( 218 | self.wrap_non_matching(self.regexes["EASTERN_EMOTICONS"])) 219 | 220 | # keep repeated puncts as one term 221 | # pipeline.append(r"") 222 | 223 | pipeline.append("(?:\S)") # CATCH ALL remaining terms 224 | 225 | self.tok = re.compile(r"({})".format("|".join(pipeline))) 226 | 227 | @staticmethod 228 | def wrap_non_matching(exp): 229 | return "(?:{})".format(exp) 230 | 231 | def verbose_text(self, text, tokenized): 232 | # print(text.rstrip()) 233 | for term in tokenized: 234 | print(colored(term, 'red', attrs=["underline"]), end=" ") 235 | print() 236 | if self.debug: 237 | input() 238 | else: 239 | print() 240 | 241 | def tokenize(self, text): 242 | escaped = html.unescape(text) 243 | tokenized = self.tok.findall(escaped) 244 | 245 | if self.verbose: 246 | self.verbose_text(text, tokenized) 247 | 248 | if self.lowercase: 249 | tokenized = [t.lower() for t in tokenized] 250 | 251 | return tokenized 252 | 253 | # sentences = [] 254 | 255 | # [print(s) for s in sentences] 256 | # tokenizer = SocialTokenizer(debug=True, verbose=True) 257 | # 258 | # for s in sentences: 259 | # tokenizer.tokenize(s) 260 | -------------------------------------------------------------------------------- /ekphrasis/dicts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbaziotis/ekphrasis/309b6b089bb1ebaed705ba9ffa584f1826e296d4/ekphrasis/dicts/__init__.py -------------------------------------------------------------------------------- /ekphrasis/dicts/emoticons.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | # todo:catch repeating parenthesis 4 | emoticons = { 5 | ':*': '', 6 | ':-*': '', 7 | ':x': '', 8 | ':-)': '', 9 | ':-))': '', 10 | ':-)))': '', 11 | ':-))))': '', 12 | ':-)))))': '', 13 | ':-))))))': '', 14 | ':)': '', 15 | ':))': '', 16 | ':)))': '', 17 | ':))))': '', 18 | ':)))))': '', 19 | ':))))))': '', 20 | ':)))))))': '', 21 | ':o)': '', 22 | ':]': '', 23 | ':3': '', 24 | ':c)': '', 25 | ':>': '', 26 | '=]': '', 27 | '8)': '', 28 | '=)': '', 29 | ':}': '', 30 | ':^)': '', 31 | '|;-)': '', 32 | ":'-)": '', 33 | ":')": '', 34 | '\o/': '', 35 | '*\\0/*': '', 36 | ':-D': '', 37 | ':D': '', 38 | # '(\':': '', 39 | '8-D': '', 40 | '8D': '', 41 | 'x-D': '', 42 | 'xD': '', 43 | 'X-D': '', 44 | 'XD': '', 45 | '=-D': '', 46 | '=D': '', 47 | '=-3': '', 48 | '=3': '', 49 | 'B^D': '', 50 | '>:[': '', 51 | ':-(': '', 52 | ':-((': '', 53 | ':-(((': '', 54 | ':-((((': '', 55 | ':-(((((': '', 56 | ':-((((((': '', 57 | ':-(((((((': '', 58 | ':(': '', 59 | ':((': '', 60 | ':(((': '', 61 | ':((((': '', 62 | ':(((((': '', 63 | ':((((((': '', 64 | ':(((((((': '', 65 | ':((((((((': '', 66 | ':-c': '', 67 | ':c': '', 68 | ':-<': '', 69 | ':<': '', 70 | ':-[': '', 71 | ':[': '', 72 | ':{': '', 73 | ':-||': '', 74 | ':@': '', 75 | ":'-(": '', 76 | ":'(": '', 77 | 'D:<': '', 78 | 'D:': '', 79 | 'D8': '', 80 | 'D;': '', 81 | 'D=': '', 82 | 'DX': '', 83 | 'v.v': '', 84 | "D-':": '', 85 | '(>_<)': '', 86 | ':|': '', 87 | '>:O': '', 88 | ':-O': '', 89 | ':-o': '', 90 | ':O': '', 91 | '°o°': '', 92 | 'o_O': '', 93 | 'o_0': '', 94 | 'o.O': '', 95 | 'o-o': '', 96 | '8-0': '', 97 | '|-O': '', 98 | ';-)': '', 99 | ';)': '', 100 | '*-)': '', 101 | '*)': '', 102 | ';-]': 
'', 103 | ';]': '', 104 | ';D': '', 105 | ';^)': '', 106 | ':-,': '', 107 | '>:P': '', 108 | ':-P': '', 109 | ':P': '', 110 | 'X-P': '', 111 | 'x-p': '', 112 | 'xp': '', 113 | 'XP': '', 114 | ':-p': '', 115 | ':p': '', 116 | '=p': '', 117 | ':-Þ': '', 118 | ':Þ': '', 119 | ':-b': '', 120 | ':b': '', 121 | ':-&': '', 122 | '>:\\': '', 123 | '>:/': '', 124 | ':-/': '', 125 | ':-.': '', 126 | ':/': '', 127 | ':\\': '', 128 | '=/': '', 129 | '=\\': '', 130 | ':L': '', 131 | '=L': '', 132 | ':S': '', 133 | '>.<': '', 134 | ':-|': '', 135 | '<:-|': '', 136 | ':-X': '', 137 | ':X': '', 138 | ':-#': '', 139 | ':#': '', 140 | 'O:-)': '', 141 | '0:-3': '', 142 | '0:3': '', 143 | '0:-)': '', 144 | '0:)': '', 145 | '0;^)': '', 146 | '>:)': '', 147 | '>:D': '', 148 | '>:-D': '', 149 | '>;)': '', 150 | '>:-)': '', 151 | '}:-)': '', 152 | '}:)': '', 153 | '3:-)': '', 154 | '3:)': '', 155 | 'o/\o': '', 156 | '^5': '', 157 | '>_>^': '', 158 | '^<_<': '', # todo:fix tokenizer - MISSES THIS 159 | '<3': '' 160 | } 161 | 162 | # todo: clear this mess 163 | pattern = re.compile("^[:=\*\-\(\)\[\]x0oO\#\<\>8\\.\'|\{\}\@]+$") 164 | mirror_emoticons = {} 165 | for exp, tag in emoticons.items(): 166 | if pattern.match(exp) \ 167 | and any(ext in exp for ext in [";", ":", "="]) \ 168 | and not any(ext in exp for ext in ["L", "D", "p", "P", "3"]): 169 | mirror = exp[::-1] 170 | 171 | if "{" in mirror: 172 | mirror = mirror.replace("{", "}") 173 | elif "}" in mirror: 174 | mirror = mirror.replace("}", "{") 175 | 176 | if "(" in mirror: 177 | mirror = mirror.replace("(", ")") 178 | elif ")" in mirror: 179 | mirror = mirror.replace(")", "(") 180 | 181 | if "<" in mirror: 182 | mirror = mirror.replace("<", ">") 183 | elif ">" in mirror: 184 | mirror = mirror.replace(">", "<") 185 | 186 | if "[" in mirror: 187 | mirror = mirror.replace("[", "]") 188 | elif "]" in mirror: 189 | mirror = mirror.replace("]", "[") 190 | 191 | if "\\" in mirror: 192 | mirror = mirror.replace("\\", "/") 193 | elif "/" in mirror: 194 | mirror = mirror.replace("/", "\\") 195 | 196 | # print(exp + "\t\t" + mirror) 197 | mirror_emoticons[mirror] = tag 198 | emoticons.update(mirror_emoticons) 199 | 200 | for exp, tag in list(emoticons.items()): 201 | if exp.lower() not in emoticons: 202 | emoticons[exp.lower()] = tag 203 | 204 | emoticon_groups = { 205 | "positive": {'', '', '', ''}, 206 | "negative": {'', '', } 207 | } 208 | 209 | 210 | def print_positive(sentiment): 211 | for e, tag in emoticons.items(): 212 | if tag in emoticon_groups[sentiment]: 213 | print(e) 214 | 215 | # print_positive("negative") 216 | # print(" ".join(list(emoticons.keys()))) 217 | # [print(e) for e in list(emoticons.keys())] 218 | -------------------------------------------------------------------------------- /ekphrasis/dicts/noslang/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbaziotis/ekphrasis/309b6b089bb1ebaed705ba9ffa584f1826e296d4/ekphrasis/dicts/noslang/__init__.py -------------------------------------------------------------------------------- /ekphrasis/dicts/noslang/manager.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pickle 4 | 5 | 6 | def read_slangdict(): 7 | filename = os.path.join(os.path.dirname(__file__), "slangdict.pickle") 8 | if os.path.isfile(filename): 9 | print("Reading data...") 10 | data = pickle.load(open(filename, 'rb')) 11 | return data 12 | 
-------------------------------------------------------------------------------- /ekphrasis/dicts/noslang/slangdict.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbaziotis/ekphrasis/309b6b089bb1ebaed705ba9ffa584f1826e296d4/ekphrasis/dicts/noslang/slangdict.pickle -------------------------------------------------------------------------------- /ekphrasis/dicts/sentiment/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbaziotis/ekphrasis/309b6b089bb1ebaed705ba9ffa584f1826e296d4/ekphrasis/dicts/sentiment/__init__.py -------------------------------------------------------------------------------- /ekphrasis/dicts/sentiment/nrc_emolex/NRCEmolex.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | import pickle 4 | 5 | ''' 6 | NRC Word-Emotion Association Lexicon (aka EmoLex) (14.000 entries) 7 | -------------------------------------- 8 | format = dictionary with entries like this: 9 | word1={'negative': 0.0, 'positive': 1.0, 'surprise': 0.0, 'trust': 0.0, 'joy': 1.0, 'fear': 0.0, 'anticipation': 0.0, 'sadness': 0.0, 'anger': 0.0, 'disgust': 0.0} 10 | ''' 11 | 12 | 13 | class NRCEmolex: 14 | def __init__(self): 15 | super().__init__() 16 | self.raw_filename = "NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt" 17 | self.parsed_filename = "emolex.pickle" 18 | 19 | def write(self): 20 | if os.path.exists( 21 | os.path.join(os.path.dirname(__file__), self.raw_filename)): 22 | with open( 23 | os.path.join(os.path.dirname(__file__), self.raw_filename), 24 | "r") as f: 25 | reader = csv.reader(f, delimiter="\t") 26 | reader = list(reader) 27 | lexicon = {} 28 | for row in reader: 29 | # lexicon[row[0]][row[1]] = float(row[2]) 30 | lexicon.setdefault(row[0], {})[row[1]] = float(row[2]) 31 | 32 | for k, v in lexicon.items(): 33 | polarity = 0 34 | if lexicon[k]["positive"]: 35 | polarity = 1 36 | elif lexicon[k]["negative"]: 37 | polarity = -1 38 | lexicon[k]["polarity"] = polarity 39 | 40 | lexicon[k]["emotions"] = [v['fear'], v['sadness'], 41 | v['trust'], v['disgust'], 42 | v['surprise'], 43 | v['anger'], v['joy'], 44 | v['anticipation']] 45 | 46 | with open(self.parsed_filename, 'wb') as pickle_file: 47 | pickle.dump(lexicon, pickle_file) 48 | else: 49 | print("input file not found!") 50 | 51 | def read(self): 52 | if os.path.exists( 53 | os.path.join(os.path.dirname(__file__), self.parsed_filename)): 54 | with open(os.path.join(os.path.dirname(__file__), 55 | self.parsed_filename), 'rb') as f: 56 | data = pickle.load(f) 57 | return data 58 | else: 59 | self.write() 60 | return self.read() 61 | 62 | # NRCEmolex().write() 63 | # NRCEmolex().read() 64 | -------------------------------------------------------------------------------- /ekphrasis/dicts/sentiment/nrc_emolex/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbaziotis/ekphrasis/309b6b089bb1ebaed705ba9ffa584f1826e296d4/ekphrasis/dicts/sentiment/nrc_emolex/__init__.py -------------------------------------------------------------------------------- /ekphrasis/examples/SentReadMe.md: -------------------------------------------------------------------------------- 1 | ## Sentiment Analysis Example 2 | 3 | ```python 4 | from ekphrasis.classes.preprocessor import TextPreProcessor 5 | from ekphrasis.classes.tokenizer import SocialTokenizer 6 | from 
ekphrasis.utils.nlp import polarity 7 | 8 | sentences = [ 9 | "So there is no way for me to plug it in here in the US unless I go by a converter.", 10 | "Good case, Excellent value.", 11 | "Works great!", 12 | 'The design is very odd, as the ear "clip" is not very comfortable at all.', 13 | "Needless to say, I wasted my money." 14 | ] 15 | 16 | # define preprocessing pipeline 17 | text_processor = TextPreProcessor( 18 | fix_text=True, 19 | unpack_contractions=True, 20 | tokenizer=SocialTokenizer(lowercase=True).tokenize, 21 | ) 22 | 23 | # pass each sentence through the pipeline 24 | tokenized_sentences = list(text_processor.pre_process_docs(sentences)) 25 | for sent in tokenized_sentences: 26 | _polarity, _scores = polarity(sent) 27 | print("{:.4f}\t".format(_polarity) + " ".join(sent)) 28 | ``` 29 | 30 | #### Output 31 | 32 | ```shell 33 | 0.0139 so there is no way for me to plug it in here in the us unless i go by a converter . 34 | 0.3750 good case , excellent value . 35 | 0.0000 works great ! 36 | 0.0500 the design is very odd , as the ear " clip " is not very comfortable at all . 37 | 0.0500 needless to say , i wasted my money . 38 | ``` -------------------------------------------------------------------------------- /ekphrasis/examples/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created by Christos Baziotis. 3 | """ 4 | -------------------------------------------------------------------------------- /ekphrasis/examples/demo_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created by Christos Baziotis. 3 | """ 4 | 5 | demo_sents = [ 6 | "CANT WAIT for the new season of #TwinPeaks \(^o^)/ yaaaay!!! #davidlynch #tvseries :)))", 7 | "I saw the new #johndoe movie and it suuuuucks!!! WAISTED $10... #badmovies >3:/", 8 | "@SentimentSymp: can't wait for the Nov 9 #Sentiment talks! YAAAAAAY !!! >:-D http://sentimentsymposium.com/.", 9 | ] 10 | -------------------------------------------------------------------------------- /ekphrasis/examples/demo_ext.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created by Christos Baziotis. 3 | """ 4 | from ekphrasis.classes.tokenizer import SocialTokenizer 5 | 6 | 7 | social_tokenizer = SocialTokenizer(lowercase=False).tokenize 8 | 9 | sents = [ 10 | "CANT WAIT for the new season of #TwinPeaks \(^o^)/ yaaaay!!! #davidlynch #tvseries :)))", 11 | "I saw the new #johndoe movie and it suuuuucks!!! WAISTED $10... #badmovies 3:/", 12 | "@SentimentSymp: can't wait for the Nov 9 #Sentiment talks! YAAAAAAY !!! 
>:-D http://sentimentsymposium.com/.", 13 | ] 14 | 15 | for s in sents: 16 | print("SC : ", social_tokenizer(s)) # social tokenizer 17 | -------------------------------------------------------------------------------- /ekphrasis/examples/demo_segmenter.py: -------------------------------------------------------------------------------- 1 | from ekphrasis.classes.segmenter import Segmenter 2 | 3 | # segmenter using the word statistics from english Wikipedia 4 | seg_eng = Segmenter(corpus="english") 5 | 6 | # segmenter using the word statistics from Twitter 7 | seg_tw = Segmenter(corpus="twitter") 8 | 9 | # segmenter using the word statistics from Twitter 10 | seg_tw_2018 = Segmenter(corpus="twitter_2018") 11 | 12 | words = ["exponentialbackoff", "gamedev", "retrogaming", "thewatercooler", 13 | "panpsychism"] 14 | for w in words: 15 | print(w) 16 | print("(eng):", seg_eng.segment(w)) 17 | print("(tw):", seg_tw.segment(w)) 18 | print("(tw):", seg_tw_2018.segment(w)) 19 | print() 20 | -------------------------------------------------------------------------------- /ekphrasis/examples/demo_tok.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created by Christos Baziotis. 3 | """ 4 | import nltk 5 | 6 | from ekphrasis.classes.tokenizer import SocialTokenizer, Tokenizer 7 | 8 | 9 | def wsp_tokenizer(text): 10 | return text.split(" ") 11 | 12 | 13 | puncttok = nltk.WordPunctTokenizer().tokenize 14 | 15 | social_tokenizer = SocialTokenizer(lowercase=False).tokenize 16 | mytokenizer = Tokenizer(lowercase=False).tokenize 17 | 18 | sents = [ 19 | # "CANT WAIT for the new season of #TwinPeaks \(^o^)/ yaaaay!!! #davidlynch #tvseries :)))", 20 | # "@Calum5SOS You lil *poop* please follow @EmilyBain224 ☺️💕", 21 | # "I saw the new #johndoe movie and it suuuuucks!!! WAISTED $10... #badmovies 3:/", 22 | # "@SentimentSymp: can't wait for the Nov 9 #Sentiment talks! YAAAAAAY !!! >:-D http://sentimentsymposium.com/.", 23 | # "Words attendees would use to describe @prosper4africa's #ALN 2015! https://t.co/hmNm8AdwOh", 24 | "@TheTideDrew Hi, Drew! I can't wait to see you!☺ Just letting you know that you'll always be my spidey, I love you!💕 Mind following me? 
x215", 25 | ] 26 | 27 | for s in sents: 28 | print() 29 | # print("ORG: ", s) # original sentence 30 | # print("WSP : ", wsp_tokenizer(s)) # whitespace tokenizer 31 | # print("WPU : ", puncttok(s)) # WordPunct tokenizer 32 | print("SC : ", social_tokenizer(s)) # social tokenizer 33 | # print("SC : ", mytokenizer(s)) # social tokenizer 34 | -------------------------------------------------------------------------------- /ekphrasis/examples/example.py: -------------------------------------------------------------------------------- 1 | from ekphrasis.classes.preprocessor import TextPreProcessor 2 | from ekphrasis.classes.tokenizer import SocialTokenizer 3 | from ekphrasis.dicts.emoticons import emoticons 4 | 5 | 6 | def ws_tokenizer(text): 7 | return text.split() 8 | 9 | 10 | text_processor = TextPreProcessor( 11 | normalize=['url', 'email', 'percent', 'money', 'phone', 'user', 'time', 12 | 'date', 'number'], 13 | annotate={"hashtag", "elongated", "allcaps", "repeated", 'emphasis', 14 | 'censored'}, 15 | all_caps_tag="wrap", 16 | fix_text=True, 17 | segmenter="twitter_2018", 18 | corrector="twitter_2018", 19 | unpack_hashtags=True, 20 | unpack_contractions=True, 21 | spell_correct_elong=False, 22 | tokenizer=SocialTokenizer(lowercase=True).tokenize, 23 | # tokenizer=ws_tokenizer, 24 | dicts=[emoticons] 25 | ) 26 | 27 | sentences = [ 28 | "CANT WAIT for the new season of #TwinPeaks \(^o^)/!!! #davidlynch #tvseries :))) ", 29 | "I saw the new #johndoe movie and it suuuuucks!!! WAISTED $10... #badmovies :/", 30 | "I saw the new #JOHNDOE movie AND IT SUCKS!!! WAISTED $10... #badmovies :/", 31 | "@SentimentSymp: can't wait for the Nov 9 #Sentiment talks! YAAAAAAY !!! :-D http://sentimentsymposium.com/.", 32 | "Thanks x https://t.co/ZXTcDLyDS9", 33 | "@Calum5SOS You lil poop please follow @EmilyBain224 ☺️💕", 34 | "Words attendees would use to describe @prosper4africa's #ALN2015! 
https://t.co/hmNm8AdwOh", 35 | ] 36 | 37 | for s in sentences: 38 | print() 39 | print(s) 40 | print(" ".join(text_processor.pre_process_doc(s))) 41 | -------------------------------------------------------------------------------- /ekphrasis/examples/segmenter_diffs.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbaziotis/ekphrasis/309b6b089bb1ebaed705ba9ffa584f1826e296d4/ekphrasis/examples/segmenter_diffs.pickle -------------------------------------------------------------------------------- /ekphrasis/examples/segmenter_diffs.txt: -------------------------------------------------------------------------------- 1 | tags english twitter 2 | 0 #mixitup mixitup mix it up 3 | 1 #traderlivetweet2015 trader live tweet2015 trader live tweet 2015 4 | 2 #dsma dsma ds ma 5 | 3 #catfished catfish ed catfished 6 | 4 #kpopers kpop ers kpopers 7 | 5 #vmabert vma bert vma be rt 8 | 6 #weeeeeee we eeeeee weeeeeee 9 | 7 #smwldn smw ldn sm wldn 10 | 8 #drinkalittledrink drinka little drink drink alittle drink 11 | 9 #hibeees hi be ees hibee es 12 | 10 #soccernews soccer news soccernews 13 | 11 #amavi am avi amavi 14 | 12 #socinn soc inn so cinn 15 | 13 #linearlagebra linear la gebra linear lage bra 16 | 14 #iahsfb iahs fb ia hs fb 17 | 15 #ullukapatha ul luka patha ullu ka pa tha 18 | 16 #ootd oo td ootd 19 | 17 #gooners go on ers gooners 20 | 18 #afcbcst afc bc st afcb cst 21 | 19 #pascon pasc on pas con 22 | 20 #rockonrockall rock on rockall rock on rock all 23 | 21 #may4thbewithyou may 4thbe with you may4thbe with you 24 | 22 #tripofalifetime trip of alife time trip ofa lifetime 25 | 23 #vscogrid vs co grid vsco grid 26 | 24 #thewatercooler the water cooler the watercooler 27 | 25 #wewillmakeadate we will makea date we will make adate 28 | 26 #ccot cc ot ccot 29 | 27 #sqlsaturday sql saturday sqlsaturday 30 | 28 #bbuk bb uk bbuk 31 | 29 #wawaw wa waw wawaw 32 | 30 #btowneats bt own eats btown eats 33 | 31 #suvelo su velo suv elo 34 | 32 #ooohkillem oooh kill em oooh killem 35 | 33 #ifollowback if ol low back ifollowback 36 | 34 #tvtag tv tag tvtag 37 | 35 #yall ya ll yall 38 | 36 #ijustnerdedsohardmyglassesbroke ijustnerdedsohard my glasses broke ijust nerded so hard my glasses broke 39 | 37 #doingjusticeishine doing justice ishine doing justice is hine 40 | 38 #iwish iwi sh iwish 41 | 39 #scfafootball scfa football sc fa football 42 | 40 #teuchters te uchte rs teuchters 43 | 41 #donda don da donda 44 | 42 #itsgonnabeagoodday its gonna bea goodday its gonna be agood day 45 | 43 #applewatch applewatch apple watch 46 | 44 #bantemweightchampuonship ban tem weight champu on ship ban tem weight champ uon ship 47 | 45 #itoa itoa it oa 48 | 46 #buzzin buzz in buzzin 49 | 47 #bringbacksungwoo bring back sung woo bring back sungwoo 50 | 48 #jerzday jerz day jerzday 51 | 49 #bluemix blue mix bluemix 52 | 50 #strikeforce strikeforce strike force 53 | 51 #oomf oo mf oomf 54 | 52 #webradio web radio webradio 55 | 53 #goggleeyedhomonculus goggle eyed homonculus goggle eyed ho mon cul us 56 | 54 #risolutore risoluto re riso luto re 57 | 55 #likeaboss like abo ss likeaboss 58 | 56 #ffshoes ff shoes ffs hoes 59 | 57 #bhramabull bhrama bull bh rama bull 60 | 58 #vsco vs co vsco 61 | 59 #sockersweek sockers week so ckers week 62 | 60 #bbcupdat bbc up dat bbc updat 63 | 61 #vhappy vh appy v happy 64 | 62 #fridayfunday friday fun day friday funday 65 | 63 #indveng ind veng in dv eng 66 | 64 #bdayvaycay bdayvaycay bday vaycay 67 | 65 #vscoedit vs co 
edit vsco edit 68 | 66 #miss2012 miss2012 miss 2012 69 | 67 #cantsingforshit can tsing for shit cant sing for shit 70 | 68 #missyallboyz missy all boyz miss yall boyz 71 | 69 #chessdom chessdom chess dom 72 | 70 #retrogaming retrogaming retro gaming 73 | 71 #lowkey low key lowkey 74 | 72 #wwfc wwfc ww fc 75 | 73 #rqwn rqwn rq wn 76 | 74 #aumrsch aum rsch au mrs ch 77 | 75 #2012trip 2012trip 2012 trip 78 | 76 #latechsuckd la tech su ckd la tech suckd 79 | 77 #chsocm ch so cm chs ocm 80 | 78 #sedc sedc se dc 81 | 79 #guardiannews guardian news guardiannews 82 | 80 #whby whby wh by 83 | 81 #xc-dsign xc dsign xc d sign 84 | 82 #oolegooo oo leg ooo oo legooo 85 | 83 #brockuproblems brocku problems broc ku problems 86 | 84 #iamadad iam ad ad iama dad 87 | 85 #gojetsgo go jetsgo go jets go 88 | 86 #oann oa nn oann 89 | 87 #straya st raya straya 90 | 88 #sarurdaykitchen sar ur day kitchen sarurday kitchen 91 | 89 #vscoitaly vs co italy vsco italy 92 | 90 #motog mot og motog 93 | 91 #topoli topoli to poli 94 | 92 #destinythegame destiny the game destinythegame 95 | 93 #vegvegveg vegvegveg veg veg veg 96 | 94 #samsung-note samsung note samsung note 97 | 95 #loveyouuuuu love you uu uu love youuuuu 98 | 96 #bbcqt bbc qt bbcqt 99 | 97 #jellybean jellybean jelly bean 100 | 98 #anglamigeh angla mig eh ang lamig eh 101 | 99 #rekt re kt rekt 102 | 100 #ciscos cis cos ciscos 103 | 101 #socringeworthy so cringe worthy so cringeworthy 104 | 102 #flumplover flum plover flump lover 105 | 103 #iwishitwerebetter iwi shit were better iwish it were better 106 | 104 #asematy asema ty as ema ty 107 | 105 #acwsgothenburg acws gothenburg ac ws gothenburg 108 | 106 #nyfw ny fw nyfw 109 | 107 #48hrsnosleep 48hrs nosleep 48hrs no sleep 110 | 108 #notenoughhoursinaday not enough hours in aday not enough hours ina day 111 | 109 #vansonmans van son mans vans on mans 112 | 110 #perksofbeingawallflower perks of being aw all flower perks of being awall flower 113 | 111 #oooosh oo oo sh oooosh 114 | 112 #awaydays awaydays away days 115 | 113 #scenez sc enez scen ez 116 | 114 #smdh sm dh smdh 117 | 115 #kanye2020 kanye2020 kanye 2020 118 | 116 #imissmybrother im iss my brother imiss my brother 119 | 117 #damnnnnnn dam nnn nnn damnnnnnn 120 | 118 #starbuzz star buzz starbuzz 121 | 119 #profgetcrunk prof getcrunk prof get crunk 122 | 120 #deflategate deflate gate deflategate 123 | 121 #bestlunche best lun che best lunc he 124 | 122 #idid id id idid 125 | 123 #suchakid such akid sucha kid 126 | 124 #meetthepress meet the press meetthepress 127 | 125 #stillkidrauhl still kid ra uhl still kidrauhl 128 | 126 #appletv3 apple tv3 apple tv 3 129 | 127 #justgirlythings just girly things justgirlythings 130 | 128 #skimmlife ski mm life skimm life 131 | 129 #samac samac sa mac 132 | 130 #bizitalk bizi talk bizitalk 133 | 131 #tunewiki tune wiki tunewiki 134 | 132 #kamcord kam cord kamcord 135 | 133 #coybig coy big coybig 136 | 134 #ecodesign ecodesign eco design 137 | 135 #dastal das tal da stal 138 | 136 #fitn fi tn fitn 139 | 137 #ffxiv ff xiv ffxiv 140 | 138 #ladsontour ladson tour lads on tour 141 | 139 #netezza netezza net ezza 142 | 140 #quatchi quatchi quat chi 143 | 141 #adultshit adults hit adult shit 144 | 142 #hamont hamont ha mont 145 | 143 #aggeliesergasias agge lies ergasias aggeliesergasias 146 | 144 #blackbirdgang blackbird gang black bird gang 147 | 145 #whatabadass what abad ass whata badass 148 | 146 #igers ige rs igers 149 | 147 #noonecares noone cares no one cares 150 | 148 #teamfollowback team follow back teamfollowback 
151 | 149 #getnthefucoutofhere get nthe fuc out of here getn the fuc out of here 152 | 150 #vscocam vs co cam vsco cam 153 | 151 #jbiebs jbi ebs jbiebs 154 | 152 #dragoncon dragon con dragoncon 155 | 153 #ihateschool iha te school ihate school 156 | 154 #kubball kubb all ku bball 157 | 155 #fuckuover fuc ku over fucku over 158 | 156 #auspol aus pol auspol 159 | 157 #pimpnjoy pim pn joy pimp njoy 160 | 158 #miaafb mia afb miaa fb 161 | 159 #indianamensbasetball indiana mens baset ball indiana mens base tball 162 | 160 #ahhhhhh ahhhh hh ahhhhhh 163 | 161 #catherinebelll catherine bel ll catherine belll 164 | 162 #nacamam nac am am na cam am 165 | 163 #socent soc ent so cent 166 | 164 #ladieshereicome ladies he rei come ladies here icome 167 | 165 #bango bango ban go 168 | 166 #euref euref eu ref 169 | 167 #1005chunjiday 1005chunjiday 1005chunji day 170 | 168 #tweetpic66 tweetpic66 tweet pic 66 171 | 169 #uncareingworld un care ing world un careing world 172 | 170 #tlot tlot tl ot 173 | 171 #socbiz soc biz so cbiz 174 | 172 #juvederm juve derm juvederm 175 | 173 #makeamove makea move make amo ve 176 | 174 #wordtomymuva word to my mu va word to my muva 177 | 175 #knickstape knicks tape knickstape 178 | 176 #whatapair what ap air whata pair 179 | 177 #fmlll fm lll fmlll 180 | 178 #tcot tc ot tcot 181 | 179 #shepherdshut shepherd shut shepherds hut 182 | 180 #gapol ga pol gap ol 183 | 181 #thinkaboutitnobhead think about it nob head think about it nobhead 184 | 182 #novaturient no vaturi ent nov at uri ent 185 | 183 #treatyoself treat yo self treat yoself 186 | 184 #tville tv ille tville 187 | 185 #speedoflight speedoflight speed of light 188 | 186 #ecigs eci gs ecigs 189 | 187 #delange9 de lange 9 delange9 190 | 188 #chavbants cha vb ants chav bants 191 | 189 #bountygate bounty gate bountygate 192 | 190 #worldie world ie worldie 193 | 191 #gacky ga cky gac ky 194 | 192 #looooser lo ooo ser looooser 195 | 193 #goodday goodday good day 196 | 194 #ivaluemylife iv alue my life iva lue my life 197 | 195 #trndnl trn dnl trnd nl 198 | 196 #nobrollies nob rollies no brollies 199 | 197 #ultimatfighterfridays ulti mat fighter fridays ultimat fighter fridays 200 | 198 #istillloveeastview is till love eastview istill love eastview 201 | 199 #str8likedat str8likedat str8 like dat 202 | 200 #freeghoncheh free ghon cheh free ghoncheh 203 | 201 #dbacks db acks dbacks 204 | 202 #lallysmarine la llys marine lal lys marine 205 | 203 #cmon cm on cmon 206 | 204 #dohh do hh dohh 207 | 205 #brexit br exit brexit 208 | 206 #wahh wa hh wahh 209 | 207 #bbcnewsline bbc newsline bbc news line 210 | 208 #presstitutes press tit utes presstitutes 211 | 209 #illbringabottlenexttime ill brin ga bottle next time ill bring abott le next time 212 | 210 #byeeeee bye eeee byeeeee 213 | 211 #mumfords mum fords mumfords 214 | 212 #ihope iho pe ihope 215 | 213 #kingturnezbet king turn ezbet king turn ez bet 216 | 214 #pray4me pray4me pray4 me 217 | 215 #trump2016 trump2016 trump 2016 218 | 216 #unforgetable un forget able unforgetable 219 | 217 #iwontbesocial iwo nt be social iwont be social 220 | 218 #godeacs gode acs go deacs 221 | 219 #loverboy loverboy lover boy 222 | 220 #amazeballs amaze balls amazeballs 223 | 221 #adorbs ad orbs adorbs 224 | 222 #ishouldjustwatchthat is hould just watch that ishould just watch that 225 | 223 #bruuh bru uh bruuh 226 | 224 #unionjfollowme unio nj follow me unionj follow me 227 | 225 #nekroman nekroman nek roman 228 | 226 #bronnygate bron ny gate bronny gate 229 | 227 #dcfcfans dc fc fans dcfc 
fans 230 | 228 #onehellofanighttour one hello fa night tour one hell of an ight tour 231 | 229 #bentleyvolleyballl bentley volleyball l bentley volley balll 232 | 230 #awaydaysonly awaydays only away days only 233 | 231 #carly2016 carly2016 carly 2016 234 | 232 #lituania lituania lit ua nia 235 | 233 #saddos sad dos saddos 236 | 234 #thunde thun de thunde 237 | 235 #throwawaydogs throwaway dogs throw away dogs 238 | 236 #lambily lamb ily lambily 239 | 237 #cyber1news cyber1 news cyber 1 news 240 | 238 #sorrynotsorry sorry not sorry sorrynotsorry 241 | 239 #nextime nex time nextime 242 | 240 #iusocc ius occ iu socc 243 | 241 #motwyw motwyw motw yw 244 | 242 #dressinglikeaslutcomeswithaprice dressing like as lut comes with ap rice dressing like asl ut comes wit ha price 245 | 243 #backtostroz back to st roz back to stroz 246 | 244 #gamedev gamedev game dev 247 | 245 #2012shit 2012shit 2012 shit 248 | 246 #ocra ocra oc ra 249 | 247 #ekloges ek loges ek log es 250 | 248 #xoxoxo xo xo xo xoxoxo 251 | 249 #oopsididitagain oops ididit again oops idid it again 252 | 250 #fuckkkk fuck kkk fuckkkk 253 | 251 #theellenshow the ellen show theellenshow -------------------------------------------------------------------------------- /ekphrasis/examples/sentiment.py: -------------------------------------------------------------------------------- 1 | from ekphrasis.classes.preprocessor import TextPreProcessor 2 | from ekphrasis.classes.tokenizer import SocialTokenizer 3 | from ekphrasis.utils.nlp import polarity 4 | 5 | sentences = [ 6 | "So there is no way for me to plug it in here in the US unless I go by a converter.", 7 | "Good case, Excellent value.", 8 | "Works great!", 9 | 'The design is very odd, as the ear "clip" is not very comfortable at all.', 10 | "Needless to say, I wasted my money." 
11 | ] 12 | 13 | # define preprocessing pipeline 14 | text_processor = TextPreProcessor( 15 | fix_text=True, 16 | unpack_contractions=True, 17 | tokenizer=SocialTokenizer(lowercase=True).tokenize, 18 | ) 19 | 20 | # pass each sentence through the pipeline 21 | tokenized_sentences = list(text_processor.pre_process_docs(sentences)) 22 | for sent in tokenized_sentences: 23 | _polarity, _scores = polarity(sent) 24 | print("{:.4f}\t".format(_polarity) + " ".join(sent)) 25 | -------------------------------------------------------------------------------- /ekphrasis/examples/word_segmentation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | "Reading english - 1grams ...\n" 15 | ] 16 | }, 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "Reading english - 2grams ...\n" 22 | ] 23 | }, 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "Reading twitter - 1grams ...\nReading twitter - 2grams ...\n" 29 | ] 30 | }, 31 | { 32 | "name": "stdout", 33 | "output_type": "stream", 34 | "text": [ 35 | "Reading text8 - 1grams ...\nReading text8 - 2grams ...\n" 36 | ] 37 | } 38 | ], 39 | "source": [ 40 | "from ekphrasis.classes.segmenter import Segmenter\n", 41 | "\n", 42 | "seg_eng = Segmenter(corpus=\"english\")\n", 43 | "seg_tw = Segmenter(corpus=\"twitter\")\n", 44 | "seg_t8 = Segmenter(corpus=\"text8\")" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": { 51 | "collapsed": false 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "import pandas as pd\n", 56 | "\n", 57 | "diffs = pd.read_pickle(\"segmenter_diffs.pickle\")" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 15, 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "outputs": [ 67 | { 68 | "name": "stdout", 69 | "output_type": "stream", 70 | "text": [ 71 | "smallandinsignificant\n(eng): small and insignificant\n(tw): small and insignificant\n\ninsufficientnumbers\n(eng): insufficient numbers\n(tw): insufficient numbers\n\nexponentialbackoff\n(eng): exponential backoff\n(tw): exponential back off\n\nsitdown\n(eng): sit down\n(tw): sit down\n\ngamedev\n(eng): gamedev\n(tw): game dev\n\nretrogaming\n(eng): retrogaming\n(tw): retro gaming\n\nthewatercooler\n(eng): the water cooler\n(tw): the watercooler\n\nhomonculus\n(eng): homonculus\n(tw): ho mon cul us\n\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "words = [\"smallandinsignificant\", \"insufficientnumbers\", \"exponentialbackoff\", \"sitdown\", \"gamedev\", \"retrogaming\",\"thewatercooler\", \"homonculus\"]\n", 77 | "for w in words:\n", 78 | " print(w)\n", 79 | " print(\"(eng):\", seg_eng.segment(w))\n", 80 | " print(\"(tw):\", seg_tw.segment(w))\n", 81 | " # print(\"(t8):\", seg_t8.segment(w))\n", 82 | " print()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "collapsed": true 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "" 94 | ] 95 | } 96 | ], 97 | "metadata": { 98 | "kernelspec": { 99 | "display_name": "Python 3", 100 | "language": "python", 101 | "name": "python3" 102 | }, 103 | "language_info": { 104 | "codemirror_mode": { 105 | "name": "ipython", 106 | "version": 3.0 107 | }, 108 | "file_extension": ".py", 109 | "mimetype": "text/x-python", 
110 | "name": "python", 111 | "nbconvert_exporter": "python", 112 | "pygments_lexer": "ipython3", 113 | "version": "3.5.2" 114 | } 115 | }, 116 | "nbformat": 4, 117 | "nbformat_minor": 0 118 | } -------------------------------------------------------------------------------- /ekphrasis/regexes/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbaziotis/ekphrasis/309b6b089bb1ebaed705ba9ffa584f1826e296d4/ekphrasis/regexes/__init__.py -------------------------------------------------------------------------------- /ekphrasis/regexes/expressions.txt: -------------------------------------------------------------------------------- 1 | { 2 | "ACRONYM": "\\b(?:[A-Z]\\.)(?:[A-Z]\\.)+(?:\\.(?!\\.))?(?:[A-Z]\\b)?", 3 | "ALLCAPS": "(?]?[\\^;][\\W_m][\\;^][;<>]?)|(?:[^\\s()]?m?[\\(][\\W_oTOJ]{1,3}[\\s]?[\\W_oTOJ]{1,3}[)]m?[^\\s()]?)|(?:\\*?[v>\\-\\/\\\\][o0O\\_\\.][v\\-<\\/\\\\]\\*?)|(?:[oO0>][\\-_\\/oO\\.\\\\]{1,2}[oO0>])|(?:\\^\\^))(?![\\w])", 9 | "ELONGATED": "\\b[A-Za-z]*([a-zA-Z])\\1\\1[A-Za-z]*\\b", 10 | "EMAIL": "(?:^|(?<=[^\\w@.)]))(?:[\\w+-](?:\\.(?!\\.))?)*?[\\w+-]@(?:\\w-?)*?\\w+(?:\\.(?:[a-z]{2,})){1,3}(?:$|(?=\\b))", 11 | "EMOJI": "(?:\\uD83C\\uDFF4\\uDB40\\uDC67\\uDB40\\uDC62(?:\\uDB40\\uDC65\\uDB40\\uDC6E\\uDB40\\uDC67|\\uDB40\\uDC77\\uDB40\\uDC6C\\uDB40\\uDC73|\\uDB40\\uDC73\\uDB40\\uDC63\\uDB40\\uDC74)\\uDB40\\uDC7F|\\uD83D\\uDC69\\u200D\\uD83D\\uDC69\\u200D(?:\\uD83D\\uDC66\\u200D\\uD83D\\uDC66|\\uD83D\\uDC67\\u200D(?:\\uD83D[\\uDC66\\uDC67]))|\\uD83D\\uDC68(?:\\u200D(?:\\u2764\\uFE0F\\u200D(?:\\uD83D\\uDC8B\\u200D)?\\uD83D\\uDC68|(?:\\uD83D[\\uDC68\\uDC69])\\u200D(?:\\uD83D\\uDC66\\u200D\\uD83D\\uDC66|\\uD83D\\uDC67\\u200D(?:\\uD83D[\\uDC66\\uDC67]))|\\uD83D\\uDC66\\u200D\\uD83D\\uDC66|\\uD83D\\uDC67\\u200D(?:\\uD83D[\\uDC66\\uDC67])|[\\u2695\\u2696\\u2708]\\uFE0F|\\uD83C[\\uDF3E\\uDF73\\uDF93\\uDFA4\\uDFA8\\uDFEB\\uDFED]|\\uD83D[\\uDCBB\\uDCBC\\uDD27\\uDD2C\\uDE80\\uDE92])|(?:\\uD83C[\\uDFFB-\\uDFFF])\\u200D[\\u2695\\u2696\\u2708]\\uFE0F|(?:\\uD83C[\\uDFFB-\\uDFFF])\\u200D(?:\\uD83C[\\uDF3E\\uDF73\\uDF93\\uDFA4\\uDFA8\\uDFEB\\uDFED]|\\uD83D[\\uDCBB\\uDCBC\\uDD27\\uDD2C\\uDE80\\uDE92]))|\\uD83D\\uDC69\\u200D(?:\\u2764\\uFE0F\\u200D(?:\\uD83D\\uDC8B\\u200D(?:\\uD83D[\\uDC68\\uDC69])|\\uD83D[\\uDC68\\uDC69])|\\uD83C[\\uDF3E\\uDF73\\uDF93\\uDFA4\\uDFA8\\uDFEB\\uDFED]|\\uD83D[\\uDCBB\\uDCBC\\uDD27\\uDD2C\\uDE80\\uDE92])|\\uD83D\\uDC69\\u200D\\uD83D\\uDC66\\u200D\\uD83D\\uDC66|(?:\\uD83D\\uDC41\\uFE0F\\u200D\\uD83D\\uDDE8|\\uD83D\\uDC69(?:\\uD83C[\\uDFFB-\\uDFFF])\\u200D[\\u2695\\u2696\\u2708]|(?:(?:\\u26F9|\\uD83C[\\uDFCB\\uDFCC]|\\uD83D\\uDD75)\\uFE0F|\\uD83D\\uDC6F|\\uD83E[\\uDD3C\\uDDDE\\uDDDF])\\u200D[\\u2640\\u2642]|(?:\\u26F9|\\uD83C[\\uDFCB\\uDFCC]|\\uD83D\\uDD75)(?:\\uD83C[\\uDFFB-\\uDFFF])\\u200D[\\u2640\\u2642]|(?:\\uD83C[\\uDFC3\\uDFC4\\uDFCA]|\\uD83D[\\uDC6E\\uDC71\\uDC73\\uDC77\\uDC81\\uDC82\\uDC86\\uDC87\\uDE45-\\uDE47\\uDE4B\\uDE4D\\uDE4E\\uDEA3\\uDEB4-\\uDEB6]|\\uD83E[\\uDD26\\uDD37-\\uDD39\\uDD3D\\uDD3E\\uDDD6-\\uDDDD])(?:(?:\\uD83C[\\uDFFB-\\uDFFF])\\u200D[\\u2640\\u2642]|\\u200D[\\u2640\\u2642])|\\uD83D\\uDC69\\u200D[\\u2695\\u2696\\u2708])\\uFE0F|\\uD83D\\uDC69\\u200D\\uD83D\\uDC67\\u200D(?:\\uD83D[\\uDC66\\uDC67])|\\uD83D\\uDC69\\u200D\\uD83D\\uDC69\\u200D(?:\\uD83D[\\uDC66\\uDC67])|\\uD83D\\uDC68(?:\\u200D(?:(?:\\uD83D[\\uDC68\\uDC69])\\u200D(?:\\uD83D[\\uDC66\\uDC67])|\\uD83D[\\uDC66\\uDC67])|\\uD83C[\\uDFFB-\\uDFFF])|\\uD83C\\uDFF3\\uFE0F\\u200D\\uD83C\\uDF08|\\uD83D\\uDC69\\u200D\\uD83D
\\uDC67|\\uD83D\\uDC69(?:\\uD83C[\\uDFFB-\\uDFFF])\\u200D(?:\\uD83C[\\uDF3E\\uDF73\\uDF93\\uDFA4\\uDFA8\\uDFEB\\uDFED]|\\uD83D[\\uDCBB\\uDCBC\\uDD27\\uDD2C\\uDE80\\uDE92])|\\uD83D\\uDC69\\u200D\\uD83D\\uDC66|\\uD83C\\uDDF4\\uD83C\\uDDF2|\\uD83C\\uDDFD\\uD83C\\uDDF0|\\uD83C\\uDDF6\\uD83C\\uDDE6|\\uD83D\\uDC69(?:\\uD83C[\\uDFFB-\\uDFFF])|\\uD83C\\uDDFC(?:\\uD83C[\\uDDEB\\uDDF8])|\\uD83C\\uDDEB(?:\\uD83C[\\uDDEE-\\uDDF0\\uDDF2\\uDDF4\\uDDF7])|\\uD83C\\uDDE9(?:\\uD83C[\\uDDEA\\uDDEC\\uDDEF\\uDDF0\\uDDF2\\uDDF4\\uDDFF])|\\uD83C\\uDDE7(?:\\uD83C[\\uDDE6\\uDDE7\\uDDE9-\\uDDEF\\uDDF1-\\uDDF4\\uDDF6-\\uDDF9\\uDDFB\\uDDFC\\uDDFE\\uDDFF])|\\uD83C\\uDDF1(?:\\uD83C[\\uDDE6-\\uDDE8\\uDDEE\\uDDF0\\uDDF7-\\uDDFB\\uDDFE])|\\uD83C\\uDDFE(?:\\uD83C[\\uDDEA\\uDDF9])|\\uD83C\\uDDF9(?:\\uD83C[\\uDDE6\\uDDE8\\uDDE9\\uDDEB-\\uDDED\\uDDEF-\\uDDF4\\uDDF7\\uDDF9\\uDDFB\\uDDFC\\uDDFF])|\\uD83C\\uDDF5(?:\\uD83C[\\uDDE6\\uDDEA-\\uDDED\\uDDF0-\\uDDF3\\uDDF7-\\uDDF9\\uDDFC\\uDDFE])|\\uD83C\\uDDEF(?:\\uD83C[\\uDDEA\\uDDF2\\uDDF4\\uDDF5])|\\uD83C\\uDDED(?:\\uD83C[\\uDDF0\\uDDF2\\uDDF3\\uDDF7\\uDDF9\\uDDFA])|\\uD83C\\uDDEE(?:\\uD83C[\\uDDE8-\\uDDEA\\uDDF1-\\uDDF4\\uDDF6-\\uDDF9])|\\uD83C\\uDDFB(?:\\uD83C[\\uDDE6\\uDDE8\\uDDEA\\uDDEC\\uDDEE\\uDDF3\\uDDFA])|\\uD83C\\uDDEC(?:\\uD83C[\\uDDE6\\uDDE7\\uDDE9-\\uDDEE\\uDDF1-\\uDDF3\\uDDF5-\\uDDFA\\uDDFC\\uDDFE])|\\uD83C\\uDDF7(?:\\uD83C[\\uDDEA\\uDDF4\\uDDF8\\uDDFA\\uDDFC])|\\uD83C\\uDDEA(?:\\uD83C[\\uDDE6\\uDDE8\\uDDEA\\uDDEC\\uDDED\\uDDF7-\\uDDFA])|\\uD83C\\uDDFA(?:\\uD83C[\\uDDE6\\uDDEC\\uDDF2\\uDDF3\\uDDF8\\uDDFE\\uDDFF])|\\uD83C\\uDDE8(?:\\uD83C[\\uDDE6\\uDDE8\\uDDE9\\uDDEB-\\uDDEE\\uDDF0-\\uDDF5\\uDDF7\\uDDFA-\\uDDFF])|\\uD83C\\uDDE6(?:\\uD83C[\\uDDE8-\\uDDEC\\uDDEE\\uDDF1\\uDDF2\\uDDF4\\uDDF6-\\uDDFA\\uDDFC\\uDDFD\\uDDFF])|[#\\*0-9]\\uFE0F\\u20E3|\\uD83C\\uDDF8(?:\\uD83C[\\uDDE6-\\uDDEA\\uDDEC-\\uDDF4\\uDDF7-\\uDDF9\\uDDFB\\uDDFD-\\uDDFF])|\\uD83C\\uDDFF(?:\\uD83C[\\uDDE6\\uDDF2\\uDDFC])|\\uD83C\\uDDF0(?:\\uD83C[\\uDDEA\\uDDEC-\\uDDEE\\uDDF2\\uDDF3\\uDDF5\\uDDF7\\uDDFC\\uDDFE\\uDDFF])|\\uD83C\\uDDF3(?:\\uD83C[\\uDDE6\\uDDE8\\uDDEA-\\uDDEC\\uDDEE\\uDDF1\\uDDF4\\uDDF5\\uDDF7\\uDDFA\\uDDFF])|\\uD83C\\uDDF2(?:\\uD83C[\\uDDE6\\uDDE8-\\uDDED\\uDDF0-\\uDDFF])|(?:\\uD83C[\\uDFC3\\uDFC4\\uDFCA]|\\uD83D[\\uDC6E\\uDC71\\uDC73\\uDC77\\uDC81\\uDC82\\uDC86\\uDC87\\uDE45-\\uDE47\\uDE4B\\uDE4D\\uDE4E\\uDEA3\\uDEB4-\\uDEB6]|\\uD83E[\\uDD26\\uDD37-\\uDD39\\uDD3D\\uDD3E\\uDDD6-\\uDDDD])(?:\\uD83C[\\uDFFB-\\uDFFF])|(?:\\u26F9|\\uD83C[\\uDFCB\\uDFCC]|\\uD83D\\uDD75)(?:\\uD83C[\\uDFFB-\\uDFFF])|(?:[\\u261D\\u270A-\\u270D]|\\uD83C[\\uDF85\\uDFC2\\uDFC7]|\\uD83D[\\uDC42\\uDC43\\uDC46-\\uDC50\\uDC66\\uDC67\\uDC70\\uDC72\\uDC74-\\uDC76\\uDC78\\uDC7C\\uDC83\\uDC85\\uDCAA\\uDD74\\uDD7A\\uDD90\\uDD95\\uDD96\\uDE4C\\uDE4F\\uDEC0\\uDECC]|\\uD83E[\\uDD18-\\uDD1C\\uDD1E\\uDD1F\\uDD30-\\uDD36\\uDDD1-\\uDDD5])(?:\\uD83C[\\uDFFB-\\uDFFF])|(?:[\\u261D\\u26F9\\u270A-\\u270D]|\\uD83C[\\uDF85\\uDFC2-\\uDFC4\\uDFC7\\uDFCA-\\uDFCC]|\\uD83D[\\uDC42\\uDC43\\uDC46-\\uDC50\\uDC66-\\uDC69\\uDC6E\\uDC70-\\uDC78\\uDC7C\\uDC81-\\uDC83\\uDC85-\\uDC87\\uDCAA\\uDD74\\uDD75\\uDD7A\\uDD90\\uDD95\\uDD96\\uDE45-\\uDE47\\uDE4B-\\uDE4F\\uDEA3\\uDEB4-\\uDEB6\\uDEC0\\uDECC]|\\uD83E[\\uDD18-\\uDD1C\\uDD1E\\uDD1F\\uDD26\\uDD30-\\uDD39\\uDD3D\\uDD3E\\uDDD1-\\uDDDD])(?:\\uD83C[\\uDFFB-\\uDFFF])?|(?:[\\u231A\\u231B\\u23E9-\\u23EC\\u23F0\\u23F3\\u25FD\\u25FE\\u2614\\u2615\\u2648-\\u2653\\u267F\\u2693\\u26A1\\u26AA\\u26AB\\u26BD\\u26BE\\u26C4\\u26C5\\u26CE\\u26D4\\u26EA\\u26F2\\u26F3\\u26F5\\u26FA\\u26FD\\u2705\\u270A\\u270B\\u2728\\u274C\\u2
74E\\u2753-\\u2755\\u2757\\u2795-\\u2797\\u27B0\\u27BF\\u2B1B\\u2B1C\\u2B50\\u2B55]|\\uD83C[\\uDC04\\uDCCF\\uDD8E\\uDD91-\\uDD9A\\uDDE6-\\uDDFF\\uDE01\\uDE1A\\uDE2F\\uDE32-\\uDE36\\uDE38-\\uDE3A\\uDE50\\uDE51\\uDF00-\\uDF20\\uDF2D-\\uDF35\\uDF37-\\uDF7C\\uDF7E-\\uDF93\\uDFA0-\\uDFCA\\uDFCF-\\uDFD3\\uDFE0-\\uDFF0\\uDFF4\\uDFF8-\\uDFFF]|\\uD83D[\\uDC00-\\uDC3E\\uDC40\\uDC42-\\uDCFC\\uDCFF-\\uDD3D\\uDD4B-\\uDD4E\\uDD50-\\uDD67\\uDD7A\\uDD95\\uDD96\\uDDA4\\uDDFB-\\uDE4F\\uDE80-\\uDEC5\\uDECC\\uDED0-\\uDED2\\uDEEB\\uDEEC\\uDEF4-\\uDEF8]|\\uD83E[\\uDD10-\\uDD3A\\uDD3C-\\uDD3E\\uDD40-\\uDD45\\uDD47-\\uDD4C\\uDD50-\\uDD6B\\uDD80-\\uDD97\\uDDC0\\uDDD0-\\uDDE6])|(?:[#\\*0-9\\xA9\\xAE\\u203C\\u2049\\u2122\\u2139\\u2194-\\u2199\\u21A9\\u21AA\\u231A\\u231B\\u2328\\u23CF\\u23E9-\\u23F3\\u23F8-\\u23FA\\u24C2\\u25AA\\u25AB\\u25B6\\u25C0\\u25FB-\\u25FE\\u2600-\\u2604\\u260E\\u2611\\u2614\\u2615\\u2618\\u261D\\u2620\\u2622\\u2623\\u2626\\u262A\\u262E\\u262F\\u2638-\\u263A\\u2640\\u2642\\u2648-\\u2653\\u2660\\u2663\\u2665\\u2666\\u2668\\u267B\\u267F\\u2692-\\u2697\\u2699\\u269B\\u269C\\u26A0\\u26A1\\u26AA\\u26AB\\u26B0\\u26B1\\u26BD\\u26BE\\u26C4\\u26C5\\u26C8\\u26CE\\u26CF\\u26D1\\u26D3\\u26D4\\u26E9\\u26EA\\u26F0-\\u26F5\\u26F7-\\u26FA\\u26FD\\u2702\\u2705\\u2708-\\u270D\\u270F\\u2712\\u2714\\u2716\\u271D\\u2721\\u2728\\u2733\\u2734\\u2744\\u2747\\u274C\\u274E\\u2753-\\u2755\\u2757\\u2763\\u2764\\u2795-\\u2797\\u27A1\\u27B0\\u27BF\\u2934\\u2935\\u2B05-\\u2B07\\u2B1B\\u2B1C\\u2B50\\u2B55\\u3030\\u303D\\u3297\\u3299]|\\uD83C[\\uDC04\\uDCCF\\uDD70\\uDD71\\uDD7E\\uDD7F\\uDD8E\\uDD91-\\uDD9A\\uDDE6-\\uDDFF\\uDE01\\uDE02\\uDE1A\\uDE2F\\uDE32-\\uDE3A\\uDE50\\uDE51\\uDF00-\\uDF21\\uDF24-\\uDF93\\uDF96\\uDF97\\uDF99-\\uDF9B\\uDF9E-\\uDFF0\\uDFF3-\\uDFF5\\uDFF7-\\uDFFF]|\\uD83D[\\uDC00-\\uDCFD\\uDCFF-\\uDD3D\\uDD49-\\uDD4E\\uDD50-\\uDD67\\uDD6F\\uDD70\\uDD73-\\uDD7A\\uDD87\\uDD8A-\\uDD8D\\uDD90\\uDD95\\uDD96\\uDDA4\\uDDA5\\uDDA8\\uDDB1\\uDDB2\\uDDBC\\uDDC2-\\uDDC4\\uDDD1-\\uDDD3\\uDDDC-\\uDDDE\\uDDE1\\uDDE3\\uDDE8\\uDDEF\\uDDF3\\uDDFA-\\uDE4F\\uDE80-\\uDEC5\\uDECB-\\uDED2\\uDEE0-\\uDEE5\\uDEE9\\uDEEB\\uDEEC\\uDEF0\\uDEF3-\\uDEF8]|\\uD83E[\\uDD10-\\uDD3A\\uDD3C-\\uDD3E\\uDD40-\\uDD45\\uDD47-\\uDD4C\\uDD50-\\uDD6B\\uDD80-\\uDD97\\uDDC0\\uDDD0-\\uDDE6])\\uFE0F?)", 12 | "EMPHASIS": "(?:\\*\\b\\w+\\b\\*)", 13 | "HASHTAG": "\\#\\b[\\w\\-\\_]+\\b", 14 | "LTR_FACE": "(?:(?<=])?(?:(?|/\\\\]+|[\u00de\u00d7\u00fe]|(?|/\\\\]+|(?<=])?(?![a-zA-Z])", 24 | "TAG": "<[\\/]?\\w+[\\/]?>", 25 | "TIME": "(?:(?:\\d+)?\\.?\\d+(?:AM|PM|am|pm|a\\.m\\.|p\\.m\\.))|(?:(?:[0-2]?[0-9]|[2][0-3]):(?:[0-5][0-9])(?::(?:[0-5][0-9]))?(?: ?(?:AM|PM|am|pm|a\\.m\\.|p\\.m\\.))?)", 26 | "URL": "(?:https?:\\/\\/(?:www\\.|(?!www))[^\\s\\.]+\\.[^\\s]{2,}|www\\.[^\\s]+\\.[^\\s]{2,})", 27 | "USER": "\\@\\w+", 28 | "WORD": "(?:[\\w_]+)" 29 | } -------------------------------------------------------------------------------- /ekphrasis/regexes/generate_expressions.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | ############################################################################## 4 | # EMOTICONS 5 | ############################################################################## 6 | # [DPO023|}><=]? # optional hat 7 | # [xXB:#%=|;8\*] # eyes 8 | # ['\",]? # optional tears 9 | # [oc\-^]? 
# optional nose 10 | # [DpPO03cboÞþJSLxX*@$#&,.\|<>}{()\[\]\\/] # mouth 11 | ############################################################################## 12 | 13 | __ltr_emoticon = [ 14 | # optional hat 15 | r"(?:(?<=])?", 16 | 17 | # eyes 18 | r"(?:(?|/\\]+|[Þ×þ]|(?|/\\]+|(?<=])?", # optional hat 38 | r"(?![a-zA-Z])", 39 | ] 40 | 41 | __LTR_FACE = "".join(__ltr_emoticon) 42 | __RTL_FACE = "".join(__rtl_emoticon) 43 | 44 | ############################################################################## 45 | # DATES/TIMES todo: add days 46 | # the regex captures most ways a date my be expressed in natural language. 47 | ############################################################################## 48 | __short_date = r"(?:\b(?", 82 | "USER": r"\@\w+", 83 | "EMPHASIS": r"(?:\*\b\w+\b\*)", 84 | "CENSORED": r"(?:\b\w+\*+\w+\b)", 85 | "ACRONYM": r"\b(?:[A-Z]\.)(?:[A-Z]\.)+(?:\.(?!\.))?(?:[A-Z]\b)?", 86 | "ELONGATED": r"\b[A-Za-z]*([a-zA-Z])\1\1[A-Za-z]*\b", 87 | "RTL_FACE": __RTL_FACE, 88 | "LTR_FACE": __LTR_FACE, 89 | "EASTERN_EMOTICONS": r"(?]?[\^;][\W_m][\;^][;<>]?)|(?:[^\s()]?m?[\(][\W_oTOJ]{1,3}[\s]?[\W_oTOJ]{1,3}[)]m?[^\s()]?)|(?:\*?[v>\-\/\\][o0O\_\.][v\-<\/\\]\*?)|(?:[oO0>][\-_\/oO\.\\]{1,2}[oO0>])|(?:\^\^))(?![\w])", 90 | "REST_EMOTICONS": r"(?= threshold: 97 | entry = k.split(SEPARATOR) 98 | entry.append(str(v)) 99 | f.write('\t'.join(entry) + '\n') 100 | 101 | if args.pickle: 102 | with open(file + ".pickle", 'wb') as f: 103 | pickle.dump(counts, f) 104 | 105 | 106 | def count_file(filename, countkeeper, desc=""): 107 | """ 108 | Count the word statistics of a file 109 | :param desc: 110 | :param filename: 111 | :param countkeeper: 112 | :return: 113 | """ 114 | print() 115 | print("computing statistics for file: ", filename) 116 | with open(filename, "r", encoding="utf-8", errors='ignore') as infile: 117 | num_lines = sum(1 for line in open(filename, "r", encoding="utf-8")) 118 | for line in tqdm(infile, total=num_lines, desc=desc): 119 | try: 120 | toks = tokenize(line) 121 | for i in range(args.ngrams): 122 | ngram = i + 1 123 | if ngram > 1: 124 | toks = [""] + toks 125 | for token in get_ngrams(toks, ngram): 126 | countkeeper[ngram][SEPARATOR.join(token)] += 1 127 | except Exception as e: 128 | print("ERROR - ", e, infile) 129 | 130 | 131 | def write_stats(counts): 132 | print() 133 | dir_path = os.path.dirname(os.path.realpath(__file__)) 134 | for k, v in counts.items(): 135 | print("Writing " + str(k) + "-grams...") 136 | counter = Counter(counts[k]) 137 | print("entries:{}\t-\ttokens:{}".format(format(len(counter), ','), 138 | format(sum(counter.values()), 139 | ','))) 140 | 141 | name = "counts_{}grams.txt".format(str(k)) 142 | filename = os.path.join(dir_path, "..", "stats", args.name, name) 143 | 144 | print("writing stats to file {}".format(filename)) 145 | os.makedirs(os.path.dirname(filename), exist_ok=True) 146 | 147 | write_stats_to_file(filename, counter, args.mincount[int(k) - 1]) 148 | 149 | 150 | def prune_low_freq(word_stats, threshold): 151 | """ 152 | remove ngrams with count less than mincount 153 | avoid dict comprehension as it creates a new temp dict 154 | and overloads the memory 155 | Args: 156 | word_stats (): 157 | threshold (): 158 | 159 | Returns: 160 | 161 | """ 162 | for ng in list(word_stats.keys()): 163 | for t in list(word_stats[ng].keys()): 164 | if not word_stats[ng][t] >= threshold: 165 | del word_stats[ng][t] 166 | 167 | 168 | def plot_statistics(statistics): 169 | fig = plt.figure(figsize=(5 * len(statistics), 5)) 170 | for i, (k, v) in 
enumerate(statistics.items()): 171 | ax = fig.add_subplot(1, len(statistics), i + 1) 172 | ax.set_title("{}-gram - total={}".format(k, len(v))) 173 | ax.grid(True) 174 | values = numpy.fromiter(statistics[k].values(), numpy.int32) 175 | ax.hist(values, bins=100, range=(0, 100)) 176 | fig.tight_layout() 177 | fig.canvas.draw() 178 | fig.canvas.flush_events() 179 | 180 | 181 | if __name__ == '__main__': 182 | plt.ion() # set plot to animated 183 | stats = defaultdict(lambda: defaultdict(int)) 184 | pruning_size_threshold = 5000000 185 | low_freq_threshold = 3 186 | 187 | if os.path.isfile(args.input): 188 | count_file(args.input, stats) 189 | time.sleep(0.01) 190 | prune_low_freq(stats, 1) 191 | write_stats(stats) 192 | 193 | elif os.path.isdir(args.input): 194 | files = glob.glob(args.input + "*.txt") 195 | for i, file in enumerate(files): 196 | try: 197 | count_file(file, stats, str(i + 1) + "/" + str(len(files))) 198 | except Exception as e: 199 | print("ERROR - ", e, file) 200 | 201 | time.sleep(0.01) 202 | 203 | if any(len(stats[ngram]) > pruning_size_threshold for ngram in 204 | list(stats.keys())): 205 | print("Cleaning entries with only one occurrence, " 206 | "in order to save memory...") 207 | prune_low_freq(stats, low_freq_threshold) 208 | # write progress 209 | # plot_statistics(stats) 210 | 211 | write_stats(stats) 212 | 213 | prune_low_freq(stats, low_freq_threshold) 214 | write_stats(stats) 215 | else: 216 | print("Wrong input. Give a file or directory!") 217 | -------------------------------------------------------------------------------- /ekphrasis/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbaziotis/ekphrasis/309b6b089bb1ebaed705ba9ffa584f1826e296d4/ekphrasis/utils/__init__.py -------------------------------------------------------------------------------- /ekphrasis/utils/helpers.py: -------------------------------------------------------------------------------- 1 | from functools import reduce 2 | import operator 3 | import os 4 | from os import path 5 | from os.path import expanduser 6 | import sys 7 | import ujson as json 8 | from urllib.request import urlretrieve 9 | import zipfile 10 | 11 | 12 | def get_stats_dir(): 13 | home = expanduser("~") 14 | 15 | ekphrasis_dir = path.join(home, '.ekphrasis') 16 | 17 | if not os.path.exists(ekphrasis_dir): 18 | os.makedirs(ekphrasis_dir) 19 | 20 | stats_dir = path.join(ekphrasis_dir, 'stats') 21 | 22 | if not os.path.exists(stats_dir): 23 | os.makedirs(stats_dir) 24 | 25 | return stats_dir 26 | 27 | 28 | def parse_stats(name, sep='\t', ngram_sep='_'): 29 | """ 30 | Read key,value pairs from file. 
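Each line is expected to hold the n-gram tokens followed by a count, separated by ``sep``;
for n-grams of more than one token, the tokens are joined with ``ngram_sep`` to form the key
of the returned dict, and the count is stored as an int value.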
31 | """ 32 | print("reading ngrams", name) 33 | d = {} 34 | with open(name, "r", encoding="utf-8") as f: 35 | for line in f: 36 | values = line.split(sep) 37 | if len(values) > 2: 38 | d[ngram_sep.join(values[:-1])] = int(values[-1]) 39 | else: 40 | d[values[0]] = int(values[1]) 41 | 42 | return d 43 | 44 | 45 | def read_stats(corpus, ngram): 46 | stats_dir = get_stats_dir() 47 | check_stats_files() 48 | print("Reading " + "{} - {}grams ...".format(corpus, ngram)) 49 | text = path.join(*[stats_dir, corpus, "counts_{}grams.txt".format(ngram)]) 50 | dumped = path.join( 51 | *[stats_dir, corpus, "counts_{}grams.json".format(ngram)]) 52 | 53 | if os.path.isfile(dumped): 54 | with open(dumped, "r") as f: 55 | stats = json.load(f) 56 | return stats 57 | elif os.path.isfile(text): 58 | print("generating cache file for faster loading...") 59 | stats = parse_stats(text) 60 | with open(dumped, "w") as f: 61 | json.dump(stats, f) 62 | return stats 63 | else: 64 | print("stats file not available!") 65 | sys.exit(1) 66 | 67 | 68 | def listdir_nohidden(path): 69 | return [f for f in os.listdir(path) if not f.startswith('.')] 70 | 71 | 72 | def download_statistics(): 73 | stats_dir = get_stats_dir() 74 | print("Word statistics files not found!\nDownloading...", end=" ") 75 | # url = "https://www.dropbox.com/s/a84otqrg6u1c5je/stats.zip?dl=1" 76 | url = "https://data.statmt.org/cbaziotis/projects/ekphrasis/stats.zip" 77 | urlretrieve(url, "stats.zip") 78 | print("done!") 79 | 80 | print("Unpacking...", end=" ") 81 | with zipfile.ZipFile("stats.zip", "r") as zip_ref: 82 | zip_ref.extractall(stats_dir) 83 | 84 | os.remove("stats.zip") 85 | print("done!") 86 | 87 | 88 | def check_stats_files(): 89 | stats_dir = get_stats_dir() 90 | if not os.path.exists(stats_dir) or len(listdir_nohidden(stats_dir)) == 0: 91 | download_statistics() 92 | 93 | 94 | def product(nums): 95 | """ 96 | Return the product of a sequence of numbers. 97 | """ 98 | return reduce(operator.mul, nums, 1) 99 | 100 | def remove_tags(doc): 101 | """ 102 | Remove tags from sentence 103 | """ 104 | doc = ' '.join(word for word in doc.split() if word[0]!='<') 105 | return doc 106 | 107 | # check_stats_files() 108 | -------------------------------------------------------------------------------- /ekphrasis/utils/nlp.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import re 3 | 4 | import nltk 5 | import numpy 6 | from nltk.corpus import sentiwordnet as swn 7 | from termcolor import cprint 8 | 9 | wordnet_lemmatizer = nltk.WordNetLemmatizer() 10 | # additional negations: nowhere 11 | 12 | negation_words = {"'t", "ain't", 'aint', "aren't", 'arent', 'cant', 13 | "didn't", 'didnt', "doesn't", 'doesnt', "don't", 'dont', 14 | "hadn't", 'hadnt', "hasn't", 'hasnt', "haven't", 'havent', 15 | "isn't", 'isnt', 16 | 'never', 'no', 'none', 'noone', 'not', 'nothing', 'wont', } 17 | negation_modals = {"couldn't", 'couldnt', "shouldn't", 'shouldnt', "wouldn't", 18 | 'wouldnt'} 19 | contrast_words = {"but", "although", "though", "however", "despite", "whereas", 20 | "while", "unlike", "still"} 21 | neg_puncts = {"\n", ".", "?", ":", "..."} 22 | 23 | 24 | def unpack_contractions(text): 25 | """ 26 | Replace *English* contractions in ``text`` str with their unshortened forms. 27 | N.B. The "'d" and "'s" forms are ambiguous (had/would, is/has/possessive), 28 | so are left as-is. 
29 | 30 | --------- 31 | --------- 32 | 33 | Important Note: The function is taken from textacy (https://github.com/chartbeat-labs/textacy). 34 | 35 | See textacy.preprocess.unpack_contractions(text) 36 | -> http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.preprocess.unpack_contractions 37 | 38 | 39 | The reason that textacy is not added as a dependency is to avoid having the user to install it's dependencies (such as SpaCy), 40 | in order to just use this function. 41 | 42 | """ 43 | # standard 44 | text = re.sub( 45 | r"(\b)([Aa]re|[Cc]ould|[Dd]id|[Dd]oes|[Dd]o|[Hh]ad|[Hh]as|[Hh]ave|[Ii]s|[Mm]ight|[Mm]ust|[Ss]hould|[Ww]ere|[Ww]ould)n['’]t", 46 | r"\1\2 not", text) 47 | text = re.sub( 48 | r"(\b)([Hh]e|[Ii]|[Ss]he|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)['’]ll", 49 | r"\1\2 will", text) 50 | text = re.sub(r"(\b)([Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)['’]re", r"\1\2 are", 51 | text) 52 | text = re.sub( 53 | r"(\b)([Ii]|[Ss]hould|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Ww]ould|[Yy]ou)['’]ve", 54 | r"\1\2 have", text) 55 | # non-standard 56 | text = re.sub(r"(\b)([Cc]a)n['’]t", r"\1\2n not", text) 57 | text = re.sub(r"(\b)([Ii])['’]m", r"\1\2 am", text) 58 | text = re.sub(r"(\b)([Ll]et)['’]s", r"\1\2 us", text) 59 | text = re.sub(r"(\b)([Ww])on['’]t", r"\1\2ill not", text) 60 | text = re.sub(r"(\b)([Ss])han['’]t", r"\1\2hall not", text) 61 | text = re.sub(r"(\b)([Yy])(?:['’]all|a['’]ll)", r"\1\2ou all", text) 62 | return text 63 | 64 | 65 | def doc_ngrams(doc, n_from=1, n_to=2): 66 | return list(itertools.chain.from_iterable( 67 | [[doc[i:i + n] for i in range(len(doc) - (n - 1))] 68 | for n in range(n_from, n_to + 1)])) 69 | 70 | 71 | def find_negations(doc, neg_comma=True, neg_modals=True, debug=False): 72 | """ 73 | Takes as input a list of words and returns the positions (indices) of the words 74 | that are in the context of a negation. 
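For example, find_negations(["this", "is", "not", "a", "good", "idea", ".", "ok"])
returns {3, 4, 5}: the tokens following "not" are treated as negated until the
full stop closes the negation context.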
75 | 76 | :param list doc: a list of words (strings) 77 | :param bool neg_comma: if True, the negation context ends on a comma 78 | :param bool neg_modals: if True, include negation modals in the set of negation words 79 | :param bool debug: if True, print the text color coded by context 80 | :return set: a set of the word positions inside a negation 81 | 82 | """ 83 | doc_context = [] 84 | append = doc_context.append 85 | negation_stopset = neg_puncts | {","} if neg_comma else set() 86 | negation_startset = negation_words | negation_modals if neg_modals else set() 87 | 88 | # status == "normal" means outside of parentheses 89 | # status == "parentheses" means inside parentheses 90 | # status[XXX] == True means that the context XXX is negated 91 | # status[XXX] == False means that the context XXX is affirmative 92 | status = {"normal": False, "parentheses": False} 93 | 94 | # pointer to the current context 95 | current = "normal" 96 | 97 | for i, tok in enumerate(doc): 98 | 99 | if tok in negation_startset: 100 | status[current] = True 101 | if debug: 102 | cprint(tok, 'red', attrs=['bold'], end=' ') 103 | continue 104 | 105 | if tok in negation_stopset | contrast_words: 106 | if debug: 107 | if status[current]: 108 | cprint(tok, 'green', attrs=['bold'], end=' ') 109 | else: 110 | print(tok, end=" ") 111 | status[current] = False 112 | continue 113 | 114 | if tok == "(": 115 | current = "parentheses" 116 | if debug: 117 | cprint(tok, 'green', attrs=['bold'], end=' ') 118 | continue 119 | 120 | if tok == ")": 121 | status[ 122 | "parentheses"] = False # in order to be false the next time it goes in to a parentheses 123 | current = "normal" 124 | if debug: 125 | cprint(tok, 'green', attrs=['bold'], end=' ') 126 | continue 127 | 128 | if debug: 129 | if status[current]: 130 | cprint(tok, 'magenta', end=' ') 131 | else: 132 | print(tok, end=" ") 133 | 134 | if status[current]: 135 | append(i) 136 | 137 | if debug: 138 | print() 139 | # input("press to continue...") 140 | 141 | return set(doc_context) 142 | 143 | 144 | def mark_doc(doc, wids, mark=None, pos=None): 145 | """ 146 | Given a list of words and a set of word positions, mark the words in those positions. 147 | :param list doc: a list of words (strings) 148 | :param set wids: the positions of the words to be marked 149 | :param string mark: a string that sets the mark that will be applied 150 | to each of the selected words 151 | :param string pos: can be one of {"prefix", "suffix"} 152 | :return: the marked list of words 153 | """ 154 | if mark is None: 155 | mark = "NEG" 156 | 157 | if pos is None: 158 | pos = "suffix" 159 | 160 | marked_doc = [] 161 | 162 | for i, tok in enumerate(doc): 163 | if i in wids: 164 | if pos == "prefix": 165 | word = mark + "_" + tok 166 | else: 167 | word = tok + "_" + mark 168 | marked_doc.append(word) 169 | else: 170 | marked_doc.append(tok) 171 | 172 | return marked_doc 173 | 174 | 175 | def polarity(doc, neg_comma=True, neg_modals=True): 176 | """ 177 | Estimate the sentiment polarity of a tokenized document. 
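Each recognised noun, verb, adjective or adverb contributes the [positive, negative, objective]
scores of its first SentiWordNet synset (with positive and negative swapped for tokens inside a
negation context); the returned polarity is the mean positive score minus the mean negative score.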
178 | Args: 179 | doc (): a list of words (strings) 180 | neg_comma (): if True, the negation context ends on a comma 181 | neg_modals (): if True, include negation modals in the set of negation words 182 | 183 | Returns: 184 | - polarity 185 | - [positive, negative, neutral] scores 186 | 187 | """ 188 | 189 | tagged = nltk.pos_tag([wordnet_lemmatizer.lemmatize(w) 190 | for w in doc]) 191 | negations = find_negations(doc, neg_comma=neg_comma, neg_modals=neg_modals) 192 | scores = [] 193 | for i, (word, tag) in enumerate(tagged): 194 | try: 195 | ss_set = None 196 | if 'NN' in tag and swn.senti_synsets(word): 197 | ss_set = list(swn.senti_synsets(word))[0] 198 | elif 'VB' in tag and swn.senti_synsets(word): 199 | ss_set = list(swn.senti_synsets(word))[0] 200 | elif 'JJ' in tag and swn.senti_synsets(word): 201 | ss_set = list(swn.senti_synsets(word))[0] 202 | elif 'RB' in tag and swn.senti_synsets(word): 203 | ss_set = list(swn.senti_synsets(word))[0] 204 | if ss_set: 205 | pos = ss_set.pos_score() 206 | neg = ss_set.neg_score() 207 | obj = ss_set.obj_score() 208 | if i in negations: 209 | pos, neg = neg, pos 210 | scores.append([pos, neg, obj]) 211 | except: 212 | pass 213 | 214 | _scores = numpy.mean(numpy.array(scores), axis=0) 215 | _polarity = _scores[0] - _scores[1] 216 | 217 | return _polarity, _scores 218 | -------------------------------------------------------------------------------- /local_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm -rf build 4 | rm -rf ekphrasis.egg-info 5 | rm -rf dist 6 | 7 | python setup.py sdist bdist_wheel 8 | 9 | pip install --no-index --find-links=dist\ ekphrasis --force-reinstall --no-deps -U -------------------------------------------------------------------------------- /pypi_push.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm -rf build 4 | rm -rf ekphrasis.egg-info 5 | rm -rf dist 6 | 7 | python setup.py sdist bdist_wheel 8 | pip wheel -r requirements.txt 9 | 10 | # twine register dist/*.tar.gz 11 | twine upload dist/* 12 | # python setup.py sdist upload -r pypi 13 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm==4.19.4 2 | colorama==0.3.9 3 | matplotlib==2.2.2 4 | setuptools==36.2.5 5 | termcolor==1.1.0 6 | numpy==1.19.1 7 | nltk==3.2.4 8 | ujson==1.35 9 | ftfy==4.4.3 10 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup(name='ekphrasis', 4 | version='0.5.4', 5 | description='Text processing tool, geared towards text from ' 6 | 'social networks, such as Twitter or Facebook. 
' 7 | 'Ekphrasis performs tokenization, word normalization, ' 8 | 'word segmentation (for splitting hashtags) ' 9 | 'and spell correction.', 10 | url='https://github.com/cbaziotis/ekphrasis', 11 | author='Christos Baziotis', 12 | author_email='christos.baziotis@gmail.com', 13 | license='MIT', 14 | packages=find_packages(exclude=['docs', 'tests*', 'analysis']), 15 | install_requires=[ 16 | 'termcolor', 17 | 'tqdm', 18 | 'colorama', 19 | 'ujson', 20 | 'matplotlib', 21 | 'nltk', 22 | 'ftfy', 23 | 'numpy' 24 | ], 25 | include_package_data=True 26 | ) 27 | --------------------------------------------------------------------------------