├── .flake8 ├── .gitattributes ├── .gitignore ├── LICENCE ├── MANIFEST.in ├── README.md ├── ekphrasis ├── LICENCE ├── __init__.py ├── classes │ ├── __init__.py │ ├── exmanager.py │ ├── preprocessor.py │ ├── segmenter.py │ ├── spellcorrect.py │ └── tokenizer.py ├── dicts │ ├── __init__.py │ ├── emoticons.py │ ├── noslang │ │ ├── __init__.py │ │ ├── manager.py │ │ ├── slangdict.pickle │ │ └── slangdict.py │ └── sentiment │ │ ├── __init__.py │ │ └── nrc_emolex │ │ ├── NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt │ │ ├── NRCEmolex.py │ │ └── __init__.py ├── examples │ ├── SentReadMe.md │ ├── __init__.py │ ├── demo_data.py │ ├── demo_ext.py │ ├── demo_segmenter.py │ ├── demo_tok.py │ ├── example.py │ ├── segmenter_diffs.pickle │ ├── segmenter_diffs.txt │ ├── sentiment.py │ └── word_segmentation.ipynb ├── regexes │ ├── __init__.py │ ├── expressions.txt │ └── generate_expressions.py ├── stats │ └── .gitkeep ├── tools │ ├── __init__.py │ └── generate_stats.py └── utils │ ├── __init__.py │ ├── helpers.py │ └── nlp.py ├── local_install.sh ├── pypi_push.sh ├── requirements.txt ├── setup.cfg └── setup.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | 4 | exclude = 5 | .tox, 6 | __pycache__, 7 | build, 8 | dist 9 | 10 | ignore = 11 | # F401 imported but unused 12 | F401, 13 | # E501 line too long 14 | E501, 15 | # E303 too many blank lines 16 | E303, 17 | # E731 do not assign a lambda expression, use a def 18 | E731, 19 | # F812: list comprehension redefines ... 20 | F812, 21 | # E402 module level import not at top of file 22 | E402, 23 | # W292 no newline at end of file 24 | W292, 25 | # E999 SyntaxError: invalid syntax 26 | E999, 27 | # F821 undefined name 28 | F821, 29 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbaziotis/ekphrasis/309b6b089bb1ebaed705ba9ffa584f1826e296d4/.gitattributes -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | analysis/ 3 | corpus/ 4 | gen_corpus/ 5 | dist/ 6 | build/ 7 | *.egg-info/ 8 | classes/__pycache__ 9 | *.pyc 10 | /prototyping.py 11 | ekphrasis/__pycache__/ 12 | ekphrasis/classes/__pycache__/ 13 | ekphrasis/dicts/__pycache__/ 14 | ekphrasis/dicts/noslang/__pycache__/ 15 | ekphrasis/examples/.ipynb_checkpoints/ 16 | ekphrasis/examples/word_segmentation-Copy1.ipynb 17 | ekphrasis/stats/** 18 | ekphrasis/utils/__pycache__/ 19 | /local_install.bat 20 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Christos Baziotis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the 
Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # MANIFEST.in 2 | exclude .gitignore 3 | exclude .coverage 4 | exclude .travis.yml 5 | include README.rst 6 | include README.md 7 | include setup.cfg 8 | prune .cache 9 | prune .git 10 | prune build 11 | prune dist 12 | recursive-exclude *.egg-info * 13 | recursive-include tests * 14 | recursive-include ekphrasis/regexes * 15 | recursive-include regexes * 16 | recursive-exclude ekphrasis/stats * 17 | 18 | # data files 19 | #include stats/**/**/*.txt -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Collection of lightweight text tools, geared towards text from social networks, such as Twitter or Facebook, for tokenization, word normalization, word segmentation (for splitting hashtags) and spell correction, 2 | using word statistics from 2 big corpora (English Wikipedia, Twitter - 330 million English tweets). 3 | 4 | _ekphrasis_ was developed as part of the text processing pipeline for 5 | _DataStories_ team's submission for _SemEval-2017 Task 4 (English), Sentiment Analysis in Twitter_. 6 | 7 | If you use the library in your research project, please cite the paper 8 | ["DataStories at SemEval-2017 Task 4: Deep LSTM with Attention for Message-level and Topic-based Sentiment Analysis"](http://www.aclweb.org/anthology/S17-2126). 9 | 10 | Citation: 11 | ``` 12 | @InProceedings{baziotis-pelekis-doulkeridis:2017:SemEval2, 13 | author = {Baziotis, Christos and Pelekis, Nikos and Doulkeridis, Christos}, 14 | title = {DataStories at SemEval-2017 Task 4: Deep LSTM with Attention for Message-level and Topic-based Sentiment Analysis}, 15 | booktitle = {Proceedings of the 11th International Workshop on Semantic Evaluation (SemEval-2017)}, 16 | month = {August}, 17 | year = {2017}, 18 | address = {Vancouver, Canada}, 19 | publisher = {Association for Computational Linguistics}, 20 | pages = {747--754} 21 | } 22 | ``` 23 | 24 | **Disclaimer:** The library is no longer actively developed. I will try to resolve important issues, but I can't make any promises. 25 | 26 | # Installation 27 | 28 | Build from source: 29 | ``` 30 | pip install git+git://github.com/cbaziotis/ekphrasis.git 31 | ``` 32 | or install from PyPI: 33 | ``` 34 | pip install ekphrasis -U 35 | ``` 36 | 37 | # Overview 38 | 39 | _ekphrasis_ offers the following functionality: 40 | 41 | 1. **Social Tokenizer**. A text tokenizer geared towards social networks (Facebook, Twitter...), 42 | which understands complex emoticons, emojis and other unstructured expressions like dates, times and more. 43 | 44 | 2. **Word Segmentation**. You can split a long string into its constituent words. Suitable for hashtag segmentation. 45 | 46 | 3. **Spell Correction**. You can replace a misspelled word with the most probable candidate word. 47 | 48 | 4. 
**Customization**. Tailor the word segmentation, spell correction and term identification to suit your needs. 49 | 50 | The Word Segmentation and Spell Correction mechanisms operate on top of word statistics collected from a given corpus. We provide word statistics from 2 big corpora (from Wikipedia and Twitter), but you can also generate word statistics from your own corpus. You may need to do that if you are working with domain-specific texts, like biomedical documents. For example, a word describing a technique or a chemical compound may be treated as a misspelled word when using the word statistics from a general-purpose corpus. 51 | 52 | _ekphrasis_ tokenizes the text based on a list of regular expressions. You can easily enable _ekphrasis_ to identify new entities by simply adding a new entry to the dictionary of regular expressions (`ekphrasis/regexes/expressions.txt`). 53 | 54 | 5. **Pre-Processing Pipeline**. You can combine all the above steps in an easy way, in order to prepare the text files in your dataset for some kind of analysis or for machine learning. 55 | In addition to the aforementioned actions, you can perform text normalization, word annotation (labeling) and more. 56 | 57 | 58 | 59 | 60 | ## Text Pre-Processing pipeline 61 | 62 | You can easily define a preprocessing pipeline by using the ``TextPreProcessor``. 63 | 64 | ```python 65 | from ekphrasis.classes.preprocessor import TextPreProcessor 66 | from ekphrasis.classes.tokenizer import SocialTokenizer 67 | from ekphrasis.dicts.emoticons import emoticons 68 | 69 | text_processor = TextPreProcessor( 70 | # terms that will be normalized 71 | normalize=['url', 'email', 'percent', 'money', 'phone', 'user', 72 | 'time', 'url', 'date', 'number'], 73 | # terms that will be annotated 74 | annotate={"hashtag", "allcaps", "elongated", "repeated", 75 | 'emphasis', 'censored'}, 76 | fix_html=True, # fix HTML tokens 77 | 78 | # corpus from which the word statistics are going to be used 79 | # for word segmentation 80 | segmenter="twitter", 81 | 82 | # corpus from which the word statistics are going to be used 83 | # for spell correction 84 | corrector="twitter", 85 | 86 | unpack_hashtags=True, # perform word segmentation on hashtags 87 | unpack_contractions=True, # Unpack contractions (can't -> can not) 88 | spell_correct_elong=False, # spell correction for elongated words 89 | 90 | # select a tokenizer. You can use SocialTokenizer, or pass your own. 91 | # the tokenizer should take as input a string and return a list of tokens 92 | tokenizer=SocialTokenizer(lowercase=True).tokenize, 93 | 94 | # list of dictionaries, for replacing tokens extracted from the text 95 | # with other expressions. You can pass more than one dictionary. 96 | dicts=[emoticons] 97 | ) 98 | 99 | sentences = [ 100 | "CANT WAIT for the new season of #TwinPeaks \(^o^)/!!! #davidlynch #tvseries :)))", 101 | "I saw the new #johndoe movie and it suuuuucks!!! WAISTED $10... #badmovies :/", 102 | "@SentimentSymp: can't wait for the Nov 9 #Sentiment talks! YAAAAAAY !!! :-D http://sentimentsymposium.com/." 103 | ] 104 | 105 | for s in sentences: 106 | print(" ".join(text_processor.pre_process_doc(s))) 107 | ``` 108 | 109 | Output: 110 | 111 | ``` 112 | cant wait for the new season of twin peaks \(^o^)/ ! david lynch tv series 113 | 114 | i saw the new john doe movie and it sucks ! waisted . bad movies 115 | 116 | : can not wait for the sentiment talks ! yay ! 117 | ``` 118 | 119 | 120 | Notes: 121 | 122 | * elongated words are automatically normalized. 
123 | * Spell correction affects performance. 124 | 125 | --- 126 | 127 | ### Word Statistics 128 | _ekphrasis_ provides word statistics (unigrams and bigrams) from 2 big corpora: 129 | * the English Wikipedia 130 | * a collection of 330 million English Twitter messages 131 | 132 | These word statistics are required for the word segmentation and spell correction. 133 | Moreover, you can generate word statistics from your own corpus. 134 | You can use `ekphrasis/tools/generate_stats.py` to generate statistics from a text file, or from a directory that contains a collection of text files. 135 | For example, in order to generate word statistics for [text8](http://mattmahoney.net/dc/textdata.html) (http://mattmahoney.net/dc/text8.zip), you can do: 136 | 137 | ``` 138 | python generate_stats.py --input text8.txt --name text8 --ngrams 2 --mincount 70 30 139 | ``` 140 | * input: path to the file or directory containing the files for calculating the statistics. 141 | * name: the name of the corpus. 142 | * ngrams: up to how many ngrams to calculate statistics for. 143 | * mincount: the minimum count an ngram must have in order to be included. 144 | In this case, the mincount for unigrams is 70 and for bigrams is 30. 145 | 146 | After you run the script, you will see a new directory inside `ekphrasis/stats/` with the statistics of your corpus. 147 | In the case of the example above, `ekphrasis/stats/text8/`. 148 | 149 | 150 | 151 | ### Word Segmentation 152 | The word segmentation implementation uses the Viterbi algorithm and is based on [CH14](http://norvig.com/ngrams/ch14.pdf) from the book [Beautiful Data (Segaran and Hammerbacher, 2009)](http://shop.oreilly.com/product/9780596157128.do). 153 | The implementation requires word statistics in order to identify and separate the words in a string. 154 | You can use the word statistics from one of the 2 provided corpora, or from your own corpus. 155 | 156 | 157 | **Example:** 158 | In order to perform word segmentation, first you have to instantiate a segmenter with a given corpus, and then just use the `segment()` method: 159 | ```python 160 | from ekphrasis.classes.segmenter import Segmenter 161 | seg = Segmenter(corpus="mycorpus") 162 | print(seg.segment("smallandinsignificant")) 163 | ``` 164 | Output: 165 | ``` 166 | > small and insignificant 167 | ``` 168 | 169 | You can compare the output using statistics from the different corpora: 170 | ```python 171 | from ekphrasis.classes.segmenter import Segmenter 172 | 173 | # segmenter using the word statistics from English Wikipedia 174 | seg_eng = Segmenter(corpus="english") 175 | 176 | # segmenter using the word statistics from Twitter 177 | seg_tw = Segmenter(corpus="twitter") 178 | 179 | words = ["exponentialbackoff", "gamedev", "retrogaming", "thewatercooler", "panpsychism"] 180 | for w in words: 181 | print(w) 182 | print("(eng):", seg_eng.segment(w)) 183 | print("(tw):", seg_tw.segment(w)) 184 | print() 185 | ``` 186 | Output: 187 | ``` 188 | exponentialbackoff 189 | (eng): exponential backoff 190 | (tw): exponential back off 191 | 192 | gamedev 193 | (eng): gamedev 194 | (tw): game dev 195 | 196 | retrogaming 197 | (eng): retrogaming 198 | (tw): retro gaming 199 | 200 | thewatercooler 201 | (eng): the water cooler 202 | (tw): the watercooler 203 | 204 | panpsychism 205 | (eng): panpsychism 206 | (tw): pan psych is m 207 | 208 | ``` 209 | 210 | Finally, if the word is camelCased or PascalCased, then the algorithm splits the word based on the case of the characters. 
211 | ```python 212 | from ekphrasis.classes.segmenter import Segmenter 213 | seg = Segmenter() 214 | print(seg.segment("camelCased")) 215 | print(seg.segment("PascalCased")) 216 | ``` 217 | Output: 218 | ``` 219 | > camel cased 220 | > pascal cased 221 | ``` 222 | 223 | ### Spell Correction 224 | The Spell Corrector is based on [Peter Norvig's spell-corrector](http://norvig.com/spell-correct.html). 225 | Just like the segmentation algorithm, we utilize word statistics in order to find the most probable candidate. 226 | Besides the provided statistics, you can use your own. 227 | 228 | **Example:** 229 | 230 | You can perform spell correction just like word segmentation. 231 | First you have to instantiate a `SpellCorrector` object 232 | that uses the statistics from the corpus of your choice, and then use one of the available methods. 233 | ```python 234 | from ekphrasis.classes.spellcorrect import SpellCorrector 235 | sp = SpellCorrector(corpus="english") 236 | print(sp.correct("korrect")) 237 | ``` 238 | Output: 239 | ``` 240 | > correct 241 | ``` 242 | 243 | 244 | ### Social Tokenizer 245 | The difficulty in tokenization is to avoid splitting expressions or words that should be kept intact (as one token). 246 | This is more important in texts from social networks, with "creative" writing and expressions like emoticons, hashtags and so on. 247 | Although there are some tokenizers geared towards Twitter [1], [2] 248 | that recognize the Twitter markup and some basic sentiment expressions or simple emoticons, 249 | our tokenizer is able to identify almost all emoticons, emojis and many complex expressions. 250 | 251 | Especially for tasks such as sentiment analysis, there are many expressions that play a decisive role in identifying the sentiment expressed in text. Expressions like these are: 252 | 253 | - Censored words, such as ``f**k``, ``s**t``. 254 | - Words with emphasis, such as ``a *great* time``, ``I don't *think* I ...``. 255 | - Emoticons, such as ``>:(``, ``:))``, ``\o/``. 256 | - Dash-separated words, such as ``over-consumption``, ``anti-american``, ``mind-blowing``. 257 | 258 | Moreover, ekphrasis can identify information-bearing expressions. Depending on the task, you may want to preserve / extract them as one token (IR) and then normalize them, since this information may be irrelevant for the task (sentiment analysis). Expressions like these are: 259 | 260 | 261 | - Dates, such as ``Feb 18th``, ``December 2, 2016``, ``December 2-2016``, 262 | ``10/17/94``, ``3 December 2016``, ``April 25, 1995``, ``11.15.16``, 263 | ``November 24th 2016``, ``January 21st``. 264 | - Times, such as ``5:45pm``, ``11:36 AM``, ``2:45 pm``, ``5:30``. 265 | - Currencies, such as ``$220M``, ``$2B``, ``$65.000``, ``€10``, ``$50K``. 266 | - Phone numbers. 267 | - URLs, such as ``http://www.cs.unipi.gr``, ``https://t.co/Wfw5Z1iSEt``. 268 | 269 | **Example**: 270 | 271 | ```python 272 | import nltk 273 | from ekphrasis.classes.tokenizer import SocialTokenizer 274 | 275 | 276 | def wsp_tokenizer(text): 277 | return text.split(" ") 278 | 279 | puncttok = nltk.WordPunctTokenizer().tokenize 280 | 281 | social_tokenizer = SocialTokenizer(lowercase=False).tokenize 282 | 283 | sents = [ 284 | "CANT WAIT for the new season of #TwinPeaks \(^o^)/ yaaaay!!! #davidlynch #tvseries :)))", 285 | "I saw the new #johndoe movie and it suuuuucks!!! WAISTED $10... #badmovies >3:/", 286 | "@SentimentSymp: can't wait for the Nov 9 #Sentiment talks! YAAAAAAY !!! 
>:-D http://sentimentsymposium.com/.", 287 | ] 288 | 289 | for s in sents: 290 | print() 291 | print("ORG: ", s) # original sentence 292 | print("WSP : ", wsp_tokenizer(s)) # whitespace tokenizer 293 | print("WPU : ", puncttok(s)) # WordPunct tokenizer 294 | print("SC : ", social_tokenizer(s)) # social tokenizer 295 | 296 | ``` 297 | 298 | Output: 299 | 300 | ``` 301 | ORG: CANT WAIT for the new season of #TwinPeaks \(^o^)/ yaaaay!!! #davidlynch #tvseries :))) 302 | WSP : ['CANT', 'WAIT', 'for', 'the', 'new', 'season', 'of', '#TwinPeaks', '\(^o^)/', 'yaaaay!!!', '#davidlynch', '#tvseries', ':)))'] 303 | WPU : ['CANT', 'WAIT', 'for', 'the', 'new', 'season', 'of', '#', 'TwinPeaks', '\(^', 'o', '^)/', 'yaaaay', '!!!', '#', 'davidlynch', '#', 'tvseries', ':)))'] 304 | SC : ['CANT', 'WAIT', 'for', 'the', 'new', 'season', 'of', '#TwinPeaks', '\(^o^)/', 'yaaaay', '!', '!', '!', '#davidlynch', '#tvseries', ':)))'] 305 | 306 | ORG: I saw the new #johndoe movie and it suuuuucks!!! WAISTED $10... #badmovies >3:/ 307 | WSP : ['I', 'saw', 'the', 'new', '#johndoe', 'movie', 'and', 'it', 'suuuuucks!!!', 'WAISTED', '$10...', '#badmovies', '>3:/'] 308 | WPU : ['I', 'saw', 'the', 'new', '#', 'johndoe', 'movie', 'and', 'it', 'suuuuucks', '!!!', 'WAISTED', '$', '10', '...', '#', 'badmovies', '>', '3', ':/'] 309 | SC : ['I', 'saw', 'the', 'new', '#johndoe', 'movie', 'and', 'it', 'suuuuucks', '!', '!', '!', 'WAISTED', '$10', '.', '.', '.', '#badmovies', '>', '3:/'] 310 | ``` 311 | 312 | 313 | 314 | 321 | 322 | #### References 323 | 324 | [1] K. Gimpel et al., “Part-of-speech tagging for twitter: Annotation, features, and experiments,” in Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies: short papers-Volume 2, 2011, pp. 42–47. 325 | 326 | [2] C. Potts, “Sentiment Symposium Tutorial: Tokenizing,” Sentiment Symposium Tutorial, 2011. [Online]. Available: http://sentiment.christopherpotts.net/tokenizing.html. 327 | -------------------------------------------------------------------------------- /ekphrasis/LICENCE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Christos Baziotis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /ekphrasis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbaziotis/ekphrasis/309b6b089bb1ebaed705ba9ffa584f1826e296d4/ekphrasis/__init__.py -------------------------------------------------------------------------------- /ekphrasis/classes/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbaziotis/ekphrasis/309b6b089bb1ebaed705ba9ffa584f1826e296d4/ekphrasis/classes/__init__.py -------------------------------------------------------------------------------- /ekphrasis/classes/exmanager.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | 5 | 6 | class ExManager: 7 | ext_path = os.path.join(os.path.dirname(__file__), 8 | '../regexes/expressions.txt') 9 | 10 | with open(ext_path) as fh: 11 | expressions = json.load(fh) 12 | 13 | def get_compiled(self): 14 | regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in 15 | self.expressions.items()} 16 | return regexes 17 | 18 | def print_expressions(self): 19 | {print(k.lower(), ":", self.expressions[k]) 20 | for k, v in sorted(self.expressions.items())} 21 | -------------------------------------------------------------------------------- /ekphrasis/classes/preprocessor.py: -------------------------------------------------------------------------------- 1 | import re 2 | from functools import lru_cache 3 | 4 | import ftfy 5 | 6 | from ekphrasis.classes.exmanager import ExManager 7 | from ekphrasis.classes.segmenter import Segmenter 8 | from ekphrasis.classes.spellcorrect import SpellCorrector 9 | from ekphrasis.utils.nlp import unpack_contractions 10 | from ekphrasis.utils.helpers import remove_tags 11 | 12 | # noinspection PyPackageRequirements 13 | class TextPreProcessor: 14 | def __init__(self, **kwargs): 15 | """ 16 | Kwargs: 17 | omit (list): choose what tokens that you want to omit from the text. 18 | possible values: ['email', 'percent', 'money', 'phone', 'user', 19 | 'time', 'url', 'date', 'hashtag'] 20 | Important Notes: 21 | 1 - put url at front, if you plan to use it. 22 | Messes with the regexes! 23 | 2 - if you use hashtag then unpack_hashtags will 24 | automatically be set to False 25 | 26 | normalize (list): choose what tokens that you want to normalize 27 | from the text. 28 | possible values: ['email', 'percent', 'money', 'phone', 'user', 29 | 'time', 'url', 'date', 'hashtag'] 30 | for example: myaddress@mysite.com will be transformed to 31 | Important Notes: 32 | 1 - put url at front, if you plan to use it. 33 | Messes with the regexes! 34 | 2 - if you use hashtag then unpack_hashtags will 35 | automatically be set to False 36 | 37 | unpack_contractions (bool): Replace *English* contractions in 38 | ``text`` str with their unshortened forms 39 | for example: can't -> can not, wouldn't -> would not, and so on... 40 | 41 | unpack_hashtags (bool): split a hashtag to it's constituent words. 42 | for example: #ilikedogs -> i like dogs 43 | 44 | annotate (list): add special tags to special tokens. 
45 | possible values: ['hashtag', 'allcaps', 'elongated', 'repeated'] 46 | for example: myaddress@mysite.com -> myaddress@mysite.com 47 | 48 | tokenizer (callable): callable function that accepts a string and 49 | returns a list of strings if no tokenizer is provided then 50 | the text will be tokenized on whitespace 51 | 52 | segmenter (str): define the statistics of what corpus you would 53 | like to use [english, twitter] 54 | 55 | corrector (str): define the statistics of what corpus you would 56 | like to use [english, twitter] 57 | 58 | all_caps_tag (str): how to wrap the capitalized words 59 | values [single, wrap, every] 60 | Note: applicable only when `allcaps` is included in annotate[] 61 | - single: add a tag after the last capitalized word 62 | - wrap: wrap all words with opening and closing tags 63 | - every: add a tag after each word 64 | 65 | spell_correct_elong (bool): choose if you want to perform 66 | spell correction after the normalization of elongated words. 67 | * significantly affects performance (speed) 68 | 69 | spell_correction (bool): choose if you want to perform 70 | spell correction to the text 71 | * significantly affects performance (speed) 72 | 73 | fix_text (bool): choose if you want to fix bad unicode terms and 74 | html entities. 75 | 76 | remove_tags (bool): Choose to remove tags after processing 77 | """ 78 | self.omit = kwargs.get("omit", {}) 79 | self.backoff = kwargs.get("normalize", {}) 80 | self.include_tags = kwargs.get("annotate", {}) 81 | self.unpack_contractions = kwargs.get("unpack_contractions", False) 82 | self.tokenizer = kwargs.get("tokenizer", None) 83 | self.dicts = kwargs.get("dicts", None) 84 | self.spell_correction = kwargs.get("spell_correction", False) 85 | self.spell_correct_elong = kwargs.get("spell_correct_elong", False) 86 | self.fix_text = kwargs.get("fix_bad_unicode", False) 87 | self.unpack_hashtags = kwargs.get("unpack_hashtags", False) 88 | self.segmenter_corpus = kwargs.get("segmenter", "english") 89 | self.corrector_corpus = kwargs.get("corrector", "english") 90 | self.all_caps_tag = kwargs.get("all_caps_tag", "wrap") 91 | self.mode = kwargs.get("mode", "normal") 92 | self.remove_tags = kwargs.get("remove_tags", False) 93 | 94 | if self.unpack_hashtags: 95 | self.segmenter = Segmenter(corpus=self.segmenter_corpus) 96 | if self.mode != "fast": 97 | self.spell_corrector = SpellCorrector(corpus=self.corrector_corpus) 98 | 99 | self.regexes = ExManager().get_compiled() 100 | if 'hashtag' in self.omit or 'hashtag' in self.backoff: 101 | print("You can't omit/backoff and unpack hashtags!\n " 102 | "unpack_hashtags will be set to False") 103 | self.unpack_hashtags = False 104 | 105 | def __copy__(self): 106 | return self 107 | 108 | def __deepcopy__(self, memo): 109 | return self 110 | 111 | @staticmethod 112 | def add_special_tag(m, tag, mode="single"): 113 | 114 | if isinstance(m, str): 115 | text = m 116 | else: 117 | text = m.group() 118 | 119 | if mode == "single": 120 | return " {} <{}> ".format(text, tag) 121 | elif mode == "wrap": 122 | return " ".join([" <{}> {} ".format(tag, text, tag)]) + " " 123 | elif mode == "every": 124 | tokens = text.split() 125 | processed = " ".join([" {} <{}> ".format(t, tag) 126 | for t in tokens]) 127 | return " " + processed + " " 128 | 129 | @lru_cache(maxsize=65536) 130 | def handle_hashtag_match(self, m): 131 | """ 132 | Break a string to its constituent words (using Viterbi algorithm) 133 | """ 134 | text = m.group()[1:] 135 | 136 | # todo:simplify routine 137 | if text.islower(): 138 
| expanded = self.segmenter.segment(text) 139 | expanded = " ".join(expanded.split("-")) 140 | expanded = " ".join(expanded.split("_")) 141 | # print(m.group(), " - ", expanded) 142 | # with open("analysis/segmenter_" + 143 | # self.segmenter_corpus + ".txt", "a") as f: 144 | # f.write(m.group() + "\t" + expanded + "\n") 145 | 146 | else: 147 | # split words following CamelCase convention 148 | expanded = self.regexes["camel_split"].sub(r' \1', text) 149 | expanded = expanded.replace("-", "") 150 | expanded = expanded.replace("_", "") 151 | # print(m.group(), " - ", expanded) 152 | 153 | if "hashtag" in self.include_tags: 154 | expanded = self.add_special_tag(expanded, "hashtag", mode="wrap") 155 | 156 | return expanded 157 | 158 | def handle_elongated_match(self, m): 159 | text = m.group() 160 | # normalize to at most 2 repeating chars 161 | text = self.regexes["normalize_elong"].sub(r'\1\1', text) 162 | normalized = self.spell_corrector.normalize_elongated(text) 163 | if normalized: 164 | text = normalized 165 | 166 | # try to spell correct the word 167 | if self.spell_correct_elong: 168 | text = self.spell_corrector.correct_word(text, assume_wrong=True, 169 | fast=True) 170 | # with open("analysis/spell_corrector_" + 171 | # self.corrector_corpus + ".txt", "a") as f: 172 | # f.write(m.group() + " - " + text + "\n") 173 | 174 | # print(m.group(), "-", text) 175 | if "elongated" in self.include_tags: 176 | text = self.add_special_tag(text, "elongated") 177 | 178 | return text 179 | 180 | 181 | @lru_cache(maxsize=65536) 182 | def handle_repeated_puncts(self, m): 183 | """ 184 | return the sorted set so mathes random combinations of puncts 185 | will be mapped to the same token 186 | "!??!?!!", "?!!!!?!", "!!?", "!?!?" --> "?!" 187 | "!...", "...?!" --> ".!" 
188 | :param m: 189 | :return: 190 | """ 191 | text = m.group() 192 | text = "".join(sorted(set(text), reverse=True)) 193 | 194 | if "repeated" in self.include_tags: 195 | text = self.add_special_tag(text, "repeated") 196 | 197 | return text 198 | 199 | @lru_cache(maxsize=65536) 200 | def handle_generic_match(self, m, tag, mode="every"): 201 | """ 202 | 203 | Args: 204 | m (): 205 | tag (): 206 | mode (): 207 | 208 | Returns: 209 | 210 | """ 211 | text = m.group() 212 | text = self.add_special_tag(text, tag, mode=mode) 213 | 214 | return text 215 | 216 | @lru_cache(maxsize=65536) 217 | def handle_emphasis_match(self, m): 218 | """ 219 | :param m: 220 | :return: 221 | """ 222 | text = m.group().replace("*", "") 223 | if "emphasis" in self.include_tags: 224 | text = self.add_special_tag(text, "emphasis") 225 | 226 | return text 227 | 228 | @staticmethod 229 | def dict_replace(wordlist, _dict): 230 | return [_dict[w] if w in _dict else w for w in wordlist] 231 | 232 | @staticmethod 233 | def remove_hashtag_allcaps(wordlist): 234 | in_hashtag = False 235 | _words = [] 236 | for word in wordlist: 237 | 238 | if word == "": 239 | in_hashtag = True 240 | elif word == "": 241 | in_hashtag = False 242 | elif word in {"", ""} and in_hashtag: 243 | continue 244 | 245 | _words.append(word) 246 | 247 | return _words 248 | 249 | def pre_process_doc(self, doc): 250 | 251 | doc = re.sub(r' +', ' ', doc) # remove repeating spaces 252 | 253 | # ########################### 254 | # # fix bad unicode 255 | # ########################### 256 | # if self.fix_bad_unicode: 257 | # doc = textacy.preprocess.fix_bad_unicode(doc) 258 | # 259 | # ########################### 260 | # # fix html leftovers 261 | # ########################### 262 | # doc = html.unescape(doc) 263 | 264 | ########################### 265 | # fix text 266 | ########################### 267 | if self.fix_text: 268 | doc = ftfy.fix_text(doc) 269 | 270 | ########################### 271 | # BACKOFF & OMIT 272 | ########################### 273 | for item in self.backoff: 274 | # better add an extra space after the match. 275 | # Just to be safe. extra spaces will be normalized later anyway 276 | doc = self.regexes[item].sub(lambda m: " " + "<" + item + ">" + " ", 277 | doc) 278 | for item in self.omit: 279 | doc = doc.replace("<" + item + ">", '') 280 | 281 | ########################### 282 | # unpack hashtags 283 | ########################### 284 | if self.unpack_hashtags: 285 | doc = self.regexes["hashtag"].sub( 286 | lambda w: self.handle_hashtag_match(w), doc) 287 | 288 | ########################### 289 | # handle special cases 290 | ########################### 291 | if self.mode != "fast": 292 | if "allcaps" in self.include_tags: 293 | doc = self.regexes["allcaps"].sub( 294 | lambda w: self.handle_generic_match(w, "allcaps", 295 | mode=self.all_caps_tag), 296 | doc) 297 | 298 | if "elongated" in self.include_tags: 299 | doc = self.regexes["elongated"].sub( 300 | lambda w: self.handle_elongated_match(w), doc) 301 | 302 | if "repeated" in self.include_tags: 303 | doc = self.regexes["repeat_puncts"].sub( 304 | lambda w: self.handle_repeated_puncts(w), doc) 305 | 306 | if "emphasis" in self.include_tags: 307 | doc = self.regexes["emphasis"].sub( 308 | lambda w: self.handle_emphasis_match(w), doc) 309 | 310 | if "censored" in self.include_tags: 311 | doc = self.regexes["censored"].sub( 312 | lambda w: self.handle_generic_match(w, "censored"), doc) 313 | 314 | ########################### 315 | # unpack contractions: i'm -> i am, can't -> can not... 
316 | ########################### 317 | 318 | # remove textacy dependency 319 | if self.unpack_contractions: 320 | doc = unpack_contractions(doc) 321 | 322 | if self.remove_tags: 323 | doc = remove_tags(doc) 324 | 325 | # omit allcaps if inside hashtags 326 | doc = re.sub(r' +', ' ', doc) # remove repeating spaces 327 | # doc = re.sub(r'', '', doc) # remove repeating spaces 328 | # doc = doc.replace(' ', '') 329 | # doc = doc.replace(' ', '') 330 | 331 | ########################### 332 | # Tokenize 333 | ########################### 334 | doc = self.remove_hashtag_allcaps(doc.split()) 335 | doc = " ".join(doc) # normalize whitespace 336 | if self.tokenizer: 337 | doc = self.tokenizer(doc) 338 | 339 | # Replace tokens with special dictionaries (slang,emoticons ...) 340 | # todo: add spell check before! 341 | if self.dicts: 342 | for d in self.dicts: 343 | doc = self.dict_replace(doc, d) 344 | 345 | return doc 346 | 347 | def pre_process_docs(self, docs, lazy=True): 348 | from tqdm import tqdm 349 | for d in tqdm(docs, desc="PreProcessing..."): 350 | yield self.pre_process_doc(d) 351 | -------------------------------------------------------------------------------- /ekphrasis/classes/segmenter.py: -------------------------------------------------------------------------------- 1 | import re 2 | from functools import lru_cache 3 | from math import log10 4 | 5 | from ekphrasis.classes.exmanager import ExManager 6 | from ekphrasis.utils.helpers import read_stats 7 | 8 | """ 9 | The Segmenter Class implements the Viterbi algorithm for word segmentation. 10 | Based on CH14 from the book Beautiful Data (Segaran and Hammerbacher, 2009) 11 | """ 12 | 13 | REGEX_TOKEN = re.compile(r'\b[a-z]{2,}\b') 14 | NGRAM_SEP = "_" # todo: move to values 15 | 16 | 17 | class Pdist(dict): 18 | """ 19 | A probability distribution estimated from word counts 20 | Notice: if pw = Pdist(unigrams, n_tokens: 21 | * pw[w] is the raw count of the word w 22 | * pw(w) is the probability of the word w 23 | """ 24 | 25 | @staticmethod 26 | def default_unk_func(key, total): 27 | return 1. / total 28 | 29 | def __init__(self, data=None, total=None, unk_func=None, **kwargs): 30 | super().__init__(**kwargs) 31 | 32 | # insert the word counts 33 | data = data or {} 34 | for key, count in data.items(): 35 | self[key] = self.get(key, 0) + int(count) 36 | 37 | self.total = float(total or sum(self.values())) 38 | self.unk_prob = unk_func or self.default_unk_func 39 | 40 | def __call__(self, key): 41 | if key in self: 42 | return self[key] / self.total 43 | else: 44 | return self.unk_prob(key, self.total) 45 | 46 | 47 | class Segmenter: 48 | def __init__(self, corpus="english", max_split_length=20): 49 | """ 50 | Args: 51 | corpus (str): the statistics from which corpus to use for 52 | the spell correction. 
53 | max_split_length (int): the maximum length of that a word can have 54 | for looking for splits 55 | """ 56 | 57 | # self.unigrams = Counter(read_stats(corpus, 1)) 58 | # self.bigrams = Counter(read_stats(corpus, 2)) 59 | self.unigrams = read_stats(corpus, 1) 60 | self.bigrams = read_stats(corpus, 2) 61 | self.N = sum(self.unigrams.values()) 62 | self.L = max_split_length 63 | 64 | self.Pw = Pdist(self.unigrams, self.N, self.unk_probability) 65 | self.P2w = Pdist(self.bigrams, self.N) 66 | 67 | self.case_split = ExManager().get_compiled()["camel_split"] 68 | 69 | def condProbWord(self, word, prev): 70 | """ 71 | Conditional probability of word, given previous word 72 | if bigram is not in our list, then fall back to unigrams 73 | Args: 74 | word (): candidate word 75 | prev (): previous observed word 76 | 77 | Returns: 78 | 79 | """ 80 | try: 81 | return self.P2w[prev + NGRAM_SEP + word] / float(self.Pw[prev]) 82 | except KeyError: 83 | return self.Pw(word) 84 | 85 | @staticmethod 86 | def unk_probability(key, total): 87 | """ 88 | Estimate the probability of an unknown word, penalizing its length 89 | :param key: the word 90 | :param total: the count of all tokens 91 | :return: 92 | """ 93 | return 10. / (total * 10 ** len(key)) 94 | 95 | @staticmethod 96 | def combine(first, rem): 97 | """ 98 | Combine first and rem results into one (probability, words) pair 99 | :param first: a tuple in the form: probability, word 100 | :param rem: a tuple in the form: probability, list_of_words 101 | :return: 102 | """ 103 | (first_prob, first_word) = first 104 | (rem_prob, rem_words) = rem 105 | return first_prob + rem_prob, [first_word] + rem_words 106 | 107 | def splits(self, text): 108 | """ 109 | Return a list of all possible (first, rem) pairs with max length of first <=L 110 | :param text: 111 | :return: 112 | """ 113 | return [(text[:i + 1], text[i + 1:]) 114 | for i in range(min(len(text), self.L))] 115 | 116 | # if you don't have enough RAM lower the maxsize 117 | @lru_cache(maxsize=65536) 118 | def find_segment(self, text, prev=''): 119 | """ 120 | Return (log P(words), words), where words is the best estimated segmentation 121 | :param text: the text to be segmented 122 | :param prev: 123 | :return: 124 | """ 125 | if not text: 126 | return 0.0, [] 127 | candidates = [self.combine((log10(self.condProbWord(first, prev)), first), self.find_segment(rem, first)) 128 | for first, rem in self.splits(text)] 129 | return max(candidates) 130 | 131 | # if you don't have enough RAM lower the maxsize 132 | @lru_cache(maxsize=65536) 133 | def segment(self, word): 134 | if word.islower(): 135 | return " ".join(self.find_segment(word)[1]) 136 | else: 137 | return self.case_split.sub(r' \1', word).lower() 138 | 139 | def demo(self): 140 | print("BBCtest: ", self.segment('BbcTest')) 141 | print("choosespain: ", self.segment('choosespain')) 142 | print("speedofart: ", self.segment('speedofart')) 143 | print("smallandinsignificant: ", self.segment('smallandinsignificant')) 144 | 145 | # Segmenter().demo() 146 | -------------------------------------------------------------------------------- /ekphrasis/classes/spellcorrect.py: -------------------------------------------------------------------------------- 1 | import re 2 | from collections import Counter 3 | from difflib import SequenceMatcher 4 | from functools import lru_cache 5 | 6 | from ekphrasis.utils.helpers import read_stats 7 | 8 | REGEX_TOKEN = re.compile(r'\b[a-z]{2,}\b') 9 | 10 | 11 | class SpellCorrector: 12 | """ 13 | The SpellCorrector 
extends the functionality of the Peter Norvig's 14 | spell-corrector in http://norvig.com/spell-correct.html 15 | """ 16 | 17 | def __init__(self, corpus="english"): 18 | """ 19 | 20 | :param corpus: the statistics from which corpus to use for the spell correction. 21 | """ 22 | super().__init__() 23 | self.WORDS = Counter(read_stats(corpus, 1)) 24 | self.N = sum(self.WORDS.values()) 25 | 26 | @staticmethod 27 | def tokens(text): 28 | return REGEX_TOKEN.findall(text.lower()) 29 | 30 | def P(self, word): 31 | """ 32 | Probability of `word`. 33 | """ 34 | return self.WORDS[word] / self.N 35 | 36 | def most_probable(self, words): 37 | _known = self.known(words) 38 | if _known: 39 | return max(_known, key=self.P) 40 | else: 41 | return [] 42 | 43 | @staticmethod 44 | def edit_step(word): 45 | """ 46 | All edits that are one edit away from `word`. 47 | """ 48 | letters = 'abcdefghijklmnopqrstuvwxyz' 49 | splits = [(word[:i], word[i:]) for i in range(len(word) + 1)] 50 | deletes = [L + R[1:] for L, R in splits if R] 51 | transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1] 52 | replaces = [L + c + R[1:] for L, R in splits if R for c in letters] 53 | inserts = [L + c + R for L, R in splits for c in letters] 54 | return set(deletes + transposes + replaces + inserts) 55 | 56 | def edits2(self, word): 57 | """ 58 | All edits that are two edits away from `word`. 59 | """ 60 | return (e2 for e1 in self.edit_step(word) 61 | for e2 in self.edit_step(e1)) 62 | 63 | def known(self, words): 64 | """ 65 | The subset of `words` that appear in the dictionary of WORDS. 66 | """ 67 | return set(w for w in words if w in self.WORDS) 68 | 69 | @staticmethod 70 | def similar(a, b): 71 | return SequenceMatcher(None, a, b).ratio() 72 | 73 | def edit_candidates(self, word, assume_wrong=False, fast=True): 74 | """ 75 | Generate possible spelling corrections for word. 76 | """ 77 | 78 | if fast: 79 | if assume_wrong: 80 | return self.known(self.edit_step(word)) or [word] 81 | else: 82 | return self.known([word]) or self.known(self.edit_step(word)) or [word] 83 | else: 84 | if assume_wrong: 85 | ttt = self.known(self.edit_step(word)) or self.known(self.edits2(word)) or {word} 86 | return ttt 87 | else: 88 | return self.known([word]) or self.known(self.edit_step(word)) or self.known(self.edits2(word)) or [word] 89 | 90 | # def distance_candidates(self, word, max_distance=3): 91 | # """ 92 | # Generate possible spelling corrections for word. 93 | # """ 94 | # candidates = [w for w in self.WORDS if w] 95 | # return self.known([word]) or self.known(self.edit_step(word)) or self.known(self.edits2(word)) or [word] 96 | 97 | @lru_cache(maxsize=65536) 98 | def correct(self, word, assume_wrong=False, fast=False): 99 | """ 100 | Most probable spelling correction for word. 101 | """ 102 | return max(self.edit_candidates(word, assume_wrong=assume_wrong, fast=fast), key=self.P) 103 | 104 | def correct_text(self, text): 105 | """ 106 | Correct all the words within a text, returning the corrected text.""" 107 | 108 | return re.sub('[a-zA-Z]+', self.correct_match, text) 109 | 110 | def correct_match(self, match): 111 | """ 112 | Spell-correct word in match, and preserve proper upper/lower/title case. 113 | """ 114 | 115 | word = match.group() 116 | return self.case_of(word)(self.correct(word.lower())) 117 | 118 | def correct_word(self, word, assume_wrong=False, fast=False): 119 | """ 120 | Spell-correct word in match, and preserve proper upper/lower/title case. 
121 | """ 122 | 123 | return self.case_of(word)(self.correct(word.lower(), assume_wrong=assume_wrong, fast=fast)) 124 | 125 | @staticmethod 126 | def case_of(text): 127 | """ 128 | Return the case-function appropriate for text: upper, lower, title, or just str. 129 | """ 130 | 131 | return (str.upper if text.isupper() else 132 | str.lower if text.islower() else 133 | str.title if text.istitle() else 134 | str) 135 | 136 | def elong_normalized_candidates(self, word, acc=None): 137 | if acc is None: 138 | acc = [] 139 | candidates = [w for w in set(word) if word.count(w) > 1] 140 | for c in candidates: 141 | _w = word.replace(c + c, c) 142 | if _w in acc: 143 | continue 144 | acc.append(_w) 145 | self.elong_normalized_candidates(_w, acc) 146 | return acc + [word] 147 | 148 | def best_elong_candidate(self, word): 149 | candidates = self.elong_normalized_candidates(word) 150 | best = self.most_probable(candidates) 151 | return best or word 152 | 153 | def normalize_elongated(self, word): 154 | return self.case_of(word)(self.best_elong_candidate(word.lower())) 155 | -------------------------------------------------------------------------------- /ekphrasis/classes/tokenizer.py: -------------------------------------------------------------------------------- 1 | import html 2 | import re 3 | 4 | import colorama 5 | from termcolor import colored 6 | 7 | from ekphrasis.classes.exmanager import ExManager 8 | 9 | 10 | class Tokenizer: 11 | social_pipeline = [ 12 | "EMOJI", "URL", "TAG", "EMAIL", "USER", "HASHTAG", 13 | "CASHTAG", "PHONE", "PERCENT", "MONEY", "DATE", "TIME", 14 | "ACRONYM", "LTR_FACE", "RTL_FACE", "CENSORED", "EMPHASIS", 15 | "REST_EMOTICONS", "NUMBER", "WORD", "EASTERN_EMOTICONS", 16 | ] 17 | default_pipeline = social_pipeline 18 | 19 | def __init__(self, pipeline=None, lowercase=False, verbose=False, 20 | debug=False): 21 | """ 22 | Args: 23 | pipeline (list): list of terms to use for tokenization. 24 | Each term, is a key from the dict of regexes `expressions.txt`. 25 | Order matters! 26 | lowercase (bool): set to True in order to lowercase the text 27 | verbose (bool): set to True to print each text after tokenization. 28 | Useful for debugging purposes. 29 | debug (bool): set to True in order to pause after tokenizing 30 | each text (wait for pressing any key). 31 | Useful for debugging purposes, if you want to inspect each text 32 | as is processed. 
33 | """ 34 | self.lowercase = lowercase 35 | self.debug = debug 36 | self.verbose = verbose 37 | colorama.init(autoreset=False, convert=False, strip=False, wrap=True) 38 | 39 | self.pipeline = [] 40 | 41 | self.regexes = ExManager().expressions 42 | 43 | if pipeline is None: 44 | pipeline = self.default_pipeline 45 | 46 | self.build(pipeline) 47 | 48 | self.pipeline.append("(?:\S)") # CATCH ALL remaining terms 49 | self.tok = re.compile(r"({})".format("|".join(self.pipeline))) 50 | 51 | def add_to_pipeline(self, term): 52 | # todo: don't wrap all terms 53 | self.pipeline.append(self.wrap_non_matching(self.regexes[term])) 54 | 55 | def build(self, pipeline): 56 | for term in pipeline: 57 | self.add_to_pipeline(term) 58 | 59 | 60 | @staticmethod 61 | def wrap_non_matching(exp): 62 | return "(?:{})".format(exp) 63 | 64 | def verbose_text(self, text, tokenized): 65 | # print(text.rstrip()) 66 | for term in tokenized: 67 | print(colored(term, 'red', attrs=["underline"]), end=" ") 68 | print() 69 | if self.debug: 70 | input() 71 | else: 72 | print() 73 | 74 | def tokenize(self, text): 75 | escaped = html.unescape(text) 76 | tokenized = self.tok.findall(escaped) 77 | 78 | if self.verbose: 79 | self.verbose_text(text, tokenized) 80 | 81 | if self.lowercase: 82 | tokenized = [t.lower() for t in tokenized] 83 | 84 | return tokenized 85 | 86 | 87 | class SocialTokenizer: 88 | """ 89 | **Deprecated** 90 | 91 | A parametric tokenizer that understands many expression found in natural 92 | language such as hashtags, dates, times, emoticons and much more. 93 | """ 94 | 95 | def __init__(self, lowercase=False, verbose=False, debug=False, **kwargs): 96 | """ 97 | 98 | Args: 99 | lowercase (bool): set to True in order to lowercase the text 100 | verbose (bool): set to True to print each text after tokenization. 101 | Useful for debugging purposes. 102 | debug (bool): set to True in order to pause after tokenizing 103 | each text (wait for pressing any key). 104 | Useful for debugging purposes, if you want to inspect each text 105 | as is processed. 
106 | 107 | Kwargs (): 108 | emojis (bool): True to keep emojis 109 | urls (bool): True to keep urls 110 | tags (bool): True to keep tags: 111 | emails (bool): True to keep emails 112 | users (bool): True to keep users handles: @cbaziotis 113 | hashtags (bool): True to keep hashtags 114 | cashtags (bool): True to keep cashtags 115 | phones (bool): True to keep phones 116 | percents (bool): True to keep percents 117 | money (bool): True to keep money expressions 118 | date (bool): True to keep date expressions 119 | time (bool): True to keep time expressions 120 | acronyms (bool): True to keep acronyms 121 | emoticons (bool): True to keep emoticons 122 | censored (bool): True to keep censored words: f**k 123 | emphasis (bool): True to keep words with emphasis: *very* good 124 | numbers (bool): True to keep numbers 125 | """ 126 | 127 | self.lowercase = lowercase 128 | self.debug = debug 129 | self.verbose = verbose 130 | colorama.init(autoreset=False, convert=False, strip=False, wrap=True) 131 | pipeline = [] 132 | self.regexes = ExManager().expressions 133 | 134 | emojis = kwargs.get("emojis", True) 135 | urls = kwargs.get("urls", True) 136 | tags = kwargs.get("tags", True) 137 | emails = kwargs.get("emails", True) 138 | users = kwargs.get("users", True) 139 | hashtags = kwargs.get("hashtags", True) 140 | cashtags = kwargs.get("cashtags", True) 141 | phones = kwargs.get("phones", True) 142 | percents = kwargs.get("percents", True) 143 | money = kwargs.get("money", True) 144 | date = kwargs.get("date", True) 145 | time = kwargs.get("time", True) 146 | acronyms = kwargs.get("acronyms", True) 147 | emoticons = kwargs.get("emoticons", True) 148 | censored = kwargs.get("censored", True) 149 | emphasis = kwargs.get("emphasis", True) 150 | numbers = kwargs.get("numbers", True) 151 | 152 | if urls: 153 | pipeline.append(self.regexes["URL"]) 154 | 155 | if tags: 156 | pipeline.append(self.regexes["TAG"]) 157 | 158 | if emails: 159 | pipeline.append(self.wrap_non_matching(self.regexes["EMAIL"])) 160 | 161 | if users: 162 | pipeline.append(self.wrap_non_matching(self.regexes["USER"])) 163 | 164 | if hashtags: 165 | pipeline.append(self.wrap_non_matching(self.regexes["HASHTAG"])) 166 | 167 | if cashtags: 168 | pipeline.append(self.wrap_non_matching(self.regexes["CASHTAG"])) 169 | 170 | if phones: 171 | pipeline.append(self.wrap_non_matching(self.regexes["PHONE"])) 172 | 173 | if percents: 174 | pipeline.append(self.wrap_non_matching(self.regexes["PERCENT"])) 175 | 176 | if money: 177 | pipeline.append(self.wrap_non_matching(self.regexes["MONEY"])) 178 | 179 | if date: 180 | pipeline.append(self.wrap_non_matching(self.regexes["DATE"])) 181 | 182 | if time: 183 | pipeline.append(self.wrap_non_matching(self.regexes["TIME"])) 184 | 185 | if acronyms: 186 | pipeline.append(self.wrap_non_matching(self.regexes["ACRONYM"])) 187 | 188 | if emoticons: 189 | pipeline.append(self.regexes["LTR_FACE"]) 190 | pipeline.append(self.regexes["RTL_FACE"]) 191 | 192 | if censored: 193 | pipeline.append(self.wrap_non_matching(self.regexes["CENSORED"])) 194 | 195 | if emphasis: 196 | pipeline.append(self.wrap_non_matching(self.regexes["EMPHASIS"])) 197 | 198 | # terms like 'eco-friendly', 'go_to', 'john's' - maybe remove the ' or add a parameter for it 199 | # pipeline.append(r"(?:\b[a-zA-Z]+[a-zA-Z'\-_]+[a-zA-Z]+\b)") 200 | 201 | # <3 ^5 202 | if emoticons: 203 | pipeline.append( 204 | self.wrap_non_matching(self.regexes["REST_EMOTICONS"])) 205 | 206 | if numbers: 207 | pipeline.append(self.regexes["NUMBER"]) 208 | 209 | 
if emojis: 210 | pipeline.append(self.regexes["EMOJI"]) 211 | 212 | # any other word 213 | pipeline.append(self.regexes["WORD"]) 214 | 215 | # EASTERN EMOTICONS - (^_^;) (>_<)> \(^o^)/ 216 | if emoticons: 217 | pipeline.append( 218 | self.wrap_non_matching(self.regexes["EASTERN_EMOTICONS"])) 219 | 220 | # keep repeated puncts as one term 221 | # pipeline.append(r"") 222 | 223 | pipeline.append("(?:\S)") # CATCH ALL remaining terms 224 | 225 | self.tok = re.compile(r"({})".format("|".join(pipeline))) 226 | 227 | @staticmethod 228 | def wrap_non_matching(exp): 229 | return "(?:{})".format(exp) 230 | 231 | def verbose_text(self, text, tokenized): 232 | # print(text.rstrip()) 233 | for term in tokenized: 234 | print(colored(term, 'red', attrs=["underline"]), end=" ") 235 | print() 236 | if self.debug: 237 | input() 238 | else: 239 | print() 240 | 241 | def tokenize(self, text): 242 | escaped = html.unescape(text) 243 | tokenized = self.tok.findall(escaped) 244 | 245 | if self.verbose: 246 | self.verbose_text(text, tokenized) 247 | 248 | if self.lowercase: 249 | tokenized = [t.lower() for t in tokenized] 250 | 251 | return tokenized 252 | 253 | # sentences = [] 254 | 255 | # [print(s) for s in sentences] 256 | # tokenizer = SocialTokenizer(debug=True, verbose=True) 257 | # 258 | # for s in sentences: 259 | # tokenizer.tokenize(s) 260 | -------------------------------------------------------------------------------- /ekphrasis/dicts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbaziotis/ekphrasis/309b6b089bb1ebaed705ba9ffa584f1826e296d4/ekphrasis/dicts/__init__.py -------------------------------------------------------------------------------- /ekphrasis/dicts/emoticons.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | # todo:catch repeating parenthesis 4 | emoticons = { 5 | ':*': '', 6 | ':-*': '', 7 | ':x': '', 8 | ':-)': '', 9 | ':-))': '', 10 | ':-)))': '', 11 | ':-))))': '', 12 | ':-)))))': '', 13 | ':-))))))': '', 14 | ':)': '', 15 | ':))': '', 16 | ':)))': '', 17 | ':))))': '', 18 | ':)))))': '', 19 | ':))))))': '', 20 | ':)))))))': '', 21 | ':o)': '', 22 | ':]': '', 23 | ':3': '', 24 | ':c)': '', 25 | ':>': '', 26 | '=]': '', 27 | '8)': '', 28 | '=)': '', 29 | ':}': '', 30 | ':^)': '', 31 | '|;-)': '', 32 | ":'-)": '', 33 | ":')": '', 34 | '\o/': '', 35 | '*\\0/*': '', 36 | ':-D': '', 37 | ':D': '', 38 | # '(\':': '', 39 | '8-D': '', 40 | '8D': '', 41 | 'x-D': '', 42 | 'xD': '', 43 | 'X-D': '', 44 | 'XD': '', 45 | '=-D': '', 46 | '=D': '', 47 | '=-3': '', 48 | '=3': '', 49 | 'B^D': '', 50 | '>:[': '', 51 | ':-(': '', 52 | ':-((': '', 53 | ':-(((': '', 54 | ':-((((': '', 55 | ':-(((((': '', 56 | ':-((((((': '', 57 | ':-(((((((': '', 58 | ':(': '', 59 | ':((': '', 60 | ':(((': '', 61 | ':((((': '', 62 | ':(((((': '', 63 | ':((((((': '', 64 | ':(((((((': '', 65 | ':((((((((': '', 66 | ':-c': '', 67 | ':c': '', 68 | ':-<': '', 69 | ':<': '', 70 | ':-[': '', 71 | ':[': '', 72 | ':{': '', 73 | ':-||': '', 74 | ':@': '', 75 | ":'-(": '', 76 | ":'(": '', 77 | 'D:<': '', 78 | 'D:': '', 79 | 'D8': '', 80 | 'D;': '', 81 | 'D=': '', 82 | 'DX': '', 83 | 'v.v': '', 84 | "D-':": '', 85 | '(>_<)': '', 86 | ':|': '', 87 | '>:O': '', 88 | ':-O': '', 89 | ':-o': '', 90 | ':O': '', 91 | '°o°': '', 92 | 'o_O': '', 93 | 'o_0': '', 94 | 'o.O': '', 95 | 'o-o': '', 96 | '8-0': '', 97 | '|-O': '', 98 | ';-)': '', 99 | ';)': '', 100 | '*-)': '', 101 | '*)': '', 102 | ';-]': 
'', 103 | ';]': '', 104 | ';D': '', 105 | ';^)': '', 106 | ':-,': '', 107 | '>:P': '', 108 | ':-P': '', 109 | ':P': '', 110 | 'X-P': '', 111 | 'x-p': '', 112 | 'xp': '', 113 | 'XP': '', 114 | ':-p': '', 115 | ':p': '', 116 | '=p': '', 117 | ':-Þ': '', 118 | ':Þ': '', 119 | ':-b': '', 120 | ':b': '', 121 | ':-&': '', 122 | '>:\\': '', 123 | '>:/': '', 124 | ':-/': '', 125 | ':-.': '', 126 | ':/': '', 127 | ':\\': '', 128 | '=/': '', 129 | '=\\': '', 130 | ':L': '', 131 | '=L': '', 132 | ':S': '', 133 | '>.<': '', 134 | ':-|': '', 135 | '<:-|': '', 136 | ':-X': '', 137 | ':X': '', 138 | ':-#': '', 139 | ':#': '', 140 | 'O:-)': '', 141 | '0:-3': '', 142 | '0:3': '', 143 | '0:-)': '', 144 | '0:)': '', 145 | '0;^)': '', 146 | '>:)': '', 147 | '>:D': '', 148 | '>:-D': '', 149 | '>;)': '', 150 | '>:-)': '', 151 | '}:-)': '', 152 | '}:)': '', 153 | '3:-)': '', 154 | '3:)': '', 155 | 'o/\o': '', 156 | '^5': '', 157 | '>_>^': '', 158 | '^<_<': '', # todo:fix tokenizer - MISSES THIS 159 | '<3': '' 160 | } 161 | 162 | # todo: clear this mess 163 | pattern = re.compile("^[:=\*\-\(\)\[\]x0oO\#\<\>8\\.\'|\{\}\@]+$") 164 | mirror_emoticons = {} 165 | for exp, tag in emoticons.items(): 166 | if pattern.match(exp) \ 167 | and any(ext in exp for ext in [";", ":", "="]) \ 168 | and not any(ext in exp for ext in ["L", "D", "p", "P", "3"]): 169 | mirror = exp[::-1] 170 | 171 | if "{" in mirror: 172 | mirror = mirror.replace("{", "}") 173 | elif "}" in mirror: 174 | mirror = mirror.replace("}", "{") 175 | 176 | if "(" in mirror: 177 | mirror = mirror.replace("(", ")") 178 | elif ")" in mirror: 179 | mirror = mirror.replace(")", "(") 180 | 181 | if "<" in mirror: 182 | mirror = mirror.replace("<", ">") 183 | elif ">" in mirror: 184 | mirror = mirror.replace(">", "<") 185 | 186 | if "[" in mirror: 187 | mirror = mirror.replace("[", "]") 188 | elif "]" in mirror: 189 | mirror = mirror.replace("]", "[") 190 | 191 | if "\\" in mirror: 192 | mirror = mirror.replace("\\", "/") 193 | elif "/" in mirror: 194 | mirror = mirror.replace("/", "\\") 195 | 196 | # print(exp + "\t\t" + mirror) 197 | mirror_emoticons[mirror] = tag 198 | emoticons.update(mirror_emoticons) 199 | 200 | for exp, tag in list(emoticons.items()): 201 | if exp.lower() not in emoticons: 202 | emoticons[exp.lower()] = tag 203 | 204 | emoticon_groups = { 205 | "positive": {'', '', '', ''}, 206 | "negative": {'', '', } 207 | } 208 | 209 | 210 | def print_positive(sentiment): 211 | for e, tag in emoticons.items(): 212 | if tag in emoticon_groups[sentiment]: 213 | print(e) 214 | 215 | # print_positive("negative") 216 | # print(" ".join(list(emoticons.keys()))) 217 | # [print(e) for e in list(emoticons.keys())] 218 | -------------------------------------------------------------------------------- /ekphrasis/dicts/noslang/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbaziotis/ekphrasis/309b6b089bb1ebaed705ba9ffa584f1826e296d4/ekphrasis/dicts/noslang/__init__.py -------------------------------------------------------------------------------- /ekphrasis/dicts/noslang/manager.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pickle 4 | 5 | 6 | def read_slangdict(): 7 | filename = os.path.join(os.path.dirname(__file__), "slangdict.pickle") 8 | if os.path.isfile(filename): 9 | print("Reading data...") 10 | data = pickle.load(open(filename, 'rb')) 11 | return data 12 | 
-------------------------------------------------------------------------------- /ekphrasis/dicts/noslang/slangdict.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbaziotis/ekphrasis/309b6b089bb1ebaed705ba9ffa584f1826e296d4/ekphrasis/dicts/noslang/slangdict.pickle -------------------------------------------------------------------------------- /ekphrasis/dicts/sentiment/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbaziotis/ekphrasis/309b6b089bb1ebaed705ba9ffa584f1826e296d4/ekphrasis/dicts/sentiment/__init__.py -------------------------------------------------------------------------------- /ekphrasis/dicts/sentiment/nrc_emolex/NRCEmolex.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | import pickle 4 | 5 | ''' 6 | NRC Word-Emotion Association Lexicon (aka EmoLex) (14.000 entries) 7 | -------------------------------------- 8 | format = dictionary with entries like this: 9 | word1={'negative': 0.0, 'positive': 1.0, 'surprise': 0.0, 'trust': 0.0, 'joy': 1.0, 'fear': 0.0, 'anticipation': 0.0, 'sadness': 0.0, 'anger': 0.0, 'disgust': 0.0} 10 | ''' 11 | 12 | 13 | class NRCEmolex: 14 | def __init__(self): 15 | super().__init__() 16 | self.raw_filename = "NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt" 17 | self.parsed_filename = "emolex.pickle" 18 | 19 | def write(self): 20 | if os.path.exists( 21 | os.path.join(os.path.dirname(__file__), self.raw_filename)): 22 | with open( 23 | os.path.join(os.path.dirname(__file__), self.raw_filename), 24 | "r") as f: 25 | reader = csv.reader(f, delimiter="\t") 26 | reader = list(reader) 27 | lexicon = {} 28 | for row in reader: 29 | # lexicon[row[0]][row[1]] = float(row[2]) 30 | lexicon.setdefault(row[0], {})[row[1]] = float(row[2]) 31 | 32 | for k, v in lexicon.items(): 33 | polarity = 0 34 | if lexicon[k]["positive"]: 35 | polarity = 1 36 | elif lexicon[k]["negative"]: 37 | polarity = -1 38 | lexicon[k]["polarity"] = polarity 39 | 40 | lexicon[k]["emotions"] = [v['fear'], v['sadness'], 41 | v['trust'], v['disgust'], 42 | v['surprise'], 43 | v['anger'], v['joy'], 44 | v['anticipation']] 45 | 46 | with open(self.parsed_filename, 'wb') as pickle_file: 47 | pickle.dump(lexicon, pickle_file) 48 | else: 49 | print("input file not found!") 50 | 51 | def read(self): 52 | if os.path.exists( 53 | os.path.join(os.path.dirname(__file__), self.parsed_filename)): 54 | with open(os.path.join(os.path.dirname(__file__), 55 | self.parsed_filename), 'rb') as f: 56 | data = pickle.load(f) 57 | return data 58 | else: 59 | self.write() 60 | return self.read() 61 | 62 | # NRCEmolex().write() 63 | # NRCEmolex().read() 64 | -------------------------------------------------------------------------------- /ekphrasis/dicts/sentiment/nrc_emolex/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbaziotis/ekphrasis/309b6b089bb1ebaed705ba9ffa584f1826e296d4/ekphrasis/dicts/sentiment/nrc_emolex/__init__.py -------------------------------------------------------------------------------- /ekphrasis/examples/SentReadMe.md: -------------------------------------------------------------------------------- 1 | ## Sentiment Analysis Example 2 | 3 | ```python 4 | from ekphrasis.classes.preprocessor import TextPreProcessor 5 | from ekphrasis.classes.tokenizer import SocialTokenizer 6 | from 
ekphrasis.utils.nlp import polarity 7 | 8 | sentences = [ 9 | "So there is no way for me to plug it in here in the US unless I go by a converter.", 10 | "Good case, Excellent value.", 11 | "Works great!", 12 | 'The design is very odd, as the ear "clip" is not very comfortable at all.', 13 | "Needless to say, I wasted my money." 14 | ] 15 | 16 | # define preprocessing pipeline 17 | text_processor = TextPreProcessor( 18 | fix_text=True, 19 | unpack_contractions=True, 20 | tokenizer=SocialTokenizer(lowercase=True).tokenize, 21 | ) 22 | 23 | # pass each sentence through the pipeline 24 | tokenized_sentences = list(text_processor.pre_process_docs(sentences)) 25 | for sent in tokenized_sentences: 26 | _polarity, _scores = polarity(sent) 27 | print("{:.4f}\t".format(_polarity) + " ".join(sent)) 28 | ``` 29 | 30 | #### Output 31 | 32 | ```shell 33 | 0.0139 so there is no way for me to plug it in here in the us unless i go by a converter . 34 | 0.3750 good case , excellent value . 35 | 0.0000 works great ! 36 | 0.0500 the design is very odd , as the ear " clip " is not very comfortable at all . 37 | 0.0500 needless to say , i wasted my money . 38 | ``` -------------------------------------------------------------------------------- /ekphrasis/examples/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created by Christos Baziotis. 3 | """ 4 | -------------------------------------------------------------------------------- /ekphrasis/examples/demo_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created by Christos Baziotis. 3 | """ 4 | 5 | demo_sents = [ 6 | "CANT WAIT for the new season of #TwinPeaks \(^o^)/ yaaaay!!! #davidlynch #tvseries :)))", 7 | "I saw the new #johndoe movie and it suuuuucks!!! WAISTED $10... #badmovies >3:/", 8 | "@SentimentSymp: can't wait for the Nov 9 #Sentiment talks! YAAAAAAY !!! >:-D http://sentimentsymposium.com/.", 9 | ] 10 | -------------------------------------------------------------------------------- /ekphrasis/examples/demo_ext.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created by Christos Baziotis. 3 | """ 4 | from ekphrasis.classes.tokenizer import SocialTokenizer 5 | 6 | 7 | social_tokenizer = SocialTokenizer(lowercase=False).tokenize 8 | 9 | sents = [ 10 | "CANT WAIT for the new season of #TwinPeaks \(^o^)/ yaaaay!!! #davidlynch #tvseries :)))", 11 | "I saw the new #johndoe movie and it suuuuucks!!! WAISTED $10... #badmovies 3:/", 12 | "@SentimentSymp: can't wait for the Nov 9 #Sentiment talks! YAAAAAAY !!! 
>:-D http://sentimentsymposium.com/.", 13 | ] 14 | 15 | for s in sents: 16 | print("SC : ", social_tokenizer(s)) # social tokenizer 17 | -------------------------------------------------------------------------------- /ekphrasis/examples/demo_segmenter.py: -------------------------------------------------------------------------------- 1 | from ekphrasis.classes.segmenter import Segmenter 2 | 3 | # segmenter using the word statistics from english Wikipedia 4 | seg_eng = Segmenter(corpus="english") 5 | 6 | # segmenter using the word statistics from Twitter 7 | seg_tw = Segmenter(corpus="twitter") 8 | 9 | # segmenter using the word statistics from Twitter 10 | seg_tw_2018 = Segmenter(corpus="twitter_2018") 11 | 12 | words = ["exponentialbackoff", "gamedev", "retrogaming", "thewatercooler", 13 | "panpsychism"] 14 | for w in words: 15 | print(w) 16 | print("(eng):", seg_eng.segment(w)) 17 | print("(tw):", seg_tw.segment(w)) 18 | print("(tw):", seg_tw_2018.segment(w)) 19 | print() 20 | -------------------------------------------------------------------------------- /ekphrasis/examples/demo_tok.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created by Christos Baziotis. 3 | """ 4 | import nltk 5 | 6 | from ekphrasis.classes.tokenizer import SocialTokenizer, Tokenizer 7 | 8 | 9 | def wsp_tokenizer(text): 10 | return text.split(" ") 11 | 12 | 13 | puncttok = nltk.WordPunctTokenizer().tokenize 14 | 15 | social_tokenizer = SocialTokenizer(lowercase=False).tokenize 16 | mytokenizer = Tokenizer(lowercase=False).tokenize 17 | 18 | sents = [ 19 | # "CANT WAIT for the new season of #TwinPeaks \(^o^)/ yaaaay!!! #davidlynch #tvseries :)))", 20 | # "@Calum5SOS You lil *poop* please follow @EmilyBain224 ☺️💕", 21 | # "I saw the new #johndoe movie and it suuuuucks!!! WAISTED $10... #badmovies 3:/", 22 | # "@SentimentSymp: can't wait for the Nov 9 #Sentiment talks! YAAAAAAY !!! >:-D http://sentimentsymposium.com/.", 23 | # "Words attendees would use to describe @prosper4africa's #ALN 2015! https://t.co/hmNm8AdwOh", 24 | "@TheTideDrew Hi, Drew! I can't wait to see you!☺ Just letting you know that you'll always be my spidey, I love you!💕 Mind following me? 
x215", 25 | ] 26 | 27 | for s in sents: 28 | print() 29 | # print("ORG: ", s) # original sentence 30 | # print("WSP : ", wsp_tokenizer(s)) # whitespace tokenizer 31 | # print("WPU : ", puncttok(s)) # WordPunct tokenizer 32 | print("SC : ", social_tokenizer(s)) # social tokenizer 33 | # print("SC : ", mytokenizer(s)) # social tokenizer 34 | -------------------------------------------------------------------------------- /ekphrasis/examples/example.py: -------------------------------------------------------------------------------- 1 | from ekphrasis.classes.preprocessor import TextPreProcessor 2 | from ekphrasis.classes.tokenizer import SocialTokenizer 3 | from ekphrasis.dicts.emoticons import emoticons 4 | 5 | 6 | def ws_tokenizer(text): 7 | return text.split() 8 | 9 | 10 | text_processor = TextPreProcessor( 11 | normalize=['url', 'email', 'percent', 'money', 'phone', 'user', 'time', 12 | 'date', 'number'], 13 | annotate={"hashtag", "elongated", "allcaps", "repeated", 'emphasis', 14 | 'censored'}, 15 | all_caps_tag="wrap", 16 | fix_text=True, 17 | segmenter="twitter_2018", 18 | corrector="twitter_2018", 19 | unpack_hashtags=True, 20 | unpack_contractions=True, 21 | spell_correct_elong=False, 22 | tokenizer=SocialTokenizer(lowercase=True).tokenize, 23 | # tokenizer=ws_tokenizer, 24 | dicts=[emoticons] 25 | ) 26 | 27 | sentences = [ 28 | "CANT WAIT for the new season of #TwinPeaks \(^o^)/!!! #davidlynch #tvseries :))) ", 29 | "I saw the new #johndoe movie and it suuuuucks!!! WAISTED $10... #badmovies :/", 30 | "I saw the new #JOHNDOE movie AND IT SUCKS!!! WAISTED $10... #badmovies :/", 31 | "@SentimentSymp: can't wait for the Nov 9 #Sentiment talks! YAAAAAAY !!! :-D http://sentimentsymposium.com/.", 32 | "Thanks x https://t.co/ZXTcDLyDS9", 33 | "@Calum5SOS You lil poop please follow @EmilyBain224 ☺️💕", 34 | "Words attendees would use to describe @prosper4africa's #ALN2015! 
https://t.co/hmNm8AdwOh", 35 | ] 36 | 37 | for s in sentences: 38 | print() 39 | print(s) 40 | print(" ".join(text_processor.pre_process_doc(s))) 41 | -------------------------------------------------------------------------------- /ekphrasis/examples/segmenter_diffs.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbaziotis/ekphrasis/309b6b089bb1ebaed705ba9ffa584f1826e296d4/ekphrasis/examples/segmenter_diffs.pickle -------------------------------------------------------------------------------- /ekphrasis/examples/segmenter_diffs.txt: -------------------------------------------------------------------------------- 1 | tags english twitter 2 | 0 #mixitup mixitup mix it up 3 | 1 #traderlivetweet2015 trader live tweet2015 trader live tweet 2015 4 | 2 #dsma dsma ds ma 5 | 3 #catfished catfish ed catfished 6 | 4 #kpopers kpop ers kpopers 7 | 5 #vmabert vma bert vma be rt 8 | 6 #weeeeeee we eeeeee weeeeeee 9 | 7 #smwldn smw ldn sm wldn 10 | 8 #drinkalittledrink drinka little drink drink alittle drink 11 | 9 #hibeees hi be ees hibee es 12 | 10 #soccernews soccer news soccernews 13 | 11 #amavi am avi amavi 14 | 12 #socinn soc inn so cinn 15 | 13 #linearlagebra linear la gebra linear lage bra 16 | 14 #iahsfb iahs fb ia hs fb 17 | 15 #ullukapatha ul luka patha ullu ka pa tha 18 | 16 #ootd oo td ootd 19 | 17 #gooners go on ers gooners 20 | 18 #afcbcst afc bc st afcb cst 21 | 19 #pascon pasc on pas con 22 | 20 #rockonrockall rock on rockall rock on rock all 23 | 21 #may4thbewithyou may 4thbe with you may4thbe with you 24 | 22 #tripofalifetime trip of alife time trip ofa lifetime 25 | 23 #vscogrid vs co grid vsco grid 26 | 24 #thewatercooler the water cooler the watercooler 27 | 25 #wewillmakeadate we will makea date we will make adate 28 | 26 #ccot cc ot ccot 29 | 27 #sqlsaturday sql saturday sqlsaturday 30 | 28 #bbuk bb uk bbuk 31 | 29 #wawaw wa waw wawaw 32 | 30 #btowneats bt own eats btown eats 33 | 31 #suvelo su velo suv elo 34 | 32 #ooohkillem oooh kill em oooh killem 35 | 33 #ifollowback if ol low back ifollowback 36 | 34 #tvtag tv tag tvtag 37 | 35 #yall ya ll yall 38 | 36 #ijustnerdedsohardmyglassesbroke ijustnerdedsohard my glasses broke ijust nerded so hard my glasses broke 39 | 37 #doingjusticeishine doing justice ishine doing justice is hine 40 | 38 #iwish iwi sh iwish 41 | 39 #scfafootball scfa football sc fa football 42 | 40 #teuchters te uchte rs teuchters 43 | 41 #donda don da donda 44 | 42 #itsgonnabeagoodday its gonna bea goodday its gonna be agood day 45 | 43 #applewatch applewatch apple watch 46 | 44 #bantemweightchampuonship ban tem weight champu on ship ban tem weight champ uon ship 47 | 45 #itoa itoa it oa 48 | 46 #buzzin buzz in buzzin 49 | 47 #bringbacksungwoo bring back sung woo bring back sungwoo 50 | 48 #jerzday jerz day jerzday 51 | 49 #bluemix blue mix bluemix 52 | 50 #strikeforce strikeforce strike force 53 | 51 #oomf oo mf oomf 54 | 52 #webradio web radio webradio 55 | 53 #goggleeyedhomonculus goggle eyed homonculus goggle eyed ho mon cul us 56 | 54 #risolutore risoluto re riso luto re 57 | 55 #likeaboss like abo ss likeaboss 58 | 56 #ffshoes ff shoes ffs hoes 59 | 57 #bhramabull bhrama bull bh rama bull 60 | 58 #vsco vs co vsco 61 | 59 #sockersweek sockers week so ckers week 62 | 60 #bbcupdat bbc up dat bbc updat 63 | 61 #vhappy vh appy v happy 64 | 62 #fridayfunday friday fun day friday funday 65 | 63 #indveng ind veng in dv eng 66 | 64 #bdayvaycay bdayvaycay bday vaycay 67 | 65 #vscoedit vs co 
edit vsco edit 68 | 66 #miss2012 miss2012 miss 2012 69 | 67 #cantsingforshit can tsing for shit cant sing for shit 70 | 68 #missyallboyz missy all boyz miss yall boyz 71 | 69 #chessdom chessdom chess dom 72 | 70 #retrogaming retrogaming retro gaming 73 | 71 #lowkey low key lowkey 74 | 72 #wwfc wwfc ww fc 75 | 73 #rqwn rqwn rq wn 76 | 74 #aumrsch aum rsch au mrs ch 77 | 75 #2012trip 2012trip 2012 trip 78 | 76 #latechsuckd la tech su ckd la tech suckd 79 | 77 #chsocm ch so cm chs ocm 80 | 78 #sedc sedc se dc 81 | 79 #guardiannews guardian news guardiannews 82 | 80 #whby whby wh by 83 | 81 #xc-dsign xc dsign xc d sign 84 | 82 #oolegooo oo leg ooo oo legooo 85 | 83 #brockuproblems brocku problems broc ku problems 86 | 84 #iamadad iam ad ad iama dad 87 | 85 #gojetsgo go jetsgo go jets go 88 | 86 #oann oa nn oann 89 | 87 #straya st raya straya 90 | 88 #sarurdaykitchen sar ur day kitchen sarurday kitchen 91 | 89 #vscoitaly vs co italy vsco italy 92 | 90 #motog mot og motog 93 | 91 #topoli topoli to poli 94 | 92 #destinythegame destiny the game destinythegame 95 | 93 #vegvegveg vegvegveg veg veg veg 96 | 94 #samsung-note samsung note samsung note 97 | 95 #loveyouuuuu love you uu uu love youuuuu 98 | 96 #bbcqt bbc qt bbcqt 99 | 97 #jellybean jellybean jelly bean 100 | 98 #anglamigeh angla mig eh ang lamig eh 101 | 99 #rekt re kt rekt 102 | 100 #ciscos cis cos ciscos 103 | 101 #socringeworthy so cringe worthy so cringeworthy 104 | 102 #flumplover flum plover flump lover 105 | 103 #iwishitwerebetter iwi shit were better iwish it were better 106 | 104 #asematy asema ty as ema ty 107 | 105 #acwsgothenburg acws gothenburg ac ws gothenburg 108 | 106 #nyfw ny fw nyfw 109 | 107 #48hrsnosleep 48hrs nosleep 48hrs no sleep 110 | 108 #notenoughhoursinaday not enough hours in aday not enough hours ina day 111 | 109 #vansonmans van son mans vans on mans 112 | 110 #perksofbeingawallflower perks of being aw all flower perks of being awall flower 113 | 111 #oooosh oo oo sh oooosh 114 | 112 #awaydays awaydays away days 115 | 113 #scenez sc enez scen ez 116 | 114 #smdh sm dh smdh 117 | 115 #kanye2020 kanye2020 kanye 2020 118 | 116 #imissmybrother im iss my brother imiss my brother 119 | 117 #damnnnnnn dam nnn nnn damnnnnnn 120 | 118 #starbuzz star buzz starbuzz 121 | 119 #profgetcrunk prof getcrunk prof get crunk 122 | 120 #deflategate deflate gate deflategate 123 | 121 #bestlunche best lun che best lunc he 124 | 122 #idid id id idid 125 | 123 #suchakid such akid sucha kid 126 | 124 #meetthepress meet the press meetthepress 127 | 125 #stillkidrauhl still kid ra uhl still kidrauhl 128 | 126 #appletv3 apple tv3 apple tv 3 129 | 127 #justgirlythings just girly things justgirlythings 130 | 128 #skimmlife ski mm life skimm life 131 | 129 #samac samac sa mac 132 | 130 #bizitalk bizi talk bizitalk 133 | 131 #tunewiki tune wiki tunewiki 134 | 132 #kamcord kam cord kamcord 135 | 133 #coybig coy big coybig 136 | 134 #ecodesign ecodesign eco design 137 | 135 #dastal das tal da stal 138 | 136 #fitn fi tn fitn 139 | 137 #ffxiv ff xiv ffxiv 140 | 138 #ladsontour ladson tour lads on tour 141 | 139 #netezza netezza net ezza 142 | 140 #quatchi quatchi quat chi 143 | 141 #adultshit adults hit adult shit 144 | 142 #hamont hamont ha mont 145 | 143 #aggeliesergasias agge lies ergasias aggeliesergasias 146 | 144 #blackbirdgang blackbird gang black bird gang 147 | 145 #whatabadass what abad ass whata badass 148 | 146 #igers ige rs igers 149 | 147 #noonecares noone cares no one cares 150 | 148 #teamfollowback team follow back teamfollowback 
151 | 149 #getnthefucoutofhere get nthe fuc out of here getn the fuc out of here 152 | 150 #vscocam vs co cam vsco cam 153 | 151 #jbiebs jbi ebs jbiebs 154 | 152 #dragoncon dragon con dragoncon 155 | 153 #ihateschool iha te school ihate school 156 | 154 #kubball kubb all ku bball 157 | 155 #fuckuover fuc ku over fucku over 158 | 156 #auspol aus pol auspol 159 | 157 #pimpnjoy pim pn joy pimp njoy 160 | 158 #miaafb mia afb miaa fb 161 | 159 #indianamensbasetball indiana mens baset ball indiana mens base tball 162 | 160 #ahhhhhh ahhhh hh ahhhhhh 163 | 161 #catherinebelll catherine bel ll catherine belll 164 | 162 #nacamam nac am am na cam am 165 | 163 #socent soc ent so cent 166 | 164 #ladieshereicome ladies he rei come ladies here icome 167 | 165 #bango bango ban go 168 | 166 #euref euref eu ref 169 | 167 #1005chunjiday 1005chunjiday 1005chunji day 170 | 168 #tweetpic66 tweetpic66 tweet pic 66 171 | 169 #uncareingworld un care ing world un careing world 172 | 170 #tlot tlot tl ot 173 | 171 #socbiz soc biz so cbiz 174 | 172 #juvederm juve derm juvederm 175 | 173 #makeamove makea move make amo ve 176 | 174 #wordtomymuva word to my mu va word to my muva 177 | 175 #knickstape knicks tape knickstape 178 | 176 #whatapair what ap air whata pair 179 | 177 #fmlll fm lll fmlll 180 | 178 #tcot tc ot tcot 181 | 179 #shepherdshut shepherd shut shepherds hut 182 | 180 #gapol ga pol gap ol 183 | 181 #thinkaboutitnobhead think about it nob head think about it nobhead 184 | 182 #novaturient no vaturi ent nov at uri ent 185 | 183 #treatyoself treat yo self treat yoself 186 | 184 #tville tv ille tville 187 | 185 #speedoflight speedoflight speed of light 188 | 186 #ecigs eci gs ecigs 189 | 187 #delange9 de lange 9 delange9 190 | 188 #chavbants cha vb ants chav bants 191 | 189 #bountygate bounty gate bountygate 192 | 190 #worldie world ie worldie 193 | 191 #gacky ga cky gac ky 194 | 192 #looooser lo ooo ser looooser 195 | 193 #goodday goodday good day 196 | 194 #ivaluemylife iv alue my life iva lue my life 197 | 195 #trndnl trn dnl trnd nl 198 | 196 #nobrollies nob rollies no brollies 199 | 197 #ultimatfighterfridays ulti mat fighter fridays ultimat fighter fridays 200 | 198 #istillloveeastview is till love eastview istill love eastview 201 | 199 #str8likedat str8likedat str8 like dat 202 | 200 #freeghoncheh free ghon cheh free ghoncheh 203 | 201 #dbacks db acks dbacks 204 | 202 #lallysmarine la llys marine lal lys marine 205 | 203 #cmon cm on cmon 206 | 204 #dohh do hh dohh 207 | 205 #brexit br exit brexit 208 | 206 #wahh wa hh wahh 209 | 207 #bbcnewsline bbc newsline bbc news line 210 | 208 #presstitutes press tit utes presstitutes 211 | 209 #illbringabottlenexttime ill brin ga bottle next time ill bring abott le next time 212 | 210 #byeeeee bye eeee byeeeee 213 | 211 #mumfords mum fords mumfords 214 | 212 #ihope iho pe ihope 215 | 213 #kingturnezbet king turn ezbet king turn ez bet 216 | 214 #pray4me pray4me pray4 me 217 | 215 #trump2016 trump2016 trump 2016 218 | 216 #unforgetable un forget able unforgetable 219 | 217 #iwontbesocial iwo nt be social iwont be social 220 | 218 #godeacs gode acs go deacs 221 | 219 #loverboy loverboy lover boy 222 | 220 #amazeballs amaze balls amazeballs 223 | 221 #adorbs ad orbs adorbs 224 | 222 #ishouldjustwatchthat is hould just watch that ishould just watch that 225 | 223 #bruuh bru uh bruuh 226 | 224 #unionjfollowme unio nj follow me unionj follow me 227 | 225 #nekroman nekroman nek roman 228 | 226 #bronnygate bron ny gate bronny gate 229 | 227 #dcfcfans dc fc fans dcfc 
fans 230 | 228 #onehellofanighttour one hello fa night tour one hell of an ight tour 231 | 229 #bentleyvolleyballl bentley volleyball l bentley volley balll 232 | 230 #awaydaysonly awaydays only away days only 233 | 231 #carly2016 carly2016 carly 2016 234 | 232 #lituania lituania lit ua nia 235 | 233 #saddos sad dos saddos 236 | 234 #thunde thun de thunde 237 | 235 #throwawaydogs throwaway dogs throw away dogs 238 | 236 #lambily lamb ily lambily 239 | 237 #cyber1news cyber1 news cyber 1 news 240 | 238 #sorrynotsorry sorry not sorry sorrynotsorry 241 | 239 #nextime nex time nextime 242 | 240 #iusocc ius occ iu socc 243 | 241 #motwyw motwyw motw yw 244 | 242 #dressinglikeaslutcomeswithaprice dressing like as lut comes with ap rice dressing like asl ut comes wit ha price 245 | 243 #backtostroz back to st roz back to stroz 246 | 244 #gamedev gamedev game dev 247 | 245 #2012shit 2012shit 2012 shit 248 | 246 #ocra ocra oc ra 249 | 247 #ekloges ek loges ek log es 250 | 248 #xoxoxo xo xo xo xoxoxo 251 | 249 #oopsididitagain oops ididit again oops idid it again 252 | 250 #fuckkkk fuck kkk fuckkkk 253 | 251 #theellenshow the ellen show theellenshow -------------------------------------------------------------------------------- /ekphrasis/examples/sentiment.py: -------------------------------------------------------------------------------- 1 | from ekphrasis.classes.preprocessor import TextPreProcessor 2 | from ekphrasis.classes.tokenizer import SocialTokenizer 3 | from ekphrasis.utils.nlp import polarity 4 | 5 | sentences = [ 6 | "So there is no way for me to plug it in here in the US unless I go by a converter.", 7 | "Good case, Excellent value.", 8 | "Works great!", 9 | 'The design is very odd, as the ear "clip" is not very comfortable at all.', 10 | "Needless to say, I wasted my money." 
11 | ] 12 | 13 | # define preprocessing pipeline 14 | text_processor = TextPreProcessor( 15 | fix_text=True, 16 | unpack_contractions=True, 17 | tokenizer=SocialTokenizer(lowercase=True).tokenize, 18 | ) 19 | 20 | # pass each sentence through the pipeline 21 | tokenized_sentences = list(text_processor.pre_process_docs(sentences)) 22 | for sent in tokenized_sentences: 23 | _polarity, _scores = polarity(sent) 24 | print("{:.4f}\t".format(_polarity) + " ".join(sent)) 25 | -------------------------------------------------------------------------------- /ekphrasis/examples/word_segmentation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | "Reading english - 1grams ...\n" 15 | ] 16 | }, 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "Reading english - 2grams ...\n" 22 | ] 23 | }, 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "Reading twitter - 1grams ...\nReading twitter - 2grams ...\n" 29 | ] 30 | }, 31 | { 32 | "name": "stdout", 33 | "output_type": "stream", 34 | "text": [ 35 | "Reading text8 - 1grams ...\nReading text8 - 2grams ...\n" 36 | ] 37 | } 38 | ], 39 | "source": [ 40 | "from ekphrasis.classes.segmenter import Segmenter\n", 41 | "\n", 42 | "seg_eng = Segmenter(corpus=\"english\")\n", 43 | "seg_tw = Segmenter(corpus=\"twitter\")\n", 44 | "seg_t8 = Segmenter(corpus=\"text8\")" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": { 51 | "collapsed": false 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "import pandas as pd\n", 56 | "\n", 57 | "diffs = pd.read_pickle(\"segmenter_diffs.pickle\")" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 15, 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "outputs": [ 67 | { 68 | "name": "stdout", 69 | "output_type": "stream", 70 | "text": [ 71 | "smallandinsignificant\n(eng): small and insignificant\n(tw): small and insignificant\n\ninsufficientnumbers\n(eng): insufficient numbers\n(tw): insufficient numbers\n\nexponentialbackoff\n(eng): exponential backoff\n(tw): exponential back off\n\nsitdown\n(eng): sit down\n(tw): sit down\n\ngamedev\n(eng): gamedev\n(tw): game dev\n\nretrogaming\n(eng): retrogaming\n(tw): retro gaming\n\nthewatercooler\n(eng): the water cooler\n(tw): the watercooler\n\nhomonculus\n(eng): homonculus\n(tw): ho mon cul us\n\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "words = [\"smallandinsignificant\", \"insufficientnumbers\", \"exponentialbackoff\", \"sitdown\", \"gamedev\", \"retrogaming\",\"thewatercooler\", \"homonculus\"]\n", 77 | "for w in words:\n", 78 | " print(w)\n", 79 | " print(\"(eng):\", seg_eng.segment(w))\n", 80 | " print(\"(tw):\", seg_tw.segment(w))\n", 81 | " # print(\"(t8):\", seg_t8.segment(w))\n", 82 | " print()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "collapsed": true 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "" 94 | ] 95 | } 96 | ], 97 | "metadata": { 98 | "kernelspec": { 99 | "display_name": "Python 3", 100 | "language": "python", 101 | "name": "python3" 102 | }, 103 | "language_info": { 104 | "codemirror_mode": { 105 | "name": "ipython", 106 | "version": 3.0 107 | }, 108 | "file_extension": ".py", 109 | "mimetype": "text/x-python", 
110 | "name": "python", 111 | "nbconvert_exporter": "python", 112 | "pygments_lexer": "ipython3", 113 | "version": "3.5.2" 114 | } 115 | }, 116 | "nbformat": 4, 117 | "nbformat_minor": 0 118 | } -------------------------------------------------------------------------------- /ekphrasis/regexes/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbaziotis/ekphrasis/309b6b089bb1ebaed705ba9ffa584f1826e296d4/ekphrasis/regexes/__init__.py -------------------------------------------------------------------------------- /ekphrasis/regexes/expressions.txt: -------------------------------------------------------------------------------- 1 | { 2 | "ACRONYM": "\\b(?:[A-Z]\\.)(?:[A-Z]\\.)+(?:\\.(?!\\.))?(?:[A-Z]\\b)?", 3 | "ALLCAPS": "(?]?[\\^;][\\W_m][\\;^][;<>]?)|(?:[^\\s()]?m?[\\(][\\W_oTOJ]{1,3}[\\s]?[\\W_oTOJ]{1,3}[)]m?[^\\s()]?)|(?:\\*?[v>\\-\\/\\\\][o0O\\_\\.][v\\-<\\/\\\\]\\*?)|(?:[oO0>][\\-_\\/oO\\.\\\\]{1,2}[oO0>])|(?:\\^\\^))(?![\\w])", 9 | "ELONGATED": "\\b[A-Za-z]*([a-zA-Z])\\1\\1[A-Za-z]*\\b", 10 | "EMAIL": "(?:^|(?<=[^\\w@.)]))(?:[\\w+-](?:\\.(?!\\.))?)*?[\\w+-]@(?:\\w-?)*?\\w+(?:\\.(?:[a-z]{2,})){1,3}(?:$|(?=\\b))", 11 | "EMOJI": "(?:\\uD83C\\uDFF4\\uDB40\\uDC67\\uDB40\\uDC62(?:\\uDB40\\uDC65\\uDB40\\uDC6E\\uDB40\\uDC67|\\uDB40\\uDC77\\uDB40\\uDC6C\\uDB40\\uDC73|\\uDB40\\uDC73\\uDB40\\uDC63\\uDB40\\uDC74)\\uDB40\\uDC7F|\\uD83D\\uDC69\\u200D\\uD83D\\uDC69\\u200D(?:\\uD83D\\uDC66\\u200D\\uD83D\\uDC66|\\uD83D\\uDC67\\u200D(?:\\uD83D[\\uDC66\\uDC67]))|\\uD83D\\uDC68(?:\\u200D(?:\\u2764\\uFE0F\\u200D(?:\\uD83D\\uDC8B\\u200D)?\\uD83D\\uDC68|(?:\\uD83D[\\uDC68\\uDC69])\\u200D(?:\\uD83D\\uDC66\\u200D\\uD83D\\uDC66|\\uD83D\\uDC67\\u200D(?:\\uD83D[\\uDC66\\uDC67]))|\\uD83D\\uDC66\\u200D\\uD83D\\uDC66|\\uD83D\\uDC67\\u200D(?:\\uD83D[\\uDC66\\uDC67])|[\\u2695\\u2696\\u2708]\\uFE0F|\\uD83C[\\uDF3E\\uDF73\\uDF93\\uDFA4\\uDFA8\\uDFEB\\uDFED]|\\uD83D[\\uDCBB\\uDCBC\\uDD27\\uDD2C\\uDE80\\uDE92])|(?:\\uD83C[\\uDFFB-\\uDFFF])\\u200D[\\u2695\\u2696\\u2708]\\uFE0F|(?:\\uD83C[\\uDFFB-\\uDFFF])\\u200D(?:\\uD83C[\\uDF3E\\uDF73\\uDF93\\uDFA4\\uDFA8\\uDFEB\\uDFED]|\\uD83D[\\uDCBB\\uDCBC\\uDD27\\uDD2C\\uDE80\\uDE92]))|\\uD83D\\uDC69\\u200D(?:\\u2764\\uFE0F\\u200D(?:\\uD83D\\uDC8B\\u200D(?:\\uD83D[\\uDC68\\uDC69])|\\uD83D[\\uDC68\\uDC69])|\\uD83C[\\uDF3E\\uDF73\\uDF93\\uDFA4\\uDFA8\\uDFEB\\uDFED]|\\uD83D[\\uDCBB\\uDCBC\\uDD27\\uDD2C\\uDE80\\uDE92])|\\uD83D\\uDC69\\u200D\\uD83D\\uDC66\\u200D\\uD83D\\uDC66|(?:\\uD83D\\uDC41\\uFE0F\\u200D\\uD83D\\uDDE8|\\uD83D\\uDC69(?:\\uD83C[\\uDFFB-\\uDFFF])\\u200D[\\u2695\\u2696\\u2708]|(?:(?:\\u26F9|\\uD83C[\\uDFCB\\uDFCC]|\\uD83D\\uDD75)\\uFE0F|\\uD83D\\uDC6F|\\uD83E[\\uDD3C\\uDDDE\\uDDDF])\\u200D[\\u2640\\u2642]|(?:\\u26F9|\\uD83C[\\uDFCB\\uDFCC]|\\uD83D\\uDD75)(?:\\uD83C[\\uDFFB-\\uDFFF])\\u200D[\\u2640\\u2642]|(?:\\uD83C[\\uDFC3\\uDFC4\\uDFCA]|\\uD83D[\\uDC6E\\uDC71\\uDC73\\uDC77\\uDC81\\uDC82\\uDC86\\uDC87\\uDE45-\\uDE47\\uDE4B\\uDE4D\\uDE4E\\uDEA3\\uDEB4-\\uDEB6]|\\uD83E[\\uDD26\\uDD37-\\uDD39\\uDD3D\\uDD3E\\uDDD6-\\uDDDD])(?:(?:\\uD83C[\\uDFFB-\\uDFFF])\\u200D[\\u2640\\u2642]|\\u200D[\\u2640\\u2642])|\\uD83D\\uDC69\\u200D[\\u2695\\u2696\\u2708])\\uFE0F|\\uD83D\\uDC69\\u200D\\uD83D\\uDC67\\u200D(?:\\uD83D[\\uDC66\\uDC67])|\\uD83D\\uDC69\\u200D\\uD83D\\uDC69\\u200D(?:\\uD83D[\\uDC66\\uDC67])|\\uD83D\\uDC68(?:\\u200D(?:(?:\\uD83D[\\uDC68\\uDC69])\\u200D(?:\\uD83D[\\uDC66\\uDC67])|\\uD83D[\\uDC66\\uDC67])|\\uD83C[\\uDFFB-\\uDFFF])|\\uD83C\\uDFF3\\uFE0F\\u200D\\uD83C\\uDF08|\\uD83D\\uDC69\\u200D\\uD83D
\\uDC67|\\uD83D\\uDC69(?:\\uD83C[\\uDFFB-\\uDFFF])\\u200D(?:\\uD83C[\\uDF3E\\uDF73\\uDF93\\uDFA4\\uDFA8\\uDFEB\\uDFED]|\\uD83D[\\uDCBB\\uDCBC\\uDD27\\uDD2C\\uDE80\\uDE92])|\\uD83D\\uDC69\\u200D\\uD83D\\uDC66|\\uD83C\\uDDF4\\uD83C\\uDDF2|\\uD83C\\uDDFD\\uD83C\\uDDF0|\\uD83C\\uDDF6\\uD83C\\uDDE6|\\uD83D\\uDC69(?:\\uD83C[\\uDFFB-\\uDFFF])|\\uD83C\\uDDFC(?:\\uD83C[\\uDDEB\\uDDF8])|\\uD83C\\uDDEB(?:\\uD83C[\\uDDEE-\\uDDF0\\uDDF2\\uDDF4\\uDDF7])|\\uD83C\\uDDE9(?:\\uD83C[\\uDDEA\\uDDEC\\uDDEF\\uDDF0\\uDDF2\\uDDF4\\uDDFF])|\\uD83C\\uDDE7(?:\\uD83C[\\uDDE6\\uDDE7\\uDDE9-\\uDDEF\\uDDF1-\\uDDF4\\uDDF6-\\uDDF9\\uDDFB\\uDDFC\\uDDFE\\uDDFF])|\\uD83C\\uDDF1(?:\\uD83C[\\uDDE6-\\uDDE8\\uDDEE\\uDDF0\\uDDF7-\\uDDFB\\uDDFE])|\\uD83C\\uDDFE(?:\\uD83C[\\uDDEA\\uDDF9])|\\uD83C\\uDDF9(?:\\uD83C[\\uDDE6\\uDDE8\\uDDE9\\uDDEB-\\uDDED\\uDDEF-\\uDDF4\\uDDF7\\uDDF9\\uDDFB\\uDDFC\\uDDFF])|\\uD83C\\uDDF5(?:\\uD83C[\\uDDE6\\uDDEA-\\uDDED\\uDDF0-\\uDDF3\\uDDF7-\\uDDF9\\uDDFC\\uDDFE])|\\uD83C\\uDDEF(?:\\uD83C[\\uDDEA\\uDDF2\\uDDF4\\uDDF5])|\\uD83C\\uDDED(?:\\uD83C[\\uDDF0\\uDDF2\\uDDF3\\uDDF7\\uDDF9\\uDDFA])|\\uD83C\\uDDEE(?:\\uD83C[\\uDDE8-\\uDDEA\\uDDF1-\\uDDF4\\uDDF6-\\uDDF9])|\\uD83C\\uDDFB(?:\\uD83C[\\uDDE6\\uDDE8\\uDDEA\\uDDEC\\uDDEE\\uDDF3\\uDDFA])|\\uD83C\\uDDEC(?:\\uD83C[\\uDDE6\\uDDE7\\uDDE9-\\uDDEE\\uDDF1-\\uDDF3\\uDDF5-\\uDDFA\\uDDFC\\uDDFE])|\\uD83C\\uDDF7(?:\\uD83C[\\uDDEA\\uDDF4\\uDDF8\\uDDFA\\uDDFC])|\\uD83C\\uDDEA(?:\\uD83C[\\uDDE6\\uDDE8\\uDDEA\\uDDEC\\uDDED\\uDDF7-\\uDDFA])|\\uD83C\\uDDFA(?:\\uD83C[\\uDDE6\\uDDEC\\uDDF2\\uDDF3\\uDDF8\\uDDFE\\uDDFF])|\\uD83C\\uDDE8(?:\\uD83C[\\uDDE6\\uDDE8\\uDDE9\\uDDEB-\\uDDEE\\uDDF0-\\uDDF5\\uDDF7\\uDDFA-\\uDDFF])|\\uD83C\\uDDE6(?:\\uD83C[\\uDDE8-\\uDDEC\\uDDEE\\uDDF1\\uDDF2\\uDDF4\\uDDF6-\\uDDFA\\uDDFC\\uDDFD\\uDDFF])|[#\\*0-9]\\uFE0F\\u20E3|\\uD83C\\uDDF8(?:\\uD83C[\\uDDE6-\\uDDEA\\uDDEC-\\uDDF4\\uDDF7-\\uDDF9\\uDDFB\\uDDFD-\\uDDFF])|\\uD83C\\uDDFF(?:\\uD83C[\\uDDE6\\uDDF2\\uDDFC])|\\uD83C\\uDDF0(?:\\uD83C[\\uDDEA\\uDDEC-\\uDDEE\\uDDF2\\uDDF3\\uDDF5\\uDDF7\\uDDFC\\uDDFE\\uDDFF])|\\uD83C\\uDDF3(?:\\uD83C[\\uDDE6\\uDDE8\\uDDEA-\\uDDEC\\uDDEE\\uDDF1\\uDDF4\\uDDF5\\uDDF7\\uDDFA\\uDDFF])|\\uD83C\\uDDF2(?:\\uD83C[\\uDDE6\\uDDE8-\\uDDED\\uDDF0-\\uDDFF])|(?:\\uD83C[\\uDFC3\\uDFC4\\uDFCA]|\\uD83D[\\uDC6E\\uDC71\\uDC73\\uDC77\\uDC81\\uDC82\\uDC86\\uDC87\\uDE45-\\uDE47\\uDE4B\\uDE4D\\uDE4E\\uDEA3\\uDEB4-\\uDEB6]|\\uD83E[\\uDD26\\uDD37-\\uDD39\\uDD3D\\uDD3E\\uDDD6-\\uDDDD])(?:\\uD83C[\\uDFFB-\\uDFFF])|(?:\\u26F9|\\uD83C[\\uDFCB\\uDFCC]|\\uD83D\\uDD75)(?:\\uD83C[\\uDFFB-\\uDFFF])|(?:[\\u261D\\u270A-\\u270D]|\\uD83C[\\uDF85\\uDFC2\\uDFC7]|\\uD83D[\\uDC42\\uDC43\\uDC46-\\uDC50\\uDC66\\uDC67\\uDC70\\uDC72\\uDC74-\\uDC76\\uDC78\\uDC7C\\uDC83\\uDC85\\uDCAA\\uDD74\\uDD7A\\uDD90\\uDD95\\uDD96\\uDE4C\\uDE4F\\uDEC0\\uDECC]|\\uD83E[\\uDD18-\\uDD1C\\uDD1E\\uDD1F\\uDD30-\\uDD36\\uDDD1-\\uDDD5])(?:\\uD83C[\\uDFFB-\\uDFFF])|(?:[\\u261D\\u26F9\\u270A-\\u270D]|\\uD83C[\\uDF85\\uDFC2-\\uDFC4\\uDFC7\\uDFCA-\\uDFCC]|\\uD83D[\\uDC42\\uDC43\\uDC46-\\uDC50\\uDC66-\\uDC69\\uDC6E\\uDC70-\\uDC78\\uDC7C\\uDC81-\\uDC83\\uDC85-\\uDC87\\uDCAA\\uDD74\\uDD75\\uDD7A\\uDD90\\uDD95\\uDD96\\uDE45-\\uDE47\\uDE4B-\\uDE4F\\uDEA3\\uDEB4-\\uDEB6\\uDEC0\\uDECC]|\\uD83E[\\uDD18-\\uDD1C\\uDD1E\\uDD1F\\uDD26\\uDD30-\\uDD39\\uDD3D\\uDD3E\\uDDD1-\\uDDDD])(?:\\uD83C[\\uDFFB-\\uDFFF])?|(?:[\\u231A\\u231B\\u23E9-\\u23EC\\u23F0\\u23F3\\u25FD\\u25FE\\u2614\\u2615\\u2648-\\u2653\\u267F\\u2693\\u26A1\\u26AA\\u26AB\\u26BD\\u26BE\\u26C4\\u26C5\\u26CE\\u26D4\\u26EA\\u26F2\\u26F3\\u26F5\\u26FA\\u26FD\\u2705\\u270A\\u270B\\u2728\\u274C\\u2
74E\\u2753-\\u2755\\u2757\\u2795-\\u2797\\u27B0\\u27BF\\u2B1B\\u2B1C\\u2B50\\u2B55]|\\uD83C[\\uDC04\\uDCCF\\uDD8E\\uDD91-\\uDD9A\\uDDE6-\\uDDFF\\uDE01\\uDE1A\\uDE2F\\uDE32-\\uDE36\\uDE38-\\uDE3A\\uDE50\\uDE51\\uDF00-\\uDF20\\uDF2D-\\uDF35\\uDF37-\\uDF7C\\uDF7E-\\uDF93\\uDFA0-\\uDFCA\\uDFCF-\\uDFD3\\uDFE0-\\uDFF0\\uDFF4\\uDFF8-\\uDFFF]|\\uD83D[\\uDC00-\\uDC3E\\uDC40\\uDC42-\\uDCFC\\uDCFF-\\uDD3D\\uDD4B-\\uDD4E\\uDD50-\\uDD67\\uDD7A\\uDD95\\uDD96\\uDDA4\\uDDFB-\\uDE4F\\uDE80-\\uDEC5\\uDECC\\uDED0-\\uDED2\\uDEEB\\uDEEC\\uDEF4-\\uDEF8]|\\uD83E[\\uDD10-\\uDD3A\\uDD3C-\\uDD3E\\uDD40-\\uDD45\\uDD47-\\uDD4C\\uDD50-\\uDD6B\\uDD80-\\uDD97\\uDDC0\\uDDD0-\\uDDE6])|(?:[#\\*0-9\\xA9\\xAE\\u203C\\u2049\\u2122\\u2139\\u2194-\\u2199\\u21A9\\u21AA\\u231A\\u231B\\u2328\\u23CF\\u23E9-\\u23F3\\u23F8-\\u23FA\\u24C2\\u25AA\\u25AB\\u25B6\\u25C0\\u25FB-\\u25FE\\u2600-\\u2604\\u260E\\u2611\\u2614\\u2615\\u2618\\u261D\\u2620\\u2622\\u2623\\u2626\\u262A\\u262E\\u262F\\u2638-\\u263A\\u2640\\u2642\\u2648-\\u2653\\u2660\\u2663\\u2665\\u2666\\u2668\\u267B\\u267F\\u2692-\\u2697\\u2699\\u269B\\u269C\\u26A0\\u26A1\\u26AA\\u26AB\\u26B0\\u26B1\\u26BD\\u26BE\\u26C4\\u26C5\\u26C8\\u26CE\\u26CF\\u26D1\\u26D3\\u26D4\\u26E9\\u26EA\\u26F0-\\u26F5\\u26F7-\\u26FA\\u26FD\\u2702\\u2705\\u2708-\\u270D\\u270F\\u2712\\u2714\\u2716\\u271D\\u2721\\u2728\\u2733\\u2734\\u2744\\u2747\\u274C\\u274E\\u2753-\\u2755\\u2757\\u2763\\u2764\\u2795-\\u2797\\u27A1\\u27B0\\u27BF\\u2934\\u2935\\u2B05-\\u2B07\\u2B1B\\u2B1C\\u2B50\\u2B55\\u3030\\u303D\\u3297\\u3299]|\\uD83C[\\uDC04\\uDCCF\\uDD70\\uDD71\\uDD7E\\uDD7F\\uDD8E\\uDD91-\\uDD9A\\uDDE6-\\uDDFF\\uDE01\\uDE02\\uDE1A\\uDE2F\\uDE32-\\uDE3A\\uDE50\\uDE51\\uDF00-\\uDF21\\uDF24-\\uDF93\\uDF96\\uDF97\\uDF99-\\uDF9B\\uDF9E-\\uDFF0\\uDFF3-\\uDFF5\\uDFF7-\\uDFFF]|\\uD83D[\\uDC00-\\uDCFD\\uDCFF-\\uDD3D\\uDD49-\\uDD4E\\uDD50-\\uDD67\\uDD6F\\uDD70\\uDD73-\\uDD7A\\uDD87\\uDD8A-\\uDD8D\\uDD90\\uDD95\\uDD96\\uDDA4\\uDDA5\\uDDA8\\uDDB1\\uDDB2\\uDDBC\\uDDC2-\\uDDC4\\uDDD1-\\uDDD3\\uDDDC-\\uDDDE\\uDDE1\\uDDE3\\uDDE8\\uDDEF\\uDDF3\\uDDFA-\\uDE4F\\uDE80-\\uDEC5\\uDECB-\\uDED2\\uDEE0-\\uDEE5\\uDEE9\\uDEEB\\uDEEC\\uDEF0\\uDEF3-\\uDEF8]|\\uD83E[\\uDD10-\\uDD3A\\uDD3C-\\uDD3E\\uDD40-\\uDD45\\uDD47-\\uDD4C\\uDD50-\\uDD6B\\uDD80-\\uDD97\\uDDC0\\uDDD0-\\uDDE6])\\uFE0F?)", 12 | "EMPHASIS": "(?:\\*\\b\\w+\\b\\*)", 13 | "HASHTAG": "\\#\\b[\\w\\-\\_]+\\b", 14 | "LTR_FACE": "(?:(?<=])?(?:(?|/\\\\]+|[\u00de\u00d7\u00fe]|(?|/\\\\]+|(?<=])?(?![a-zA-Z])", 24 | "TAG": "<[\\/]?\\w+[\\/]?>", 25 | "TIME": "(?:(?:\\d+)?\\.?\\d+(?:AM|PM|am|pm|a\\.m\\.|p\\.m\\.))|(?:(?:[0-2]?[0-9]|[2][0-3]):(?:[0-5][0-9])(?::(?:[0-5][0-9]))?(?: ?(?:AM|PM|am|pm|a\\.m\\.|p\\.m\\.))?)", 26 | "URL": "(?:https?:\\/\\/(?:www\\.|(?!www))[^\\s\\.]+\\.[^\\s]{2,}|www\\.[^\\s]+\\.[^\\s]{2,})", 27 | "USER": "\\@\\w+", 28 | "WORD": "(?:[\\w_]+)" 29 | } -------------------------------------------------------------------------------- /ekphrasis/regexes/generate_expressions.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | ############################################################################## 4 | # EMOTICONS 5 | ############################################################################## 6 | # [DPO023|}><=]? # optional hat 7 | # [xXB:#%=|;8\*] # eyes 8 | # ['\",]? # optional tears 9 | # [oc\-^]? 
# optional nose 10 | # [DpPO03cboÞþJSLxX*@$#&,.\|<>}{()\[\]\\/] # mouth 11 | ############################################################################## 12 | 13 | __ltr_emoticon = [ 14 | # optional hat 15 | r"(?:(?<=])?", 16 | 17 | # eyes 18 | r"(?:(?|/\\]+|[Þ×þ]|(?|/\\]+|(?<=])?", # optional hat 38 | r"(?![a-zA-Z])", 39 | ] 40 | 41 | __LTR_FACE = "".join(__ltr_emoticon) 42 | __RTL_FACE = "".join(__rtl_emoticon) 43 | 44 | ############################################################################## 45 | # DATES/TIMES todo: add days 46 | # the regex captures most ways a date my be expressed in natural language. 47 | ############################################################################## 48 | __short_date = r"(?:\b(?", 82 | "USER": r"\@\w+", 83 | "EMPHASIS": r"(?:\*\b\w+\b\*)", 84 | "CENSORED": r"(?:\b\w+\*+\w+\b)", 85 | "ACRONYM": r"\b(?:[A-Z]\.)(?:[A-Z]\.)+(?:\.(?!\.))?(?:[A-Z]\b)?", 86 | "ELONGATED": r"\b[A-Za-z]*([a-zA-Z])\1\1[A-Za-z]*\b", 87 | "RTL_FACE": __RTL_FACE, 88 | "LTR_FACE": __LTR_FACE, 89 | "EASTERN_EMOTICONS": r"(?]?[\^;][\W_m][\;^][;<>]?)|(?:[^\s()]?m?[\(][\W_oTOJ]{1,3}[\s]?[\W_oTOJ]{1,3}[)]m?[^\s()]?)|(?:\*?[v>\-\/\\][o0O\_\.][v\-<\/\\]\*?)|(?:[oO0>][\-_\/oO\.\\]{1,2}[oO0>])|(?:\^\^))(?![\w])", 90 | "REST_EMOTICONS": r"(?= threshold: 97 | entry = k.split(SEPARATOR) 98 | entry.append(str(v)) 99 | f.write('\t'.join(entry) + '\n') 100 | 101 | if args.pickle: 102 | with open(file + ".pickle", 'wb') as f: 103 | pickle.dump(counts, f) 104 | 105 | 106 | def count_file(filename, countkeeper, desc=""): 107 | """ 108 | Count the word statistics of a file 109 | :param desc: 110 | :param filename: 111 | :param countkeeper: 112 | :return: 113 | """ 114 | print() 115 | print("computing statistics for file: ", filename) 116 | with open(filename, "r", encoding="utf-8", errors='ignore') as infile: 117 | num_lines = sum(1 for line in open(filename, "r", encoding="utf-8")) 118 | for line in tqdm(infile, total=num_lines, desc=desc): 119 | try: 120 | toks = tokenize(line) 121 | for i in range(args.ngrams): 122 | ngram = i + 1 123 | if ngram > 1: 124 | toks = [""] + toks 125 | for token in get_ngrams(toks, ngram): 126 | countkeeper[ngram][SEPARATOR.join(token)] += 1 127 | except Exception as e: 128 | print("ERROR - ", e, infile) 129 | 130 | 131 | def write_stats(counts): 132 | print() 133 | dir_path = os.path.dirname(os.path.realpath(__file__)) 134 | for k, v in counts.items(): 135 | print("Writing " + str(k) + "-grams...") 136 | counter = Counter(counts[k]) 137 | print("entries:{}\t-\ttokens:{}".format(format(len(counter), ','), 138 | format(sum(counter.values()), 139 | ','))) 140 | 141 | name = "counts_{}grams.txt".format(str(k)) 142 | filename = os.path.join(dir_path, "..", "stats", args.name, name) 143 | 144 | print("writing stats to file {}".format(filename)) 145 | os.makedirs(os.path.dirname(filename), exist_ok=True) 146 | 147 | write_stats_to_file(filename, counter, args.mincount[int(k) - 1]) 148 | 149 | 150 | def prune_low_freq(word_stats, threshold): 151 | """ 152 | remove ngrams with count less than mincount 153 | avoid dict comprehension as it creates a new temp dict 154 | and overloads the memory 155 | Args: 156 | word_stats (): 157 | threshold (): 158 | 159 | Returns: 160 | 161 | """ 162 | for ng in list(word_stats.keys()): 163 | for t in list(word_stats[ng].keys()): 164 | if not word_stats[ng][t] >= threshold: 165 | del word_stats[ng][t] 166 | 167 | 168 | def plot_statistics(statistics): 169 | fig = plt.figure(figsize=(5 * len(statistics), 5)) 170 | for i, (k, v) in 
enumerate(statistics.items()): 171 | ax = fig.add_subplot(1, len(statistics), i + 1) 172 | ax.set_title("{}-gram - total={}".format(k, len(v))) 173 | ax.grid(True) 174 | values = numpy.fromiter(statistics[k].values(), numpy.int32) 175 | ax.hist(values, bins=100, range=(0, 100)) 176 | fig.tight_layout() 177 | fig.canvas.draw() 178 | fig.canvas.flush_events() 179 | 180 | 181 | if __name__ == '__main__': 182 | plt.ion() # set plot to animated 183 | stats = defaultdict(lambda: defaultdict(int)) 184 | pruning_size_threshold = 5000000 185 | low_freq_threshold = 3 186 | 187 | if os.path.isfile(args.input): 188 | count_file(args.input, stats) 189 | time.sleep(0.01) 190 | prune_low_freq(stats, 1) 191 | write_stats(stats) 192 | 193 | elif os.path.isdir(args.input): 194 | files = glob.glob(args.input + "*.txt") 195 | for i, file in enumerate(files): 196 | try: 197 | count_file(file, stats, str(i + 1) + "/" + str(len(files))) 198 | except Exception as e: 199 | print("ERROR - ", e, file) 200 | 201 | time.sleep(0.01) 202 | 203 | if any(len(stats[ngram]) > pruning_size_threshold for ngram in 204 | list(stats.keys())): 205 | print("Cleaning entries with only one occurrence, " 206 | "in order to save memory...") 207 | prune_low_freq(stats, low_freq_threshold) 208 | # write progress 209 | # plot_statistics(stats) 210 | 211 | write_stats(stats) 212 | 213 | prune_low_freq(stats, low_freq_threshold) 214 | write_stats(stats) 215 | else: 216 | print("Wrong input. Give a file or directory!") 217 | -------------------------------------------------------------------------------- /ekphrasis/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbaziotis/ekphrasis/309b6b089bb1ebaed705ba9ffa584f1826e296d4/ekphrasis/utils/__init__.py -------------------------------------------------------------------------------- /ekphrasis/utils/helpers.py: -------------------------------------------------------------------------------- 1 | from functools import reduce 2 | import operator 3 | import os 4 | from os import path 5 | from os.path import expanduser 6 | import sys 7 | import ujson as json 8 | from urllib.request import urlretrieve 9 | import zipfile 10 | 11 | 12 | def get_stats_dir(): 13 | home = expanduser("~") 14 | 15 | ekphrasis_dir = path.join(home, '.ekphrasis') 16 | 17 | if not os.path.exists(ekphrasis_dir): 18 | os.makedirs(ekphrasis_dir) 19 | 20 | stats_dir = path.join(ekphrasis_dir, 'stats') 21 | 22 | if not os.path.exists(stats_dir): 23 | os.makedirs(stats_dir) 24 | 25 | return stats_dir 26 | 27 | 28 | def parse_stats(name, sep='\t', ngram_sep='_'): 29 | """ 30 | Read key,value pairs from file. 
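Each line is expected to hold the n-gram tokens followed by a count, separated by ``sep``;
for n-grams of more than one token, the tokens are joined with ``ngram_sep`` to form the key
of the returned dict, and the count is stored as an int value.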
31 | """ 32 | print("reading ngrams", name) 33 | d = {} 34 | with open(name, "r", encoding="utf-8") as f: 35 | for line in f: 36 | values = line.split(sep) 37 | if len(values) > 2: 38 | d[ngram_sep.join(values[:-1])] = int(values[-1]) 39 | else: 40 | d[values[0]] = int(values[1]) 41 | 42 | return d 43 | 44 | 45 | def read_stats(corpus, ngram): 46 | stats_dir = get_stats_dir() 47 | check_stats_files() 48 | print("Reading " + "{} - {}grams ...".format(corpus, ngram)) 49 | text = path.join(*[stats_dir, corpus, "counts_{}grams.txt".format(ngram)]) 50 | dumped = path.join( 51 | *[stats_dir, corpus, "counts_{}grams.json".format(ngram)]) 52 | 53 | if os.path.isfile(dumped): 54 | with open(dumped, "r") as f: 55 | stats = json.load(f) 56 | return stats 57 | elif os.path.isfile(text): 58 | print("generating cache file for faster loading...") 59 | stats = parse_stats(text) 60 | with open(dumped, "w") as f: 61 | json.dump(stats, f) 62 | return stats 63 | else: 64 | print("stats file not available!") 65 | sys.exit(1) 66 | 67 | 68 | def listdir_nohidden(path): 69 | return [f for f in os.listdir(path) if not f.startswith('.')] 70 | 71 | 72 | def download_statistics(): 73 | stats_dir = get_stats_dir() 74 | print("Word statistics files not found!\nDownloading...", end=" ") 75 | # url = "https://www.dropbox.com/s/a84otqrg6u1c5je/stats.zip?dl=1" 76 | url = "https://data.statmt.org/cbaziotis/projects/ekphrasis/stats.zip" 77 | urlretrieve(url, "stats.zip") 78 | print("done!") 79 | 80 | print("Unpacking...", end=" ") 81 | with zipfile.ZipFile("stats.zip", "r") as zip_ref: 82 | zip_ref.extractall(stats_dir) 83 | 84 | os.remove("stats.zip") 85 | print("done!") 86 | 87 | 88 | def check_stats_files(): 89 | stats_dir = get_stats_dir() 90 | if not os.path.exists(stats_dir) or len(listdir_nohidden(stats_dir)) == 0: 91 | download_statistics() 92 | 93 | 94 | def product(nums): 95 | """ 96 | Return the product of a sequence of numbers. 97 | """ 98 | return reduce(operator.mul, nums, 1) 99 | 100 | def remove_tags(doc): 101 | """ 102 | Remove tags from sentence 103 | """ 104 | doc = ' '.join(word for word in doc.split() if word[0]!='<') 105 | return doc 106 | 107 | # check_stats_files() 108 | -------------------------------------------------------------------------------- /ekphrasis/utils/nlp.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import re 3 | 4 | import nltk 5 | import numpy 6 | from nltk.corpus import sentiwordnet as swn 7 | from termcolor import cprint 8 | 9 | wordnet_lemmatizer = nltk.WordNetLemmatizer() 10 | # additional negations: nowhere 11 | 12 | negation_words = {"'t", "ain't", 'aint', "aren't", 'arent', 'cant', 13 | "didn't", 'didnt', "doesn't", 'doesnt', "don't", 'dont', 14 | "hadn't", 'hadnt', "hasn't", 'hasnt', "haven't", 'havent', 15 | "isn't", 'isnt', 16 | 'never', 'no', 'none', 'noone', 'not', 'nothing', 'wont', } 17 | negation_modals = {"couldn't", 'couldnt', "shouldn't", 'shouldnt', "wouldn't", 18 | 'wouldnt'} 19 | contrast_words = {"but", "although", "though", "however", "despite", "whereas", 20 | "while", "unlike", "still"} 21 | neg_puncts = {"\n", ".", "?", ":", "..."} 22 | 23 | 24 | def unpack_contractions(text): 25 | """ 26 | Replace *English* contractions in ``text`` str with their unshortened forms. 27 | N.B. The "'d" and "'s" forms are ambiguous (had/would, is/has/possessive), 28 | so are left as-is. 
29 | 30 | --------- 31 | --------- 32 | 33 | Important Note: The function is taken from textacy (https://github.com/chartbeat-labs/textacy). 34 | 35 | See textacy.preprocess.unpack_contractions(text) 36 | -> http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.preprocess.unpack_contractions 37 | 38 | 39 | The reason that textacy is not added as a dependency is to avoid having the user to install it's dependencies (such as SpaCy), 40 | in order to just use this function. 41 | 42 | """ 43 | # standard 44 | text = re.sub( 45 | r"(\b)([Aa]re|[Cc]ould|[Dd]id|[Dd]oes|[Dd]o|[Hh]ad|[Hh]as|[Hh]ave|[Ii]s|[Mm]ight|[Mm]ust|[Ss]hould|[Ww]ere|[Ww]ould)n['’]t", 46 | r"\1\2 not", text) 47 | text = re.sub( 48 | r"(\b)([Hh]e|[Ii]|[Ss]he|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)['’]ll", 49 | r"\1\2 will", text) 50 | text = re.sub(r"(\b)([Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)['’]re", r"\1\2 are", 51 | text) 52 | text = re.sub( 53 | r"(\b)([Ii]|[Ss]hould|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Ww]ould|[Yy]ou)['’]ve", 54 | r"\1\2 have", text) 55 | # non-standard 56 | text = re.sub(r"(\b)([Cc]a)n['’]t", r"\1\2n not", text) 57 | text = re.sub(r"(\b)([Ii])['’]m", r"\1\2 am", text) 58 | text = re.sub(r"(\b)([Ll]et)['’]s", r"\1\2 us", text) 59 | text = re.sub(r"(\b)([Ww])on['’]t", r"\1\2ill not", text) 60 | text = re.sub(r"(\b)([Ss])han['’]t", r"\1\2hall not", text) 61 | text = re.sub(r"(\b)([Yy])(?:['’]all|a['’]ll)", r"\1\2ou all", text) 62 | return text 63 | 64 | 65 | def doc_ngrams(doc, n_from=1, n_to=2): 66 | return list(itertools.chain.from_iterable( 67 | [[doc[i:i + n] for i in range(len(doc) - (n - 1))] 68 | for n in range(n_from, n_to + 1)])) 69 | 70 | 71 | def find_negations(doc, neg_comma=True, neg_modals=True, debug=False): 72 | """ 73 | Takes as input a list of words and returns the positions (indices) of the words 74 | that are in the context of a negation. 
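For example, find_negations(["this", "is", "not", "a", "good", "idea", ".", "ok"])
returns {3, 4, 5}: the tokens following "not" are treated as negated until the
full stop closes the negation context.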
75 | 76 | :param list doc: a list of words (strings) 77 | :param bool neg_comma: if True, the negation context ends on a comma 78 | :param bool neg_modals: if True, include negation modals in the set of negation words 79 | :param bool debug: if True, print the text color coded by context 80 | :return set: a set of the word positions inside a negation 81 | 82 | """ 83 | doc_context = [] 84 | append = doc_context.append 85 | negation_stopset = neg_puncts | {","} if neg_comma else set() 86 | negation_startset = negation_words | negation_modals if neg_modals else set() 87 | 88 | # status == "normal" means outside of parentheses 89 | # status == "parentheses" means inside parentheses 90 | # status[XXX] == True means that the context XXX is negated 91 | # status[XXX] == False means that the context XXX is affirmative 92 | status = {"normal": False, "parentheses": False} 93 | 94 | # pointer to the current context 95 | current = "normal" 96 | 97 | for i, tok in enumerate(doc): 98 | 99 | if tok in negation_startset: 100 | status[current] = True 101 | if debug: 102 | cprint(tok, 'red', attrs=['bold'], end=' ') 103 | continue 104 | 105 | if tok in negation_stopset | contrast_words: 106 | if debug: 107 | if status[current]: 108 | cprint(tok, 'green', attrs=['bold'], end=' ') 109 | else: 110 | print(tok, end=" ") 111 | status[current] = False 112 | continue 113 | 114 | if tok == "(": 115 | current = "parentheses" 116 | if debug: 117 | cprint(tok, 'green', attrs=['bold'], end=' ') 118 | continue 119 | 120 | if tok == ")": 121 | status[ 122 | "parentheses"] = False # in order to be false the next time it goes in to a parentheses 123 | current = "normal" 124 | if debug: 125 | cprint(tok, 'green', attrs=['bold'], end=' ') 126 | continue 127 | 128 | if debug: 129 | if status[current]: 130 | cprint(tok, 'magenta', end=' ') 131 | else: 132 | print(tok, end=" ") 133 | 134 | if status[current]: 135 | append(i) 136 | 137 | if debug: 138 | print() 139 | # input("press to continue...") 140 | 141 | return set(doc_context) 142 | 143 | 144 | def mark_doc(doc, wids, mark=None, pos=None): 145 | """ 146 | Given a list of words and a set of word positions, mark the words in those positions. 147 | :param list doc: a list of words (strings) 148 | :param set wids: the positions of the words to be marked 149 | :param string mark: a string that sets the mark that will be applied 150 | to each of the selected words 151 | :param string pos: can be one of {"prefix", "suffix"} 152 | :return: the marked list of words 153 | """ 154 | if mark is None: 155 | mark = "NEG" 156 | 157 | if pos is None: 158 | pos = "suffix" 159 | 160 | marked_doc = [] 161 | 162 | for i, tok in enumerate(doc): 163 | if i in wids: 164 | if pos == "prefix": 165 | word = mark + "_" + tok 166 | else: 167 | word = tok + "_" + mark 168 | marked_doc.append(word) 169 | else: 170 | marked_doc.append(tok) 171 | 172 | return marked_doc 173 | 174 | 175 | def polarity(doc, neg_comma=True, neg_modals=True): 176 | """ 177 | Estimate the sentiment polarity of a tokenized document. 
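Each recognised noun, verb, adjective or adverb contributes the [positive, negative, objective]
scores of its first SentiWordNet synset (with positive and negative swapped for tokens inside a
negation context); the returned polarity is the mean positive score minus the mean negative score.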
178 | Args: 179 | doc (): a list of words (strings) 180 | neg_comma (): if True, the negation context ends on a comma 181 | neg_modals (): if True, include negation modals in the set of negation words 182 | 183 | Returns: 184 | - polarity 185 | - [positive, negative, neutral] scores 186 | 187 | """ 188 | 189 | tagged = nltk.pos_tag([wordnet_lemmatizer.lemmatize(w) 190 | for w in doc]) 191 | negations = find_negations(doc, neg_comma=neg_comma, neg_modals=neg_modals) 192 | scores = [] 193 | for i, (word, tag) in enumerate(tagged): 194 | try: 195 | ss_set = None 196 | if 'NN' in tag and swn.senti_synsets(word): 197 | ss_set = list(swn.senti_synsets(word))[0] 198 | elif 'VB' in tag and swn.senti_synsets(word): 199 | ss_set = list(swn.senti_synsets(word))[0] 200 | elif 'JJ' in tag and swn.senti_synsets(word): 201 | ss_set = list(swn.senti_synsets(word))[0] 202 | elif 'RB' in tag and swn.senti_synsets(word): 203 | ss_set = list(swn.senti_synsets(word))[0] 204 | if ss_set: 205 | pos = ss_set.pos_score() 206 | neg = ss_set.neg_score() 207 | obj = ss_set.obj_score() 208 | if i in negations: 209 | pos, neg = neg, pos 210 | scores.append([pos, neg, obj]) 211 | except: 212 | pass 213 | 214 | _scores = numpy.mean(numpy.array(scores), axis=0) 215 | _polarity = _scores[0] - _scores[1] 216 | 217 | return _polarity, _scores 218 | -------------------------------------------------------------------------------- /local_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm -rf build 4 | rm -rf ekphrasis.egg-info 5 | rm -rf dist 6 | 7 | python setup.py sdist bdist_wheel 8 | 9 | pip install --no-index --find-links=dist\ ekphrasis --force-reinstall --no-deps -U -------------------------------------------------------------------------------- /pypi_push.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm -rf build 4 | rm -rf ekphrasis.egg-info 5 | rm -rf dist 6 | 7 | python setup.py sdist bdist_wheel 8 | pip wheel -r requirements.txt 9 | 10 | # twine register dist/*.tar.gz 11 | twine upload dist/* 12 | # python setup.py sdist upload -r pypi 13 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm==4.19.4 2 | colorama==0.3.9 3 | matplotlib==2.2.2 4 | setuptools==36.2.5 5 | termcolor==1.1.0 6 | numpy==1.19.1 7 | nltk==3.2.4 8 | ujson==1.35 9 | ftfy==4.4.3 10 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup(name='ekphrasis', 4 | version='0.5.4', 5 | description='Text processing tool, geared towards text from ' 6 | 'social networks, such as Twitter or Facebook. 
' 7 | 'Ekphrasis performs tokenization, word normalization, ' 8 | 'word segmentation (for splitting hashtags) ' 9 | 'and spell correction.', 10 | url='https://github.com/cbaziotis/ekphrasis', 11 | author='Christos Baziotis', 12 | author_email='christos.baziotis@gmail.com', 13 | license='MIT', 14 | packages=find_packages(exclude=['docs', 'tests*', 'analysis']), 15 | install_requires=[ 16 | 'termcolor', 17 | 'tqdm', 18 | 'colorama', 19 | 'ujson', 20 | 'matplotlib', 21 | 'nltk', 22 | 'ftfy', 23 | 'numpy' 24 | ], 25 | include_package_data=True 26 | ) 27 | --------------------------------------------------------------------------------