├── requirements.txt
├── .travis.yml
├── imgs
│   └── zarathustra.png
├── test.py
├── .gitignore
├── setup.py
├── test
│   └── test_lbda.py
├── .github
│   └── workflows
│       └── build.yml
├── examples
│   ├── tweets.py
│   ├── feature_extraction.py
│   └── book.py
├── README.md
├── data
│   └── README.txt
└── src
    └── lbsa.py

/requirements.txt:
--------------------------------------------------------------------------------
numpy==1.21.0
pandas
requests
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: python
python:
  - "3.6"
script: nosetests -v test/test_lbda.py
--------------------------------------------------------------------------------
/imgs/zarathustra.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AntoinePassemiers/Lexicon-Based-Sentiment-Analysis/HEAD/imgs/zarathustra.png
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
import pandas as pd


df = pd.read_excel('NRC-Emotion-Lexicon-v0.92-In105Languages-Nov2017Translations.xlsx', sheet_name='NRC-Lex-v0.92-word-translations')

print(df)
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
*.csv

__pycache__/
build/
dist/
lbsa.egg-info/

NRC-Emotion-Lexicon-v0.92-In105Languages-Nov2017Translations.xlsx
thus_spoke_zarathustra.txt
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# setup.py
# author : Antoine Passemiers

import os
from setuptools import setup, find_packages


datafiles = [(d, [os.path.join(d, f) for f in files])
             for d, folders, files in os.walk('data')]

setup(
    name='lbsa',
    version='0.0.1',
    author='Antoine Passemiers',
    description='Lexicon-based sentiment analysis',
    packages=find_packages('src'),
    include_package_data=True,
    package_dir={"": "src"},
    py_modules=["lbsa"],
    data_files=datafiles,
    url='https://github.com/AntoinePassemiers/Lexicon-Based-Sentiment-Analysis',
    install_requires=[
        'numpy >= 1.13.3',
        'matplotlib >= 2.0.2',
        'pandas',
        'requests'
    ],
)
--------------------------------------------------------------------------------
/test/test_lbda.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# test_lbda.py
# author : Antoine Passemiers

import lbsa


def test_lbsa():
    reviews = [
        'You should get this game, because in this game you can destroy other cars with really AWESOME guns like a acid thrower',
        'A great dev, but a mediocre app. You just tap the screen . In a word : BORING . Don\'t get this app.',
        'Even at free it was too expensive. Total waste of time and space. Save yourself the trouble of having to remove it by not downloading it in the first place.',
        'Works flawlessly with my favorite stations. I highly recommend this app as it makes finding a stream for your favorite local radio stations a breeze.'
    ]

    afinn_lexicon = lbsa.get_lexicon('opinion', language='english', source='afinn')
    mpqa_lexicon = lbsa.get_lexicon('opinion', language='english', source='mpqa')

    extractor = lbsa.FeatureExtractor(afinn_lexicon, mpqa_lexicon)
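    # Hedged sanity check (assumes, per FeatureExtractor.process in
    # src/lbsa.py, that a list input yields one feature row per text;
    # afinn contributes 2 tags and mpqa 3, so 5 columns here):
    features = extractor.process(reviews)
    assert features.shape == (len(reviews), extractor.n_features)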
--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
name: build

on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

jobs:
  build:

    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        python-version: [3.8]

    steps:
    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python3 -m pip install numpy pytest nose coverage
        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
        python3 setup.py install
    - name: Configure cc-test-reporter
      run: |
        curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
        chmod +x ./cc-test-reporter
        ./cc-test-reporter before-build
    - name: Test with nose
      run: |
        coverage run -m nose -v test
--------------------------------------------------------------------------------
/examples/tweets.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# tweets.py
# author : Antoine Passemiers

import lbsa


tweet = """
The Budget Agreement today is so important for our great Military.
It ends the dangerous sequester and gives Secretary Mattis what he needs to keep America Great.
Republicans and Democrats must support our troops and support this Bill!
"""

print('\nUse NRC lexicon')
lexicon = lbsa.get_lexicon('opinion', language='english', source='nrc')
print(lexicon.process(tweet))

print('\nUse afinn lexicon')
lexicon = lbsa.get_lexicon('opinion', language='english', source='afinn')
print(lexicon.process(tweet))

print('\nUse mpqa lexicon')
lexicon = lbsa.get_lexicon('opinion', language='english', source='mpqa')
print(lexicon.process(tweet))

tweet2 = """
A la suite de la tempête #Eunice et à la demande du Président de la République,
l’Etat décrétera dans les meilleurs délais l’état de catastrophe naturelle partout
où cela s’avérera nécessaire.
"""
# English gloss: "Following storm #Eunice, and at the request of the President
# of the Republic, the State will declare a state of natural disaster as soon
# as possible wherever it proves necessary."
print('\nAuto-detect languages')
lexicon = lbsa.get_lexicon('sa', language='auto', source='nrc')
print(lexicon.process(tweet))
print(lexicon.process(tweet2))
--------------------------------------------------------------------------------
/examples/feature_extraction.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# feature_extraction.py
# author : Antoine Passemiers

import lbsa

reviews = [
    'You should get this game, because in this game you can destroy other cars with really AWESOME guns like a acid thrower',
    'A great dev, but a mediocre app. You just tap the screen . In a word : BORING . Don\'t get this app.',
    'Even at free it was too expensive. Total waste of time and space. Save yourself the trouble of having to remove it by not downloading it in the first place.',
    'Works flawlessly with my favorite stations. I highly recommend this app as it makes finding a stream for your favorite local radio stations a breeze.'
]

afinn_lexicon = lbsa.get_lexicon('opinion', language='english', source='afinn')
nrc_lexicon = lbsa.get_lexicon('opinion', language='english', source='nrc')
nrc_sa_lexicon = lbsa.get_lexicon('sa', language='english', source='nrc')
mpqa_lexicon = lbsa.get_lexicon('opinion', language='english', source='mpqa')

extractor = lbsa.FeatureExtractor(afinn_lexicon, nrc_lexicon, nrc_sa_lexicon, mpqa_lexicon)

print('Feature names:')
print('{}\n'.format(extractor.feature_names))

print(extractor.process(reviews))
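
# Shape note (hedged): the four lexicons above expose 2 (afinn) + 2 (NRC
# opinion) + 8 (NRC sentiment) + 3 (MPQA) = 15 tags, so process() returns a
# (len(reviews), 15) array whose columns follow extractor.feature_names.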
--------------------------------------------------------------------------------
/examples/book.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# book.py: sentiment analysis of a book over time
# author : Antoine Passemiers

import lbsa

import numpy as np
import matplotlib.pyplot as plt


def moving_average(sequence, n=1000):
    ma = np.cumsum(sequence, axis=0)
    ma[n:] = ma[n:] - ma[:-n]
    return ma[n - 1:] / n


# https://archive.org/stream/thusspokezarathu00nietuoft/thusspokezarathu00nietuoft_djvu.txt
with open('../data/thus_spoke_zarathustra.txt', 'r') as f:
    text = f.read()
lexicon = lbsa.get_lexicon('sa', language='english')
features = lbsa.make_time_analysis(text, lexicon)

block_size = 100

tag_names = lexicon.get_tag_names()
for feature_name in tag_names:
    feature = features[feature_name]
    new_length = len(feature) - (len(feature) % block_size)
    feature = np.mean(feature[:new_length].reshape(-1, block_size), axis=1)
    feature = moving_average(feature, n=100)

    plt.plot(feature, label=feature_name)

plt.legend()
plt.ylabel('Average counts', fontsize=15)
plt.xlabel('Number of blocks (1 block = %i words)' % block_size, fontsize=15)
plt.title('Sentiment analysis of "Thus Spoke Zarathustra" over time', fontsize=15)

fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
fig.savefig('zarathustra.png', dpi=100)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
[![Build status](https://github.com/AntoinePassemiers/Lexicon-Based-Sentiment-Analysis/actions/workflows/build.yml/badge.svg)](https://github.com/AntoinePassemiers/Lexicon-Based-Sentiment-Analysis/actions?query=build)
# LBSA - Lexicon-based Sentiment Analysis

Fast library for sentiment analysis, opinion mining and language detection.

## Installation

Install the dependencies:
```sh
$ sudo pip3 install -r requirements.txt
```

From the parent folder, install the library by typing the following command:

```sh
$ sudo python3 setup.py install
```

To access the NRC lexicon, download it from:
http://www.saifmohammad.com/WebDocs/Lexicons/NRC-Emotion-Lexicon.zip

Extract it, and provide the path to the Excel file the first time you use the NRC lexicon.
For example:
```python
>>> path = 'path/to/NRC-Emotion-Lexicon-v0.92-In105Languages-Nov2017Translations.xlsx'
>>> sa_lexicon = lbsa.get_lexicon('sa', language='english', source='nrc', path=path)
```


### Dependencies

* numpy >= 1.13.3
* pandas >= 0.21.0
* openpyxl or xlrd (for reading the NRC Excel file)

## Features

### Sentiment analysis

```python
>>> import lbsa
>>> tweet = """
... The Budget Agreement today is so important for our great Military.
... It ends the dangerous sequester and gives Secretary Mattis what he needs to keep America Great.
... Republicans and Democrats must support our troops and support this Bill!
... """
>>> sa_lexicon = lbsa.get_lexicon('sa', language='english', source='nrc')
>>> sa_lexicon.process(tweet)
{'anger': 0, 'anticipation': 0, 'disgust': 0, 'fear': 2, 'joy': 0, 'sadness': 0,
'surprise': 0, 'trust': 3}
```

### Opinion mining

```python
>>> op_lexicon = lbsa.get_lexicon('opinion', language='english', source='nrc')
>>> op_lexicon.process(tweet)
{'positive': 2, 'negative': 1}
```

### Language detection

Language detection requires the NRC lexicon:

```python
>>> import lbsa
>>> tweet = """
... A la suite de la tempête #Eunice et à la demande du Président de la République,
... l’Etat décrétera dans les meilleurs délais l’état de catastrophe naturelle partout
... où cela s’avérera nécessaire.
... """
>>> lexicon = lbsa.get_lexicon('sa', language='auto', source='nrc')
>>> print(lexicon.process(tweet))
{'anger': 2, 'anticipation': 1, 'disgust': 1, 'fear': 2, 'joy': 0, 'sadness': 2, 'surprise': 2,
'trust': 0, 'lang': 'french'}
```

### Feature extractor

```python
>>> extractor = lbsa.FeatureExtractor(sa_lexicon, op_lexicon)
>>> extractor.process(tweet)
array([0., 0., 0., 2., 0., 0., 0., 3., 2., 1.])
```
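
Feature columns follow the order of the lexicons passed to the constructor;
`feature_names` spells the mapping out. For the two NRC lexicons built above,
this gives names of the form below:

```python
>>> extractor.feature_names
['nrc_anger', 'nrc_anticipation', 'nrc_disgust', 'nrc_fear', 'nrc_joy',
 'nrc_sadness', 'nrc_surprise', 'nrc_trust', 'nrc_positive', 'nrc_negative']
```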

#### Examples

Feature extractor:

[feature_extraction.py](https://github.com/AntoinePassemiers/Lexicon-Based-Sentiment-Analysis/blob/master/examples/feature_extraction.py)

Perform sentiment analysis over time on "Thus Spoke Zarathustra":

[book.py](https://github.com/AntoinePassemiers/Lexicon-Based-Sentiment-Analysis/blob/master/examples/book.py)

![Sentiment analysis of "Thus Spoke Zarathustra" over time](imgs/zarathustra.png)
--------------------------------------------------------------------------------
/data/README.txt:
--------------------------------------------------------------------------------

NRC Word-Emotion Association Lexicon
(NRC Emotion Lexicon)
Version 0.92
10 July 2011
Copyright (C) 2011 National Research Council Canada (NRC)
Contact: Saif Mohammad (saif.mohammad@nrc-cnrc.gc.ca)

1. This copy of the NRC Emotion Lexicon is to be used for research
purposes only. Please contact NRC if interested in a commercial
license.

2. If you use this lexicon in your research, then please cite
at least one of the papers listed below in the PUBLICATIONS section
(preferably the journal paper in Computational Intelligence).

.......................................................................

NRC EMOTION LEXICON
-------------------
The NRC emotion lexicon is a list of words and their associations with
eight emotions (anger, fear, anticipation, trust, surprise, sadness,
joy, and disgust) and two sentiments (negative and positive). The
annotations were manually done through Amazon's Mechanical Turk. Refer
to the publications below for more details.

.......................................................................

PUBLICATIONS
------------
Details of the lexicon can be found in the following peer-reviewed
publications:

-- Crowdsourcing a Word-Emotion Association Lexicon, Saif Mohammad and
Peter Turney, Computational Intelligence, 29(3), 436-465, 2013.

-- Tracking Sentiment in Mail: How Genders Differ on Emotional Axes,
Saif Mohammad and Tony Yang, In Proceedings of the ACL 2011 Workshop
on Computational Approaches to Subjectivity and Sentiment Analysis
(WASSA), June 2011, Portland, OR.

-- From Once Upon a Time to Happily Ever After: Tracking Emotions in
Novels and Fairy Tales, Saif Mohammad, In Proceedings of the ACL 2011
Workshop on Language Technology for Cultural Heritage, Social
Sciences, and Humanities (LaTeCH), June 2011, Portland, OR.

-- Emotions Evoked by Common Words and Phrases: Using Mechanical Turk
to Create an Emotion Lexicon, Saif Mohammad and Peter Turney, In
Proceedings of the NAACL-HLT 2010 Workshop on Computational Approaches
to Analysis and Generation of Emotion in Text, June 2010, LA,
California.

Links to the papers are available here:
http://www.purl.org/net/NRCemotionlexicon
.......................................................................

VERSION INFORMATION
-------------------
Version 0.92 is the latest version as of 10 July 2011. This version
has annotations for more than twice as many terms as Version 0.5,
which was released earlier.

.......................................................................

FORMAT
------
Each line has the following format:
TargetWord<tab>AffectCategory<tab>AssociationFlag

TargetWord is a word for which emotion associations are provided.

AffectCategory is one of eight emotions (anger, fear, anticipation,
trust, surprise, sadness, joy, or disgust) or one of two polarities
(negative or positive).

AssociationFlag has one of two possible values: 0 or 1. 0 indicates
that the target word has no association with the affect category,
whereas 1 indicates an association.
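
For illustration, a typical pair of lines (fields separated by tabs)
looks like this:

abandon<tab>fear<tab>1
abandon<tab>joy<tab>0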

.......................................................................

OTHER FORMS OF THE LEXICON
--------------------------

The original lexicon has annotations at word-sense level. Each
word-sense pair is annotated by at least three annotators (most are
annotated by at least five). The word-level lexicon was created by
taking the union of emotions associated with all the senses of a word.
Please contact NRC if interested in the sense-level lexicon or if
interested in more detailed information such as the individual
annotations by each of the annotators.

.......................................................................

CONTACT INFORMATION
-------------------
Saif Mohammad
Research Officer, National Research Council Canada
email: saif.mohammad@nrc-cnrc.gc.ca
phone: +1-613-993-0620

.......................................................................
--------------------------------------------------------------------------------
/src/lbsa.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# lbsa.py: lexicon-based sentiment analysis
# author : Antoine Passemiers

import os
import re
import sys
import io
import zipfile
import shutil
import requests

import numpy as np
import pandas as pd

from urllib.request import urlretrieve
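

# The pattern below pads every listed punctuation character with spaces so
# that str.split() isolates it as its own token. Note that ',-.' inside the
# character class is a three-character range covering ',', '-' and '.'.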
TOKENIZER = re.compile('([!"#$%&\'()*+,-./:;<=>?@[\\]^_`|~“”¨«»®´·º½¾¿¡§£₤‘’\n\t])')


class UnknownSource(Exception):

    def __init__(self, *args, **kwargs):
        Exception.__init__(self, *args, **kwargs)


class LexiconException(Exception):

    def __init__(self, *args, **kwargs):
        Exception.__init__(self, *args, **kwargs)


class Lexicon:

    def __init__(self, dataframe, tag_names, source, language='english'):
        self.dataframe = dataframe
        self.dataframe.rename(columns={c: Lexicon.reformat_language_name(c) for c in self.dataframe.columns}, inplace=True)
        self.tag_names = tag_names
        self.source = source
        self.language = language
        tags = np.asarray(self.dataframe[self.tag_names])
        self.table = {}
        for language in self.dataframe.columns:
            if language in tag_names:
                continue
            if language.startswith('unnamed'):
                continue

            words = self.dataframe[language]
            if isinstance(words, pd.DataFrame):
                words = words.iloc[:, 0]

            for key, value in zip(words, tags):
                if key not in self.table:
                    self.table[key] = {}
                self.table[key][language] = value

    @staticmethod
    def reformat_language_name(name):
        name = name.lower().strip()
        if '(' in name:
            name = name.split('(')[0].strip()
        return name

    def get(self, token):
        return self.table.get(token, None)

    def get_n_tags(self):
        return len(self.tag_names)

    def get_tag_names(self):
        return self.tag_names

    def process(self, text, as_dict=True):
        tokens = tokenize(text) if not isinstance(text, list) else text
        n_tags = self.get_n_tags()
        language_counts = {}
        counts = {}
        for token in tokens:
            results = self.get(token.lower())
            if results is not None:
                for language in results.keys():
                    if language not in counts:
                        counts[language] = np.zeros(n_tags, dtype=int)
                    counts[language] += results[language]
                    if language not in language_counts:
                        language_counts[language] = 0
                    language_counts[language] += 1

        if len(counts) == 0:
            counters = np.zeros(n_tags, dtype=int)
            # No token matched the lexicon: fall back to the configured language
            language = self.language if self.language != 'auto' else None
        else:
            # Select language
            if self.language == 'auto':
                languages = list(language_counts.keys())
                total_counts = [language_counts[language] for language in languages]
                language = languages[np.argmax(total_counts)]
                counters = counts[language]
            else:
                language = self.language
                if language not in counts:
                    # raise LexiconException(f'Could not find language "{language}". Found: {counts.keys()}')
                    counters = np.zeros(n_tags, dtype=int)
                else:
                    counters = counts[language]

        if as_dict:
            data = {name: counter for name, counter in zip(self.tag_names, counters)}
            data['lang'] = language
            return data
        else:
            return counters

    def __len__(self):
        return len(self.dataframe)


class FeatureExtractor:

    def __init__(self, *args):
        self.lexicons = list(args)
        self.sizes = [lexicon.get_n_tags() for lexicon in self.lexicons]
        self.offsets = np.cumsum([0] + self.sizes)
        self.n_features = sum(self.sizes)
        self.feature_names = list()
        for lexicon in self.lexicons:
            tag_names = [lexicon.source + '_' + name for name in lexicon.get_tag_names()]
            self.feature_names += tag_names

    def process(self, X):
        if isinstance(X, str):
            X = [X]
        elif len(X) == 0:
            return list()
        features = np.empty((len(X), self.n_features))
        for i, text in enumerate(X):
            tokens = tokenize(text)
            for j, lexicon in enumerate(self.lexicons):
                features[i, self.offsets[j]:self.offsets[j+1]] = lexicon.process(tokens, as_dict=False)
        return np.squeeze(features)


def make_time_analysis(text, lexicon):
    if isinstance(text, list):
        tokens = text
    else:
        tokens = tokenize(text)
    n_tags = lexicon.get_n_tags()
    tag_names = lexicon.get_tag_names()
    mask = np.zeros((len(tokens), n_tags), dtype=int)
    for token_id, token in enumerate(tokens):
        entry = lexicon.get(token.lower())
        # Lexicon.get returns a dict mapping languages to tag vectors,
        # so only count matches for the lexicon's configured language
        if entry is not None and lexicon.language in entry:
            mask[token_id, :] += entry[lexicon.language]
    data = {key: value for key, value in zip(tag_names, mask.T)}
    return data
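
# Usage sketch (hedged; see examples/book.py): with an NRC 'sa' lexicon,
# make_time_analysis(text, lexicon) returns a dict mapping each tag name
# ('anger', ..., 'trust') to a 0/1 array with one entry per token.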


class DownloadProgressBar:

    def __init__(self, prefix, length=30):
        self.prefix = prefix
        self.length = length
        self.downloaded = 0
        self.total_size = None
        self.update(0)

    def progress_hook(self, count, block_size, total_size):
        self.total_size = total_size
        self.downloaded += block_size
        progress = np.clip(float(self.downloaded) / float(self.total_size), 0., 1.)
        self.update(progress)

    def update(self, progress):
        percent = 100. * progress
        n_blocks = int(np.round(progress * self.length))
        bar = ('=' * n_blocks).ljust(self.length)
        sys.stdout.write('\r%s [%s] %.2f %%\r' % (self.prefix, bar, percent))
        if self.downloaded == self.total_size:
            sys.stdout.write('\n')
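
# Usage sketch (mirrors load_afinn_opinion_lexicon below):
#     progressbar = DownloadProgressBar('Downloading AFINN lexicon')
#     urlretrieve(url, filepath, reporthook=progressbar.progress_hook)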


def get_cache_dir():
    home = os.path.expanduser("~")
    LBSA_DATA_DIR = os.path.join(home, '.lbsa')
    if not os.path.isdir(LBSA_DATA_DIR):
        os.makedirs(LBSA_DATA_DIR)
    return LBSA_DATA_DIR


def load_nrc_lexicon(path=None):
    LBSA_DATA_DIR = get_cache_dir()
    nrc_filename = 'NRC-Emotion-Lexicon'

    def download_lexicon():
        if path is None:
            LEXICON_URL = f'http://www.saifmohammad.com/WebDocs/Lexicons/{nrc_filename}.zip'
            print('Downloading NRC lexicon...')
            req = requests.get(LEXICON_URL)
            with open(os.path.join(LBSA_DATA_DIR, f'{nrc_filename}.zip'), 'wb') as f:
                f.write(req.content)
            with zipfile.ZipFile(os.path.join(LBSA_DATA_DIR, f'{nrc_filename}.zip'), 'r') as zip_object:
                # ZipFile.extract() takes a target directory, not a target
                # filename, so extract first and move the file afterwards
                member = 'NRC-Emotion-Lexicon-v0.92/NRC-Emotion-Lexicon-v0.92-In105Languages-Nov2017Translations.xlsx'
                extracted = zip_object.extract(member, LBSA_DATA_DIR)
                shutil.move(extracted, os.path.join(LBSA_DATA_DIR, f'{nrc_filename}.xlsx'))
        else:
            shutil.copyfile(path, os.path.join(LBSA_DATA_DIR, f'{nrc_filename}.xlsx'))

    if not os.path.exists(os.path.join(LBSA_DATA_DIR, f'{nrc_filename}.csv')):
        # Download lexicon in XLSX format
        if not os.path.exists(os.path.join(LBSA_DATA_DIR, f'{nrc_filename}.xlsx')):
            download_lexicon()

        # Convert from XLSX to CSV file
        filepath = os.path.join(LBSA_DATA_DIR, f'{nrc_filename}.xlsx')
        df = pd.read_excel(filepath, sheet_name='NRC-Lex-v0.92-word-translations')
        df.to_csv(os.path.join(LBSA_DATA_DIR, f'{nrc_filename}.csv'))

        # Remove XLSX file
        os.remove(filepath)

    sentiment_names = ["positive", "negative", "anger", "anticipation", "disgust", "fear", "joy", "sadness", "surprise", "trust"]
    lexicon_path = os.path.join(LBSA_DATA_DIR, f'{nrc_filename}.csv')
    nrc_all_languages = pd.read_csv(lexicon_path, encoding='utf8', index_col=False)
    nrc_all_languages.rename(columns=lambda x: x.replace('Word', '').split('Translation')[0].rstrip(' ').lower(), inplace=True)
    for column_name in sentiment_names:
        nrc_all_languages[column_name] = nrc_all_languages[column_name].astype(np.int32)
    return nrc_all_languages, sentiment_names


"""
def load_bing_opinion_lexicon():
    LBSA_DATA_DIR = get_cache_dir()
    bing_filename = "opinion-lexicon-English"
    if not os.path.isdir(os.path.join(LBSA_DATA_DIR, "bing")):
        os.makedirs(os.path.join(LBSA_DATA_DIR, "bing"))
    if not os.path.exists(os.path.join(LBSA_DATA_DIR, "bing/positive.txt")):
        # Download rar archive
        LEXICON_URL = "http://www.cs.uic.edu/~liub/FBS/%s.rar" % bing_filename
        filepath = os.path.join(LBSA_DATA_DIR, "%s.rar" % bing_filename)
        urlretrieve(LEXICON_URL, filepath)
        rar = rarfile.RarFile(filepath)
        rar.extractall(path=os.path.join(LBSA_DATA_DIR, "bing"))
    # TODO
"""
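

# Each line of the MPQA .tff file holds six space-separated key=value fields;
# a typical entry (format as distributed with the MPQA subjectivity clues):
#     type=strongsubj len=1 word1=abuse pos1=verb stemmed1=y priorpolarity=negative
# The parser below keeps word1, type and priorpolarity.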
def load_mpqa_subjectivity_lexicon(name='', organization='', email='', path=None):
    assert path is None, 'Custom paths are not supported for the MPQA lexicon'
    LBSA_DATA_DIR = get_cache_dir()
    if not os.path.isdir(os.path.join(LBSA_DATA_DIR, "mpqa")):
        os.makedirs(os.path.join(LBSA_DATA_DIR, "mpqa"))
    filepath = os.path.join(LBSA_DATA_DIR, 'mpqa/subjectivity_clues_hltemnlp05/subjclueslen1-HLTEMNLP05.tff')
    if not os.path.exists(filepath):
        print('Downloading mpqa lexicon...')
        response = requests.post(
            "http://mpqa.cs.pitt.edu/request_resource.php",
            data={"name": name, "organization": organization, "email": email, "dataset": "subj_lexicon"})
        zf = zipfile.ZipFile(io.BytesIO(response.content))
        zf.extractall(path=os.path.join(LBSA_DATA_DIR, "mpqa"))

    with open(filepath) as f:
        words, positive, negative, strong_subj = list(), list(), list(), list()
        for line in f.readlines():
            items = line.rstrip().split(' ')
            if len(items) == 6:
                words.append(items[2].split('=')[1])
                strong_subj.append(1 if (items[0].split('=')[1] == 'strongsubj') else 0)
                positive.append(1 if items[5].split('=')[1] in ['positive', 'both'] else 0)
                negative.append(1 if items[5].split('=')[1] in ['negative', 'both'] else 0)
    return pd.DataFrame({
        'english': words,
        'positive': np.asarray(positive, dtype=int),
        'negative': np.asarray(negative, dtype=int),
        'strong_subjectivity': np.asarray(strong_subj, dtype=int)
    })


def load_afinn_opinion_lexicon(path=None):
    LBSA_DATA_DIR = get_cache_dir()
    if not os.path.isdir(os.path.join(LBSA_DATA_DIR, "afinn")):
        os.makedirs(os.path.join(LBSA_DATA_DIR, "afinn"))
    if not os.path.exists(os.path.join(LBSA_DATA_DIR, "afinn/AFINN/AFINN-111.txt")):

        if path is None:
            # Download zip archive
            LEXICON_URL = 'http://www2.imm.dtu.dk/pubdb/views/edoc_download.php/6010/zip/imm6010.zip'
            filepath = os.path.join(LBSA_DATA_DIR, "afinn/imm6010.zip")
            progressbar = DownloadProgressBar('Downloading AFINN lexicon')
            urlretrieve(LEXICON_URL, filepath, reporthook=progressbar.progress_hook)
            print('')
            with zipfile.ZipFile(filepath) as zf:
                zf.extractall(path=os.path.join(LBSA_DATA_DIR, "afinn"))
            # Remove zip archive
            os.remove(filepath)
        else:
            shutil.copyfile(path, os.path.join(LBSA_DATA_DIR, 'afinn/AFINN/AFINN-111.txt'))

    words, values = list(), list()
    with open(os.path.join(LBSA_DATA_DIR, 'afinn/AFINN/AFINN-111.txt')) as f:
        for line in f.readlines():
            items = line.rstrip().split('\t')
            if len(items) == 2:
                words.append(items[0])
                values.append(int(items[1]))
    values = np.asarray(values, dtype=int)
    positives = np.zeros(len(values), dtype=int)
    negatives = np.zeros(len(values), dtype=int)
    positives[values > 0] = values[values > 0]
    negatives[values < 0] = -values[values < 0]
    return pd.DataFrame({
        'english': words,
        'positive': positives,
        'negative': negatives
    })


def tokenize(text):
    return TOKENIZER.sub(r' \1 ', text).split()
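
# Illustrative behaviour (assuming the TOKENIZER pattern above):
#     tokenize("Don't stop!") -> ['Don', "'", 't', 'stop', '!']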


def create_sa_lexicon(source='nrc', language='english', path=None):
    if source == 'nrc':
        nrc_all_languages, tag_names = load_nrc_lexicon(path=path)
        to_remove = ['positive', 'negative']
        nrc_all_languages.drop(to_remove, axis=1, inplace=True)
        for tag_name in to_remove:
            tag_names.remove(tag_name)
        lexicon = Lexicon(nrc_all_languages, tag_names, source, language=language)
    else:
        raise UnknownSource('Source %s does not provide any available sentiment analysis lexicon' % source)
    return lexicon


def create_opinion_lexicon(source='nrc', language='english', path=None):
    if source == 'nrc':
        nrc_all_languages, tag_names = load_nrc_lexicon(path=path)
        to_remove = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust']
        nrc_all_languages.drop(to_remove, axis=1, inplace=True)
        for tag_name in to_remove:
            tag_names.remove(tag_name)
        lexicon = Lexicon(nrc_all_languages, tag_names, source, language=language)
    elif source == 'afinn':
        ol = load_afinn_opinion_lexicon(path=path)
        lexicon = Lexicon(ol, ['positive', 'negative'], source, language=language)
    elif source == 'mpqa':
        ol = load_mpqa_subjectivity_lexicon(path=path)
        lexicon = Lexicon(ol, ['positive', 'negative', 'strong_subjectivity'], source, language=language)
    else:
        raise UnknownSource('Source %s does not provide any available opinion/subjectivity lexicon' % source)
    return lexicon


def get_lexicon(lexicon_type, **kwargs):
    assert lexicon_type in ['sa', 'opinion']
    if lexicon_type == 'sa':
        return create_sa_lexicon(**kwargs)
    else:
        return create_opinion_lexicon(**kwargs)
--------------------------------------------------------------------------------