├── requirements.txt
├── .travis.yml
├── imgs
│   └── zarathustra.png
├── test.py
├── .gitignore
├── setup.py
├── test
│   └── test_lbda.py
├── .github
│   └── workflows
│       └── build.yml
├── examples
│   ├── tweets.py
│   ├── feature_extraction.py
│   └── book.py
├── README.md
├── data
│   └── README.txt
└── src
    └── lbsa.py

/requirements.txt:
--------------------------------------------------------------------------------
numpy==1.21.0
pandas
requests
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: python
python:
  - "3.6"
script: nosetests -v test/test_lbda.py
--------------------------------------------------------------------------------
/imgs/zarathustra.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AntoinePassemiers/Lexicon-Based-Sentiment-Analysis/HEAD/imgs/zarathustra.png
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
import pandas as pd


df = pd.read_excel('NRC-Emotion-Lexicon-v0.92-In105Languages-Nov2017Translations.xlsx', sheet_name='NRC-Lex-v0.92-word-translations')

print(df)
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
*.csv

__pycache__/
build/
dist/
lbsa.egg-info/

NRC-Emotion-Lexicon-v0.92-In105Languages-Nov2017Translations.xlsx
thus_spoke_zarathustra.txt
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# setup.py
# author : Antoine Passemiers

import os
from setuptools import setup, find_packages


datafiles = [(d, [os.path.join(d, f) for f in files])
             for d, folders, files in os.walk('data')]

setup(
    name='lbsa',
    version='0.0.1',
    author='Antoine Passemiers',
    description='Lexicon-based sentiment analysis',
    packages=find_packages('src'),
    include_package_data=True,
    package_dir={"": "src"},
    py_modules=["lbsa"],
    data_files=datafiles,
    url='https://github.com/AntoinePassemiers/Lexicon-Based-Sentiment-Analysis',
    install_requires=[
        'numpy >= 1.13.3',
        'matplotlib >= 2.0.2',
        'pandas',
        'requests'
    ],
)
--------------------------------------------------------------------------------
/test/test_lbda.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# test_lbda.py
# author : Antoine Passemiers

import lbsa


def test_lbsa():
    reviews = [
        'You should get this game, because in this game you can destroy other cars with really AWESOME guns like a acid thrower',
        'A great dev, but a mediocre app. You just tap the screen . In a word : BORING . Don\'t get this app.',
        'Even at free it was too expensive. Total waste of time and space. Save yourself the trouble of having to remove it by not downloading it in the first place.',
        'Works flawlessly with my favorite stations. I highly recommend this app as it makes finding a stream for your favorite local radio stations a breeze.'
    ]

    afinn_lexicon = lbsa.get_lexicon('opinion', language='english', source='afinn')
    mpqa_lexicon = lbsa.get_lexicon('opinion', language='english', source='mpqa')

    extractor = lbsa.FeatureExtractor(afinn_lexicon, mpqa_lexicon)
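    # Hedged sanity check (assumes, per FeatureExtractor.process in
    # src/lbsa.py, that a list input yields one feature row per text;
    # afinn contributes 2 tags and mpqa 3, so 5 columns here):
    features = extractor.process(reviews)
    assert features.shape == (len(reviews), extractor.n_features)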
--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
name: build

on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

jobs:
  build:

    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        python-version: [3.8]

    steps:
    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python3 -m pip install numpy pytest nose coverage
        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
        python3 setup.py install
    - name: Configure cc-test-reporter
      run: |
        curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
        chmod +x ./cc-test-reporter
        ./cc-test-reporter before-build
    - name: Test with nose
      run: |
        coverage run -m nose -v test
--------------------------------------------------------------------------------
/examples/tweets.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# tweets.py
# author : Antoine Passemiers

import lbsa


tweet = """
The Budget Agreement today is so important for our great Military.
It ends the dangerous sequester and gives Secretary Mattis what he needs to keep America Great.
Republicans and Democrats must support our troops and support this Bill!
"""

print('\nUse NRC lexicon')
lexicon = lbsa.get_lexicon('opinion', language='english', source='nrc')
print(lexicon.process(tweet))

print('\nUse afinn lexicon')
lexicon = lbsa.get_lexicon('opinion', language='english', source='afinn')
print(lexicon.process(tweet))

print('\nUse mpqa lexicon')
lexicon = lbsa.get_lexicon('opinion', language='english', source='mpqa')
print(lexicon.process(tweet))

tweet2 = """
A la suite de la tempête #Eunice et à la demande du Président de la République,
l’Etat décrétera dans les meilleurs délais l’état de catastrophe naturelle partout
où cela s’avérera nécessaire.
"""
# English gloss: "Following storm #Eunice, and at the request of the President
# of the Republic, the State will declare a state of natural disaster as soon
# as possible wherever it proves necessary."
print('\nAuto-detect languages')
lexicon = lbsa.get_lexicon('sa', language='auto', source='nrc')
print(lexicon.process(tweet))
print(lexicon.process(tweet2))
--------------------------------------------------------------------------------
/examples/feature_extraction.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# feature_extraction.py
# author : Antoine Passemiers

import lbsa

reviews = [
    'You should get this game, because in this game you can destroy other cars with really AWESOME guns like a acid thrower',
    'A great dev, but a mediocre app. You just tap the screen . In a word : BORING . Don\'t get this app.',
    'Even at free it was too expensive. Total waste of time and space. Save yourself the trouble of having to remove it by not downloading it in the first place.',
    'Works flawlessly with my favorite stations. I highly recommend this app as it makes finding a stream for your favorite local radio stations a breeze.'
]

afinn_lexicon = lbsa.get_lexicon('opinion', language='english', source='afinn')
nrc_lexicon = lbsa.get_lexicon('opinion', language='english', source='nrc')
nrc_sa_lexicon = lbsa.get_lexicon('sa', language='english', source='nrc')
mpqa_lexicon = lbsa.get_lexicon('opinion', language='english', source='mpqa')

extractor = lbsa.FeatureExtractor(afinn_lexicon, nrc_lexicon, nrc_sa_lexicon, mpqa_lexicon)

print('Feature names:')
print('{}\n'.format(extractor.feature_names))

print(extractor.process(reviews))
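
# Shape note (hedged): the four lexicons above expose 2 (afinn) + 2 (NRC
# opinion) + 8 (NRC sentiment) + 3 (MPQA) = 15 tags, so process() returns a
# (len(reviews), 15) array whose columns follow extractor.feature_names.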
--------------------------------------------------------------------------------
/examples/book.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# book.py: sentiment analysis of a book over time
# author : Antoine Passemiers

import lbsa

import numpy as np
import matplotlib.pyplot as plt


def moving_average(sequence, n=1000):
    ma = np.cumsum(sequence, axis=0)
    ma[n:] = ma[n:] - ma[:-n]
    return ma[n - 1:] / n


# https://archive.org/stream/thusspokezarathu00nietuoft/thusspokezarathu00nietuoft_djvu.txt
with open('../data/thus_spoke_zarathustra.txt', 'r') as f:
    text = f.read()
lexicon = lbsa.get_lexicon('sa', language='english')
features = lbsa.make_time_analysis(text, lexicon)

block_size = 100

tag_names = lexicon.get_tag_names()
for feature_name in tag_names:
    feature = features[feature_name]
    new_length = len(feature) - (len(feature) % block_size)
    feature = np.mean(feature[:new_length].reshape(-1, block_size), axis=1)
    feature = moving_average(feature, n=100)

    plt.plot(feature, label=feature_name)

plt.legend()
plt.ylabel('Average counts', fontsize=15)
plt.xlabel('Number of blocks (1 block = %i words)' % block_size, fontsize=15)
plt.title('Sentiment analysis of "Thus Spoke Zarathustra" over time', fontsize=15)

fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
fig.savefig('zarathustra.png', dpi=100)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
[![Build status](https://github.com/AntoinePassemiers/Lexicon-Based-Sentiment-Analysis/actions/workflows/build.yml/badge.svg)](https://github.com/AntoinePassemiers/Lexicon-Based-Sentiment-Analysis/actions?query=build)
# LBSA - Lexicon-based Sentiment Analysis

Fast library for sentiment analysis, opinion mining and language detection.

## Installation

Install the dependencies:
```sh
$ sudo pip3 install -r requirements.txt
```

From the parent folder, install the library by typing the following command:

```sh
$ sudo python3 setup.py install
```

To access the NRC lexicon, download it from:
http://www.saifmohammad.com/WebDocs/Lexicons/NRC-Emotion-Lexicon.zip

Extract it, and provide the path to the Excel file the first time you use the NRC lexicon.
For example:
```python
>>> path = 'path/to/NRC-Emotion-Lexicon-v0.92-In105Languages-Nov2017Translations.xlsx'
>>> sa_lexicon = lbsa.get_lexicon('sa', language='english', source='nrc', path=path)
```


### Dependencies

* numpy >= 1.13.3
* pandas >= 0.21.0
* openpyxl or xlrd (for reading the NRC Excel file)

## Features

### Sentiment analysis

```python
>>> import lbsa
>>> tweet = """
... The Budget Agreement today is so important for our great Military.
... It ends the dangerous sequester and gives Secretary Mattis what he needs to keep America Great.
... Republicans and Democrats must support our troops and support this Bill!
... """
>>> sa_lexicon = lbsa.get_lexicon('sa', language='english', source='nrc')
>>> sa_lexicon.process(tweet)
{'anger': 0, 'anticipation': 0, 'disgust': 0, 'fear': 2, 'joy': 0, 'sadness': 0,
'surprise': 0, 'trust': 3}
```

### Opinion mining

```python
>>> op_lexicon = lbsa.get_lexicon('opinion', language='english', source='nrc')
>>> op_lexicon.process(tweet)
{'positive': 2, 'negative': 1}
```

### Language detection

Language detection requires the NRC lexicon:

```python
>>> import lbsa
>>> tweet = """
... A la suite de la tempête #Eunice et à la demande du Président de la République,
... l’Etat décrétera dans les meilleurs délais l’état de catastrophe naturelle partout
... où cela s’avérera nécessaire.
... """
>>> lexicon = lbsa.get_lexicon('sa', language='auto', source='nrc')
>>> print(lexicon.process(tweet))
{'anger': 2, 'anticipation': 1, 'disgust': 1, 'fear': 2, 'joy': 0, 'sadness': 2, 'surprise': 2,
'trust': 0, 'lang': 'french'}
```

### Feature extractor

```python
>>> extractor = lbsa.FeatureExtractor(sa_lexicon, op_lexicon)
>>> extractor.process(tweet)
array([0., 0., 0., 2., 0., 0., 0., 3., 2., 1.])
```
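
Feature columns follow the order of the lexicons passed to the constructor;
`feature_names` spells the mapping out. For the two NRC lexicons built above,
this gives names of the form below:

```python
>>> extractor.feature_names
['nrc_anger', 'nrc_anticipation', 'nrc_disgust', 'nrc_fear', 'nrc_joy',
 'nrc_sadness', 'nrc_surprise', 'nrc_trust', 'nrc_positive', 'nrc_negative']
```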

#### Examples

Feature extractor:

[feature_extraction.py](https://github.com/AntoinePassemiers/Lexicon-Based-Sentiment-Analysis/blob/master/examples/feature_extraction.py)

Perform sentiment analysis over time on "Thus Spoke Zarathustra":

[book.py](https://github.com/AntoinePassemiers/Lexicon-Based-Sentiment-Analysis/blob/master/examples/book.py)

![Sentiment analysis of "Thus Spoke Zarathustra" over time](imgs/zarathustra.png)
--------------------------------------------------------------------------------
/data/README.txt:
--------------------------------------------------------------------------------

NRC Word-Emotion Association Lexicon
(NRC Emotion Lexicon)
Version 0.92
10 July 2011
Copyright (C) 2011 National Research Council Canada (NRC)
Contact: Saif Mohammad (saif.mohammad@nrc-cnrc.gc.ca)

1. This copy of the NRC Emotion Lexicon is to be used for research
purposes only. Please contact NRC if interested in a commercial
license.

2. If you use this lexicon in your research, then please cite
at least one of the papers listed below in the PUBLICATIONS section
(preferably the journal paper in Computational Intelligence).

.......................................................................

NRC EMOTION LEXICON
-------------------
The NRC emotion lexicon is a list of words and their associations with
eight emotions (anger, fear, anticipation, trust, surprise, sadness,
joy, and disgust) and two sentiments (negative and positive). The
annotations were manually done through Amazon's Mechanical Turk. Refer
to the publications below for more details.

.......................................................................

PUBLICATIONS
------------
Details of the lexicon can be found in the following peer-reviewed
publications:

-- Crowdsourcing a Word-Emotion Association Lexicon, Saif Mohammad and
Peter Turney, Computational Intelligence, 29(3), 436-465, 2013.

-- Tracking Sentiment in Mail: How Genders Differ on Emotional Axes,
Saif Mohammad and Tony Yang, In Proceedings of the ACL 2011 Workshop
on Computational Approaches to Subjectivity and Sentiment Analysis
(WASSA), June 2011, Portland, OR.

-- From Once Upon a Time to Happily Ever After: Tracking Emotions in
Novels and Fairy Tales, Saif Mohammad, In Proceedings of the ACL 2011
Workshop on Language Technology for Cultural Heritage, Social
Sciences, and Humanities (LaTeCH), June 2011, Portland, OR.

-- Emotions Evoked by Common Words and Phrases: Using Mechanical Turk
to Create an Emotion Lexicon, Saif Mohammad and Peter Turney, In
Proceedings of the NAACL-HLT 2010 Workshop on Computational Approaches
to Analysis and Generation of Emotion in Text, June 2010, LA,
California.

Links to the papers are available here:
http://www.purl.org/net/NRCemotionlexicon
.......................................................................

VERSION INFORMATION
-------------------
Version 0.92 is the latest version as of 10 July 2011. This version
has annotations for more than twice as many terms as Version 0.5,
which was released earlier.

.......................................................................

FORMAT
------
Each line has the following format:
TargetWord<tab>AffectCategory<tab>AssociationFlag

TargetWord is a word for which emotion associations are provided.

AffectCategory is one of eight emotions (anger, fear, anticipation,
trust, surprise, sadness, joy, or disgust) or one of two polarities
(negative or positive).

AssociationFlag has one of two possible values: 0 or 1. 0 indicates
that the target word has no association with the affect category,
whereas 1 indicates an association.
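
For illustration, a typical pair of lines (fields separated by tabs)
looks like this:

abandon<tab>fear<tab>1
abandon<tab>joy<tab>0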

.......................................................................

OTHER FORMS OF THE LEXICON
--------------------------

The original lexicon has annotations at word-sense level. Each
word-sense pair is annotated by at least three annotators (most are
annotated by at least five). The word-level lexicon was created by
taking the union of emotions associated with all the senses of a word.
Please contact NRC if interested in the sense-level lexicon or if
interested in more detailed information such as the individual
annotations by each of the annotators.

.......................................................................

CONTACT INFORMATION
-------------------
Saif Mohammad
Research Officer, National Research Council Canada
email: saif.mohammad@nrc-cnrc.gc.ca
phone: +1-613-993-0620

.......................................................................
--------------------------------------------------------------------------------
/src/lbsa.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# lbsa.py: lexicon-based sentiment analysis
# author : Antoine Passemiers

import os
import re
import sys
import io
import zipfile
import shutil
import requests

import numpy as np
import pandas as pd

from urllib.request import urlretrieve
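

# The pattern below pads every listed punctuation character with spaces so
# that str.split() isolates it as its own token. Note that ',-.' inside the
# character class is a three-character range covering ',', '-' and '.'.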
TOKENIZER = re.compile('([!"#$%&\'()*+,-./:;<=>?@[\\]^_`|~“”¨«»®´·º½¾¿¡§£₤‘’\n\t])')


class UnknownSource(Exception):

    def __init__(self, *args, **kwargs):
        Exception.__init__(self, *args, **kwargs)


class LexiconException(Exception):

    def __init__(self, *args, **kwargs):
        Exception.__init__(self, *args, **kwargs)


class Lexicon:

    def __init__(self, dataframe, tag_names, source, language='english'):
        self.dataframe = dataframe
        self.dataframe.rename(columns={c: Lexicon.reformat_language_name(c) for c in self.dataframe.columns}, inplace=True)
        self.tag_names = tag_names
        self.source = source
        self.language = language
        tags = np.asarray(self.dataframe[self.tag_names])
        self.table = {}
        for language in self.dataframe.columns:
            if language in tag_names:
                continue
            if language.startswith('unnamed'):
                continue

            words = self.dataframe[language]
            if isinstance(words, pd.DataFrame):
                words = words.iloc[:, 0]

            for key, value in zip(words, tags):
                if key not in self.table:
                    self.table[key] = {}
                self.table[key][language] = value

    @staticmethod
    def reformat_language_name(name):
        name = name.lower().strip()
        if '(' in name:
            name = name.split('(')[0].strip()
        return name

    def get(self, token):
        return self.table.get(token, None)

    def get_n_tags(self):
        return len(self.tag_names)

    def get_tag_names(self):
        return self.tag_names

    def process(self, text, as_dict=True):
        tokens = tokenize(text) if not isinstance(text, list) else text
        n_tags = self.get_n_tags()
        language_counts = {}
        counts = {}
        for token in tokens:
            results = self.get(token.lower())
            if results is not None:
                for language in results.keys():
                    if language not in counts:
                        counts[language] = np.zeros(n_tags, dtype=int)
                    counts[language] += results[language]
                    if language not in language_counts:
                        language_counts[language] = 0
                    language_counts[language] += 1

        if len(counts) == 0:
            counters = np.zeros(n_tags, dtype=int)
            # No token matched the lexicon: fall back to the configured language
            language = self.language if self.language != 'auto' else None
        else:
            # Select language
            if self.language == 'auto':
                languages = list(language_counts.keys())
                total_counts = [language_counts[language] for language in languages]
                language = languages[np.argmax(total_counts)]
                counters = counts[language]
            else:
                language = self.language
                if language not in counts:
                    # raise LexiconException(f'Could not find language "{language}". Found: {counts.keys()}')
                    counters = np.zeros(n_tags, dtype=int)
                else:
                    counters = counts[language]

        if as_dict:
            data = {name: counter for name, counter in zip(self.tag_names, counters)}
            data['lang'] = language
            return data
        else:
            return counters

    def __len__(self):
        return len(self.dataframe)


class FeatureExtractor:

    def __init__(self, *args):
        self.lexicons = list(args)
        self.sizes = [lexicon.get_n_tags() for lexicon in self.lexicons]
        self.offsets = np.cumsum([0] + self.sizes)
        self.n_features = sum(self.sizes)
        self.feature_names = list()
        for lexicon in self.lexicons:
            tag_names = [lexicon.source + '_' + name for name in lexicon.get_tag_names()]
            self.feature_names += tag_names

    def process(self, X):
        if isinstance(X, str):
            X = [X]
        elif len(X) == 0:
            return list()
        features = np.empty((len(X), self.n_features))
        for i, text in enumerate(X):
            tokens = tokenize(text)
            for j, lexicon in enumerate(self.lexicons):
                features[i, self.offsets[j]:self.offsets[j+1]] = lexicon.process(tokens, as_dict=False)
        return np.squeeze(features)


def make_time_analysis(text, lexicon):
    if isinstance(text, list):
        tokens = text
    else:
        tokens = tokenize(text)
    n_tags = lexicon.get_n_tags()
    tag_names = lexicon.get_tag_names()
    mask = np.zeros((len(tokens), n_tags), dtype=int)
    for token_id, token in enumerate(tokens):
        entry = lexicon.get(token.lower())
        # Lexicon.get returns a dict mapping languages to tag vectors,
        # so only count matches for the lexicon's configured language
        if entry is not None and lexicon.language in entry:
            mask[token_id, :] += entry[lexicon.language]
    data = {key: value for key, value in zip(tag_names, mask.T)}
    return data
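
# Usage sketch (hedged; see examples/book.py): with an NRC 'sa' lexicon,
# make_time_analysis(text, lexicon) returns a dict mapping each tag name
# ('anger', ..., 'trust') to a 0/1 array with one entry per token.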


class DownloadProgressBar:

    def __init__(self, prefix, length=30):
        self.prefix = prefix
        self.length = length
        self.downloaded = 0
        self.total_size = None
        self.update(0)

    def progress_hook(self, count, block_size, total_size):
        self.total_size = total_size
        self.downloaded += block_size
        progress = np.clip(float(self.downloaded) / float(self.total_size), 0., 1.)
        self.update(progress)

    def update(self, progress):
        percent = 100. * progress
        n_blocks = int(np.round(progress * self.length))
        bar = ('=' * n_blocks).ljust(self.length)
        sys.stdout.write('\r%s [%s] %.2f %%\r' % (self.prefix, bar, percent))
        if self.downloaded == self.total_size:
            sys.stdout.write('\n')
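
# Usage sketch (mirrors load_afinn_opinion_lexicon below):
#     progressbar = DownloadProgressBar('Downloading AFINN lexicon')
#     urlretrieve(url, filepath, reporthook=progressbar.progress_hook)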


def get_cache_dir():
    home = os.path.expanduser("~")
    LBSA_DATA_DIR = os.path.join(home, '.lbsa')
    if not os.path.isdir(LBSA_DATA_DIR):
        os.makedirs(LBSA_DATA_DIR)
    return LBSA_DATA_DIR


def load_nrc_lexicon(path=None):
    LBSA_DATA_DIR = get_cache_dir()
    nrc_filename = 'NRC-Emotion-Lexicon'

    def download_lexicon():
        if path is None:
            LEXICON_URL = f'http://www.saifmohammad.com/WebDocs/Lexicons/{nrc_filename}.zip'
            print('Downloading NRC lexicon...')
            req = requests.get(LEXICON_URL)
            with open(os.path.join(LBSA_DATA_DIR, f'{nrc_filename}.zip'), 'wb') as f:
                f.write(req.content)
            with zipfile.ZipFile(os.path.join(LBSA_DATA_DIR, f'{nrc_filename}.zip'), 'r') as zip_object:
                # ZipFile.extract() takes a target directory, not a target
                # filename, so extract first and move the file afterwards
                member = 'NRC-Emotion-Lexicon-v0.92/NRC-Emotion-Lexicon-v0.92-In105Languages-Nov2017Translations.xlsx'
                extracted = zip_object.extract(member, LBSA_DATA_DIR)
                shutil.move(extracted, os.path.join(LBSA_DATA_DIR, f'{nrc_filename}.xlsx'))
        else:
            shutil.copyfile(path, os.path.join(LBSA_DATA_DIR, f'{nrc_filename}.xlsx'))

    if not os.path.exists(os.path.join(LBSA_DATA_DIR, f'{nrc_filename}.csv')):
        # Download lexicon in XLSX format
        if not os.path.exists(os.path.join(LBSA_DATA_DIR, f'{nrc_filename}.xlsx')):
            download_lexicon()

        # Convert from XLSX to CSV file
        filepath = os.path.join(LBSA_DATA_DIR, f'{nrc_filename}.xlsx')
        df = pd.read_excel(filepath, sheet_name='NRC-Lex-v0.92-word-translations')
        df.to_csv(os.path.join(LBSA_DATA_DIR, f'{nrc_filename}.csv'))

        # Remove XLSX file
        os.remove(filepath)

    sentiment_names = ["positive", "negative", "anger", "anticipation", "disgust", "fear", "joy", "sadness", "surprise", "trust"]
    lexicon_path = os.path.join(LBSA_DATA_DIR, f'{nrc_filename}.csv')
    nrc_all_languages = pd.read_csv(lexicon_path, encoding='utf8', index_col=False)
    nrc_all_languages.rename(columns=lambda x: x.replace('Word', '').split('Translation')[0].rstrip(' ').lower(), inplace=True)
    for column_name in sentiment_names:
        nrc_all_languages[column_name] = nrc_all_languages[column_name].astype(np.int32)
    return nrc_all_languages, sentiment_names


"""
def load_bing_opinion_lexicon():
    LBSA_DATA_DIR = get_cache_dir()
    bing_filename = "opinion-lexicon-English"
    if not os.path.isdir(os.path.join(LBSA_DATA_DIR, "bing")):
        os.makedirs(os.path.join(LBSA_DATA_DIR, "bing"))
    if not os.path.exists(os.path.join(LBSA_DATA_DIR, "bing/positive.txt")):
        # Download rar archive
        LEXICON_URL = "http://www.cs.uic.edu/~liub/FBS/%s.rar" % bing_filename
        filepath = os.path.join(LBSA_DATA_DIR, "%s.rar" % bing_filename)
        urlretrieve(LEXICON_URL, filepath)
        rar = rarfile.RarFile(filepath)
        rar.extractall(path=os.path.join(LBSA_DATA_DIR, "bing"))
    # TODO
"""
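

# Each line of the MPQA .tff file holds six space-separated key=value fields;
# a typical entry (format as distributed with the MPQA subjectivity clues):
#     type=strongsubj len=1 word1=abuse pos1=verb stemmed1=y priorpolarity=negative
# The parser below keeps word1, type and priorpolarity.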
def load_mpqa_subjectivity_lexicon(name='', organization='', email='', path=None):
    assert path is None, 'Custom paths are not supported for the MPQA lexicon'
    LBSA_DATA_DIR = get_cache_dir()
    if not os.path.isdir(os.path.join(LBSA_DATA_DIR, "mpqa")):
        os.makedirs(os.path.join(LBSA_DATA_DIR, "mpqa"))
    filepath = os.path.join(LBSA_DATA_DIR, 'mpqa/subjectivity_clues_hltemnlp05/subjclueslen1-HLTEMNLP05.tff')
    if not os.path.exists(filepath):
        print('Downloading mpqa lexicon...')
        response = requests.post(
            "http://mpqa.cs.pitt.edu/request_resource.php",
            data={"name": name, "organization": organization, "email": email, "dataset": "subj_lexicon"})
        zf = zipfile.ZipFile(io.BytesIO(response.content))
        zf.extractall(path=os.path.join(LBSA_DATA_DIR, "mpqa"))

    with open(filepath) as f:
        words, positive, negative, strong_subj = list(), list(), list(), list()
        for line in f.readlines():
            items = line.rstrip().split(' ')
            if len(items) == 6:
                words.append(items[2].split('=')[1])
                strong_subj.append(1 if (items[0].split('=')[1] == 'strongsubj') else 0)
                positive.append(1 if items[5].split('=')[1] in ['positive', 'both'] else 0)
                negative.append(1 if items[5].split('=')[1] in ['negative', 'both'] else 0)
    return pd.DataFrame({
        'english': words,
        'positive': np.asarray(positive, dtype=int),
        'negative': np.asarray(negative, dtype=int),
        'strong_subjectivity': np.asarray(strong_subj, dtype=int)
    })


def load_afinn_opinion_lexicon(path=None):
    LBSA_DATA_DIR = get_cache_dir()
    if not os.path.isdir(os.path.join(LBSA_DATA_DIR, "afinn")):
        os.makedirs(os.path.join(LBSA_DATA_DIR, "afinn"))
    if not os.path.exists(os.path.join(LBSA_DATA_DIR, "afinn/AFINN/AFINN-111.txt")):

        if path is None:
            # Download zip archive
            LEXICON_URL = 'http://www2.imm.dtu.dk/pubdb/views/edoc_download.php/6010/zip/imm6010.zip'
            filepath = os.path.join(LBSA_DATA_DIR, "afinn/imm6010.zip")
            progressbar = DownloadProgressBar('Downloading AFINN lexicon')
            urlretrieve(LEXICON_URL, filepath, reporthook=progressbar.progress_hook)
            print('')
            with zipfile.ZipFile(filepath) as zf:
                zf.extractall(path=os.path.join(LBSA_DATA_DIR, "afinn"))
            # Remove zip archive
            os.remove(filepath)
        else:
            shutil.copyfile(path, os.path.join(LBSA_DATA_DIR, 'afinn/AFINN/AFINN-111.txt'))

    words, values = list(), list()
    with open(os.path.join(LBSA_DATA_DIR, 'afinn/AFINN/AFINN-111.txt')) as f:
        for line in f.readlines():
            items = line.rstrip().split('\t')
            if len(items) == 2:
                words.append(items[0])
                values.append(int(items[1]))
    values = np.asarray(values, dtype=int)
    positives = np.zeros(len(values), dtype=int)
    negatives = np.zeros(len(values), dtype=int)
    positives[values > 0] = values[values > 0]
    negatives[values < 0] = -values[values < 0]
    return pd.DataFrame({
        'english': words,
        'positive': positives,
        'negative': negatives
    })


def tokenize(text):
    return TOKENIZER.sub(r' \1 ', text).split()
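
# Illustrative behaviour (assuming the TOKENIZER pattern above):
#     tokenize("Don't stop!") -> ['Don', "'", 't', 'stop', '!']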


def create_sa_lexicon(source='nrc', language='english', path=None):
    if source == 'nrc':
        nrc_all_languages, tag_names = load_nrc_lexicon(path=path)
        to_remove = ['positive', 'negative']
        nrc_all_languages.drop(to_remove, axis=1, inplace=True)
        for tag_name in to_remove:
            tag_names.remove(tag_name)
        lexicon = Lexicon(nrc_all_languages, tag_names, source, language=language)
    else:
        raise UnknownSource('Source %s does not provide any available sentiment analysis lexicon' % source)
    return lexicon


def create_opinion_lexicon(source='nrc', language='english', path=None):
    if source == 'nrc':
        nrc_all_languages, tag_names = load_nrc_lexicon(path=path)
        to_remove = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust']
        nrc_all_languages.drop(to_remove, axis=1, inplace=True)
        for tag_name in to_remove:
            tag_names.remove(tag_name)
        lexicon = Lexicon(nrc_all_languages, tag_names, source, language=language)
    elif source == 'afinn':
        ol = load_afinn_opinion_lexicon(path=path)
        lexicon = Lexicon(ol, ['positive', 'negative'], source, language=language)
    elif source == 'mpqa':
        ol = load_mpqa_subjectivity_lexicon(path=path)
        lexicon = Lexicon(ol, ['positive', 'negative', 'strong_subjectivity'], source, language=language)
    else:
        raise UnknownSource('Source %s does not provide any available opinion/subjectivity lexicon' % source)
    return lexicon


def get_lexicon(lexicon_type, **kwargs):
    assert lexicon_type in ['sa', 'opinion']
    if lexicon_type == 'sa':
        return create_sa_lexicon(**kwargs)
    else:
        return create_opinion_lexicon(**kwargs)
--------------------------------------------------------------------------------