├── examples
    ├── __init__.py
    └── liwc.py
├── liwc
    ├── __init__.py
    ├── test_liwc.py
    └── liwc.py
├── .travis.yml
├── .gitignore
├── requirements.txt
├── setup.py
├── LICENSE
└── README.md


/examples/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/liwc/__init__.py:
--------------------------------------------------------------------------------
1 | from .liwc import Liwc
2 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 |   - "3.6"
4 | install:
5 |   - pip install -r requirements.txt
6 | script:
7 |   - pytest


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | 
 5 | # C extensions
 6 | *.so
 7 | 
 8 | .pytest_cache/
 9 | .idea/
10 | build
11 | dist
12 | liwc_text_analysis.egg-info
13 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | atomicwrites==1.3.0
 2 | attrs==19.1.0
 3 | importlib-metadata==0.19
 4 | more-itertools==7.2.0
 5 | packaging==19.1
 6 | pluggy==0.12.0
 7 | py==1.8.0
 8 | pyparsing==2.4.2
 9 | pytest==5.1.2
10 | six==1.12.0
11 | wcwidth==0.1.7
12 | zipp==0.6.0
13 | 


--------------------------------------------------------------------------------
/examples/liwc.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Written by Evan Lalopoulos <evan.lalopoulos.2017@my.bristol.ac.uk>
 4 | University of Bristol, May 2018
 5 | Copyright (C) - All Rights Reserved
 6 | """
 7 | 
 8 | import os
 9 | 
10 | from liwc import Liwc
11 | 
# Location of the LIWC dictionary (.dic) file; replace it with the path to
# your own copy. By default it is looked up next to this script.
LIWC_FILEPATH = os.path.abspath(os.path.join(
    os.path.dirname(__file__),
    'LIWC2007_English.dic',
))

if __name__ == "__main__":
    analyzer = Liwc(LIWC_FILEPATH)

    # Categories a single word belongs to.
    word_categories = analyzer.search('happy')
    print(word_categories)

    # Per-category counts for a sentence tokenised on single spaces.
    tokens = 'I love ice cream.'.split(' ')
    print(analyzer.parse(tokens))
21 | 
22 | 
23 | 
24 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import setuptools
 3 | 
 4 | HERE = os.path.abspath(os.path.dirname(__file__))
 5 | 
 6 | with open("README.md", "r") as fh:
 7 |     long_description = fh.read()
 8 | 
 9 | with open(os.path.join(HERE, 'requirements.txt'), "r") as fp:
10 |     install_reqs = fp.read().splitlines()
11 | 
12 | setuptools.setup(
13 |     name="liwc-text-analysis",
14 |     version="1.0.2",
15 |     author="Evan Lalopoulos",
16 |     author_email="evan.lalopoulos.2017@my.bristol.ac.uk",
17 |     description="A python package for the Linguistic Inquiry and Word Count (LIWC) dictionary.",
18 |     long_description=long_description,
19 |     long_description_content_type="text/markdown",
20 |     url="https://github.com/evanll/liwc-text-analysis-python",
21 |     packages=setuptools.find_packages(),
22 |     classifiers=[
23 |         "Programming Language :: Python :: 3",
24 |         "Operating System :: OS Independent",
25 |     ],
26 |     python_requires='>=3.6',
27 |     install_requires=install_reqs
28 | )
29 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Evan Lalopoulos
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | [![Build Status](https://travis-ci.org/evanll/liwc-text-analysis-python.svg?branch=master)](https://travis-ci.org/evanll/liwc-text-analysis-python)
 2 | 
 3 | # LIWC Text Analysis - Python
 4 | A Python package for the Linguistic Inquiry and Word Count (LIWC) dictionary. This package requires the proprietary LIWC dictionary file (.dic), which can be obtained from [LIWC.net](http://liwc.net/). 
 5 | 
 6 | ## Usage
 7 | ```python
 8 | >>> from liwc import Liwc
 9 | >>> liwc = Liwc(LIWC_FILEPATH)
10 | >>> # Search a word in the dictionary to find in which LIWC categories it belongs
11 | >>> print(liwc.search('happy'))
12 | ['affect', 'posemo']
13 | >>> # Extract raw counts of words in a document that fall into the various LIWC categories
14 | >>> print(liwc.parse('I love ice cream.'.split(' ')))
15 | Counter({'verb': 1, 'present': 1, 'affect': 1, 'posemo': 1, 'bio': 1, 'sexual': 1, 'social': 1})
16 | ```
17 | ## Tests
18 | The project comes with an extensive set of unit tests. The Pytest framework is used for unit testing. 
19 | To run the tests use:  
20 | `pytest`
21 | 
22 | ## Project repository
23 | https://github.com/evanll/liwc-text-analysis-python
24 | 
25 | ## Author
26 | Written by Evan Lalopoulos <evan.lalopoulos.2017@my.bristol.ac.uk>
27 | 
28 | **Evan Lalopoulos** - [evanll](https://github.com/evanll)
29 | 


--------------------------------------------------------------------------------
/liwc/test_liwc.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Written by Evan Lalopoulos <evan.lalopoulos.2017@my.bristol.ac.uk>
  4 | University of Bristol, May 2018
  5 | Copyright (C) - All Rights Reserved
  6 | """
  7 | 
  8 | from .liwc import Liwc
  9 | 
 10 | WORD_CAT_DICT_1 = {
 11 |     "love": [
 12 |         "affect",
 13 |         "posemo"
 14 |     ],
 15 |     "loved": [
 16 |         "affect",
 17 |         "posemo"
 18 |     ]
 19 | }
 20 | 
 21 | WORD_CAT_DICT_2 = {
 22 |     "abandon*": [
 23 |         "affect",
 24 |         "negemo"
 25 |     ],
 26 |     "absolute": [
 27 |         "cogmech"
 28 |     ],
 29 |     "love": [
 30 |         "affect",
 31 |         "posemo"
 32 |     ]
 33 | }
 34 | 
 35 | 
 36 | def test_build_trie():
 37 |     expected = {
 38 |         "l": {
 39 |             "o": {
 40 |                 "v": {
 41 |                     "e": {
 42 |                         "$": [
 43 |                             "affect",
 44 |                             "posemo"
 45 |                         ],
 46 |                         "d": {
 47 |                             "$": [
 48 |                                 "affect",
 49 |                                 "posemo"
 50 |                             ]
 51 |                         }
 52 |                     }
 53 |                 }
 54 |             }
 55 |         }
 56 |     }
 57 |     assert Liwc._build_char_trie(WORD_CAT_DICT_1) == expected
 58 | 
 59 | 
 60 | def test_build_trie_wildcard():
 61 |     expected = {
 62 |         "a": {
 63 |             "b": {
 64 |                 "a": {
 65 |                     "n": {
 66 |                         "d": {
 67 |                             "o": {
 68 |                                 "n": {
 69 |                                     "*": [
 70 |                                         "affect",
 71 |                                         "negemo"
 72 |                                     ],
 73 |                                     "$": [
 74 |                                         "affect",
 75 |                                         "negemo"
 76 |                                     ]
 77 |                                 }
 78 |                             }
 79 |                         }
 80 |                     }
 81 |                 },
 82 |                 "s": {
 83 |                     "o": {
 84 |                         "l": {
 85 |                             "u": {
 86 |                                 "t": {
 87 |                                     "e": {
 88 |                                         "$": [
 89 |                                             "cogmech"
 90 |                                         ]
 91 |                                     }
 92 |                                 }
 93 |                             }
 94 |                         }
 95 |                     }
 96 |                 }
 97 |             }
 98 |         },
 99 |         "l": {
100 |             "o": {
101 |                 "v": {
102 |                     "e": {
103 |                         "$": [
104 |                             "affect",
105 |                             "posemo"
106 |                         ]
107 |                     }
108 |                 }
109 |             }
110 |         }
111 |     }
112 | 
113 |     assert Liwc._build_char_trie(WORD_CAT_DICT_2) == expected
114 | 
115 | 
def test_search_trie():
    """Exact entries resolve to their category lists."""
    char_trie = Liwc._build_char_trie(WORD_CAT_DICT_1)

    for word in ('love', 'loved'):
        assert Liwc._search_trie(char_trie, word) == ["affect", "posemo"]
121 | 
122 | 
def test_search_wildcard():
    """A wildcard entry matches its bare stem and any longer word."""
    char_trie = Liwc._build_char_trie(WORD_CAT_DICT_2)

    for word in ('abandon', 'abandonment'):
        assert Liwc._search_trie(char_trie, word) == ["affect", "negemo"]
128 | 


--------------------------------------------------------------------------------
/liwc/liwc.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Written by Evan Lalopoulos <evan.lalopoulos.2017@my.bristol.ac.uk>
  4 | University of Bristol, May 2018
  5 | Copyright (C) - All Rights Reserved
  6 | """
  7 | 
  8 | import collections
  9 | 
 10 | 
 11 | class Liwc():
 12 |     """
 13 |     Class for the Linguistic Inquiry and Word Count (LIWC) dictionairy.
 14 |     The dictionary files are proprietary and can be obtained by liwc.net
 15 |     """
 16 | 
 17 |     def __init__(self, filepath):
 18 |         """
 19 |         :param filepath: path to the LIWC .dic file.
 20 |         """
 21 |         self.categories, self.lexicon = self._load_dict_file(filepath)
 22 |         self._trie = self._build_char_trie(self.lexicon)
 23 | 
 24 |     def search(self, word):
 25 |         """
 26 |         Search a word in the liwc dictionairy.
 27 | 
 28 |         :param word:
 29 |         :return: a list of the liwc categories the word belongs.
 30 |                  an empty list if the word is not found in the dictionary.
 31 |         """
 32 |         return self._search_trie(self._trie, word)
 33 | 
 34 |     def parse(self, tokens):
 35 |         """
 36 |         Parses a document and extracts raw counts of words that fall into the
 37 |         various LIWC categories.
 38 | 
 39 |         :param tokens: a list of tokens, a tokeniSed document
 40 |         :return: a counter with the linguistic categories found in the doc,
 41 |                 and the raw count of words that fall in each category.
 42 |         """
 43 |         cat_counter = collections.Counter()
 44 | 
 45 |         for token in tokens:
 46 |             # Find in which categories this token falls, if any
 47 |             cats = self.search(token)
 48 |             for cat in cats:
 49 |                 cat_counter[cat] += 1
 50 | 
 51 |         return cat_counter
 52 | 
 53 |     def _load_dict_file(self, filepath):
 54 |         liwc_file = open(filepath)
 55 | 
 56 |         # Key, category dict
 57 |         categories = {}
 58 | 
 59 |         # Word, cat_name dict
 60 |         lexicon = {}
 61 | 
 62 |         # '%' signals a change in the .dic file.
 63 |         # (0-1) Cats, ids
 64 |         # (>1) Words, cat_ids
 65 |         percent_sign_count = 0
 66 | 
 67 |         for line in liwc_file:
 68 |             stp = line.strip()
 69 | 
 70 |             if stp:
 71 |                 parts = stp.split('\t')
 72 | 
 73 |                 if parts[0] == '%':
 74 |                     percent_sign_count += 1
 75 |                 else:
 76 |                     # If the percent sign counter equals 1, parse the LIWC
 77 |                     # categories
 78 |                     if percent_sign_count == 1:
 79 |                         categories[parts[0]] = parts[1]
 80 |                     # Else, parse lexicon
 81 |                     else:
 82 |                         lexicon[parts[0]] = [categories[cat_id]
 83 |                                              for cat_id in parts[1:]]
 84 | 
 85 |         return categories, lexicon
 86 | 
 87 |     @staticmethod
 88 |     def _build_char_trie(lexicon):
 89 |         """
 90 |         Builds a char trie, to cater for wildcard ('*') matches.
 91 |         """
 92 |         trie = {}
 93 |         for pattern, cat_names in lexicon.items():
 94 |             cursor = trie
 95 |             for char in pattern:
 96 |                 if char == '*':
 97 |                     cursor['*'] = cat_names
 98 |                     break
 99 | 
100 |                 if char not in cursor:
101 |                     cursor[char] = {}
102 | 
103 |                 cursor = cursor[char]
104 | 
105 |             # $ signifies end of token
106 |             cursor['$'] = cat_names
107 | 
108 |         return trie
109 | 
110 |     @staticmethod
111 |     def _search_trie(trie, token, i=0):
112 |         """
113 |         Search the given char trie for paths that match the token.
114 |         """
115 |         if '*' in trie:
116 |             return trie['*']
117 |         elif '$' in trie and i == len(token):
118 |             return trie['$']
119 |         elif i < len(token):
120 |             char = token[i]
121 |             if char in trie:
122 |                 return Liwc._search_trie(trie[char], token, i + 1)
123 |         return []
124 | 


--------------------------------------------------------------------------------