├── examples ├── __init__.py └── liwc.py ├── liwc ├── __init__.py ├── test_liwc.py └── liwc.py ├── .travis.yml ├── .gitignore ├── requirements.txt ├── setup.py ├── LICENSE └── README.md /examples/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /liwc/__init__.py: -------------------------------------------------------------------------------- 1 | from .liwc import Liwc 2 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | install: 5 | - pip install -r requirements.txt 6 | script: 7 | - pytest -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | .pytest_cache/ 9 | .idea/ 10 | build 11 | dist 12 | liwc_text_analysis.egg-info 13 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | atomicwrites==1.3.0 2 | attrs==19.1.0 3 | importlib-metadata==0.19 4 | more-itertools==7.2.0 5 | packaging==19.1 6 | pluggy==0.12.0 7 | py==1.8.0 8 | pyparsing==2.4.2 9 | pytest==5.1.2 10 | six==1.12.0 11 | wcwidth==0.1.7 12 | zipp==0.6.0 13 | -------------------------------------------------------------------------------- /examples/liwc.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Written by Evan Lalopoulos 4 | University of Bristol, May 2018 5 | Copyright (C) - All Rights Reserved 6 | """ 7 | 8 | import os 9 | 10 | from liwc import Liwc 11 | 12 | # 
"""Packaging script for the liwc-text-analysis distribution."""
import os

import setuptools

HERE = os.path.abspath(os.path.dirname(__file__))


def _read(filename):
    """
    Read a text file that lives next to this script.

    :param filename: file name relative to the directory of setup.py.
    :return: the file content as a string (decoded as UTF-8).
    """
    # Resolve against HERE so the build works from any working directory,
    # and pin the encoding so it does not depend on the platform locale.
    with open(os.path.join(HERE, filename), encoding="utf-8") as fh:
        return fh.read()


long_description = _read("README.md")

# Blank lines and '#' comments are legal in requirements.txt but are not
# valid requirement specifiers, so they are dropped here.
install_reqs = [
    line.strip()
    for line in _read("requirements.txt").splitlines()
    if line.strip() and not line.lstrip().startswith("#")
]

setuptools.setup(
    name="liwc-text-analysis",
    version="1.0.2",
    author="Evan Lalopoulos",
    author_email="evan.lalopoulos.2017@my.bristol.ac.uk",
    description="A python package for the Linguistic Inquiry and Word Count (LIWC) dictionary.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/evanll/liwc-text-analysis-python",
    packages=setuptools.find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.6',
    install_requires=install_reqs
)
and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/evanll/liwc-text-analysis-python.svg?branch=master)](https://travis-ci.org/evanll/liwc-text-analysis-python) 2 | 3 | # LIWC Text Analysis - Python 4 | A python package for the Linguistic Inquiry and Word Count (LIWC) dictionary. This package requires the proprietary LIWC dictionary file (.dic), that can be obtained from [LIWC.net](http://liwc.net/). 5 | 6 | ## Usage 7 | ```python 8 | >>> from liwc import Liwc 9 | >>> liwc = Liwc(LIWC_FILEPATH) 10 | >>> # Search a word in the dictionary to find in which LIWC categories it belongs 11 | >>> print(liwc.search('happy')) 12 | ['affect', 'posemo'] 13 | >>> # Extract raw counts of words in a document that fall into the various LIWC categories 14 | >>> print(liwc.parse('I love ice cream.'.split(' '))) 15 | Counter({'verb': 1, 'present': 1, 'affect': 1, 'posemo': 1, 'bio': 1, 'sexual': 1, 'social': 1}) 16 | ``` 17 | ## Tests 18 | The project comes with an extensive set of unit tests. The Pytest framework is used for unit testing. 
19 | To run the tests use: 20 | `pytest` 21 | 22 | ## Project repository 23 | https://github.com/evanll/liwc-text-analysis-python 24 | 25 | ## Author 26 | Written by Evan Lalopoulos 27 | 28 | **Evan Lalopoulos** - [evanll](https://github.com/evanll) 29 | -------------------------------------------------------------------------------- /liwc/test_liwc.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Written by Evan Lalopoulos 4 | University of Bristol, May 2018 5 | Copyright (C) - All Rights Reserved 6 | """ 7 | 8 | from .liwc import Liwc 9 | 10 | WORD_CAT_DICT_1 = { 11 | "love": [ 12 | "affect", 13 | "posemo" 14 | ], 15 | "loved": [ 16 | "affect", 17 | "posemo" 18 | ] 19 | } 20 | 21 | WORD_CAT_DICT_2 = { 22 | "abandon*": [ 23 | "affect", 24 | "negemo" 25 | ], 26 | "absolute": [ 27 | "cogmech" 28 | ], 29 | "love": [ 30 | "affect", 31 | "posemo" 32 | ] 33 | } 34 | 35 | 36 | def test_build_trie(): 37 | expected = { 38 | "l": { 39 | "o": { 40 | "v": { 41 | "e": { 42 | "$": [ 43 | "affect", 44 | "posemo" 45 | ], 46 | "d": { 47 | "$": [ 48 | "affect", 49 | "posemo" 50 | ] 51 | } 52 | } 53 | } 54 | } 55 | } 56 | } 57 | assert Liwc._build_char_trie(WORD_CAT_DICT_1) == expected 58 | 59 | 60 | def test_build_trie_wildcard(): 61 | expected = { 62 | "a": { 63 | "b": { 64 | "a": { 65 | "n": { 66 | "d": { 67 | "o": { 68 | "n": { 69 | "*": [ 70 | "affect", 71 | "negemo" 72 | ], 73 | "$": [ 74 | "affect", 75 | "negemo" 76 | ] 77 | } 78 | } 79 | } 80 | } 81 | }, 82 | "s": { 83 | "o": { 84 | "l": { 85 | "u": { 86 | "t": { 87 | "e": { 88 | "$": [ 89 | "cogmech" 90 | ] 91 | } 92 | } 93 | } 94 | } 95 | } 96 | } 97 | } 98 | }, 99 | "l": { 100 | "o": { 101 | "v": { 102 | "e": { 103 | "$": [ 104 | "affect", 105 | "posemo" 106 | ] 107 | } 108 | } 109 | } 110 | } 111 | } 112 | 113 | assert Liwc._build_char_trie(WORD_CAT_DICT_2) == expected 114 | 115 | 116 | def test_search_trie(): 117 | trie = 
# -*- coding: utf-8 -*-
"""
Written by Evan Lalopoulos
University of Bristol, May 2018
Copyright (C) - All Rights Reserved
"""

import collections


class Liwc:
    """
    Class for the Linguistic Inquiry and Word Count (LIWC) dictionary.
    The dictionary files are proprietary and can be obtained from liwc.net

    After construction the instance exposes:

    * ``categories`` - dict mapping a category id (string, as written in
      the file) to its category name.
    * ``lexicon`` - dict mapping a word pattern (possibly containing a
      ``*`` wildcard) to the list of category names it belongs to.
    """

    def __init__(self, filepath, encoding=None):
        """
        :param filepath: path to the LIWC .dic file.
        :param encoding: text encoding of the file; ``None`` (default)
            keeps the platform default used by ``open``.
        """
        self.categories, self.lexicon = self._load_dict_file(
            filepath, encoding)
        self._trie = self._build_char_trie(self.lexicon)

    def search(self, word):
        """
        Search a word in the liwc dictionary.

        The lookup is an exact, case-sensitive character match, except
        that a lexicon pattern ending in ``*`` matches any word starting
        with the pattern's prefix.

        :param word: the word to look up.
        :return: a list of the liwc categories the word belongs to;
                 an empty list if the word is not found in the dictionary.
        """
        return self._search_trie(self._trie, word)

    def parse(self, tokens):
        """
        Parses a document and extracts raw counts of words that fall into
        the various LIWC categories.

        :param tokens: a list of tokens, a tokenised document
        :return: a counter with the linguistic categories found in the doc,
                 and the raw count of words that fall in each category.
        """
        cat_counter = collections.Counter()

        for token in tokens:
            # Counter.update adds one per category name returned for the
            # token; tokens that are not in the dictionary contribute [].
            cat_counter.update(self.search(token))

        return cat_counter

    def _load_dict_file(self, filepath, encoding=None):
        """
        Read a LIWC .dic file.

        Lines consisting of ``%`` delimit the sections: lines after the
        first ``%`` are ``id<TAB>category`` pairs, every other non-blank
        line is ``word<TAB>id<TAB>id...``.

        :param filepath: path to the LIWC .dic file.
        :param encoding: text encoding passed to ``open``.
        :return: a ``(categories, lexicon)`` tuple of dicts.
        :raises IndexError: if a category line has no tab-separated name.
        :raises KeyError: if a word references an undefined category id.
        """
        # Key, category dict
        categories = {}

        # Word, cat_name dict
        lexicon = {}

        # '%' signals a change in the .dic file.
        # (0-1) Cats, ids
        # (>1) Words, cat_ids
        percent_sign_count = 0

        # The context manager closes the handle even if a malformed line
        # raises part-way through the file.
        with open(filepath, encoding=encoding) as liwc_file:
            for line in liwc_file:
                stp = line.strip()
                if not stp:
                    continue

                parts = stp.split('\t')

                if parts[0] == '%':
                    percent_sign_count += 1
                elif percent_sign_count == 1:
                    # Inside the category section.
                    categories[parts[0]] = parts[1]
                else:
                    # Lexicon entry: translate ids into category names.
                    lexicon[parts[0]] = [categories[cat_id]
                                         for cat_id in parts[1:]]

        return categories, lexicon

    @staticmethod
    def _build_char_trie(lexicon):
        """
        Builds a char trie, to cater for wildcard ('*') matches.

        Each node is a dict keyed by the next character. Two reserved keys
        hold category lists instead of child nodes: ``'$'`` marks the end
        of a pattern and ``'*'`` marks a wildcard. A pattern is cut at its
        first ``'*'``; the node reached so far receives both keys.

        :param lexicon: dict mapping word patterns to category-name lists.
        :return: the root node (a dict) of the trie.
        """
        trie = {}
        for pattern, cat_names in lexicon.items():
            cursor = trie
            for char in pattern:
                if char == '*':
                    cursor['*'] = cat_names
                    break

                cursor = cursor.setdefault(char, {})

            # $ signifies end of token
            cursor['$'] = cat_names

        return trie

    @staticmethod
    def _search_trie(trie, token, i=0):
        """
        Search the given char trie for paths that match the token.

        A wildcard on the path wins as soon as it is reached, before any
        longer exact match is considered.

        :param trie: trie node to start from (normally the root).
        :param token: the word to look up.
        :param i: index in ``token`` of the next character to consume.
        :return: the category list stored for the match, or an empty list.
        """
        node = trie
        while True:
            if '*' in node:
                return node['*']
            if i >= len(token):
                break

            char = token[i]
            # '$' is the reserved end-of-pattern key and its value is a
            # category list, not a child node, so a literal '$' inside a
            # token must never be followed.
            if char == '$' or char not in node:
                return []

            node = node[char]
            i += 1

        if i == len(token) and '$' in node:
            return node['$']
        return []