├── .gitignore
├── data
│   └── data_char_testphrases.rda
├── .Rbuildignore
├── NAMESPACE
├── README.md
├── .travis.yml
├── NEWS.md
├── inst
│   └── extdata
│       └── testphrases.txt
├── R
│   ├── data.R
│   └── liwcalike.R
├── codecov.yml
├── man
│   ├── data_char_testphrases.Rd
│   └── liwcalike.Rd
├── appveyor.yml
├── tests
│   ├── testthat
│   │   └── test-liwcalike.R
│   └── data
│       └── LIWC2015_Results_Washington.csv
├── DESCRIPTION
├── CONDUCT.md
├── README_old.Rmd
└── README_old.md

/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | *.Rproj
6 | 
--------------------------------------------------------------------------------
/data/data_char_testphrases.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kbenoit/LIWCalike/HEAD/data/data_char_testphrases.rda
--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | ^README.Rmd$
4 | ^\.travis
5 | ^revdep
6 | ^appveyor\.yml$
7 | ^CONDUCT\.md$
8 | ^codecov\.yml$
9 | 
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 | 
3 | S3method(liwcalike,character)
4 | S3method(liwcalike,corpus)
5 | export(liwcalike)
6 | import(quanteda)
7 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # LIWCalike
2 | 
3 | ## NOTE: This package has been mothballed
4 | 
5 | The new package including an improved version of the `liwcalike()` function can be found at https://github.com/kbenoit/quanteda.dictionaries.
6 | 
7 | Please use that instead.
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: r
2 | cache: packages
3 | warnings_are_errors: true
4 | sudo: true
5 | dist: trusty
6 | latex: false
7 | r_packages:
8 |   - covr
9 | env:
10 |   global:
11 |     - R_CHECK_ARGS="--no-build-vignettes"
12 | after_success:
13 |   - Rscript -e 'library(covr);codecov()'
14 | 
15 | 
--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
1 | # LIWCalike
2 | 
3 | # Version 0.3.0
4 | 
5 | ## Major changes
6 | 
7 | * Updated for **quanteda** >= v0.9.9.
8 | 
9 | ## New features
10 | 
11 | * Now handles multi-word dictionary values.
12 | 
13 | ## Bug fixes
14 | 
15 | * Fixed problem with computation of total words (#7).
16 | 
--------------------------------------------------------------------------------
/inst/extdata/testphrases.txt:
--------------------------------------------------------------------------------
1 | Test sentence for LIWCalike. Second sentence.
2 | Each row is a document.
3 | Comma, period.
4 | The red-shirted lawyer gave her ex-boyfriend $300 out of pity :(.
5 | LOL :).
6 | (Parentheses) for $100.
7 | Say "what" again!!
8 | Why are we here?
9 | Other punctation: ^; %, &.
10 | Sentence one. Sentence two!
:-) 11 | -------------------------------------------------------------------------------- /R/data.R: -------------------------------------------------------------------------------- 1 | 2 | #' sample short documents for testing 3 | #' 4 | #' Some sample short documents in plain text format for testing 5 | #' with \code{\link{liwcalike}}. 6 | #' @examples 7 | #' liwcalike(data_char_testphrases) 8 | "data_char_testphrases" 9 | 10 | # save(testphrases, file = "data/testphrases.RData") 11 | # writeLines(testphrases, "inst/extdata/testphrases.txt") 12 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | codecov: 2 | notify: 3 | require_ci_to_pass: yes 4 | 5 | coverage: 6 | precision: 2 7 | round: down 8 | range: "70...100" 9 | 10 | status: 11 | project: yes 12 | patch: yes 13 | changes: no 14 | 15 | parsers: 16 | gcov: 17 | branch_detection: 18 | conditional: yes 19 | loop: yes 20 | method: no 21 | macro: no 22 | 23 | comment: 24 | layout: header, diff 25 | behavior: default 26 | require_changes: no 27 | 28 | ignore: 29 | - "data" 30 | - "demo" 31 | - "tests" 32 | - "vignettes" 33 | -------------------------------------------------------------------------------- /man/data_char_testphrases.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{data_char_testphrases} 5 | \alias{data_char_testphrases} 6 | \title{sample short documents for testing} 7 | \format{An object of class \code{character} of length 10.} 8 | \usage{ 9 | data_char_testphrases 10 | } 11 | \description{ 12 | Some sample short documents in plain text format for testing 13 | with \code{\link{liwcalike}}. 
14 | } 15 | \examples{ 16 | liwcalike(data_char_testphrases) 17 | } 18 | \keyword{datasets} 19 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | # DO NOT CHANGE the "init" and "install" sections below 2 | 3 | # Download script file from GitHub 4 | init: 5 | ps: | 6 | $ErrorActionPreference = "Stop" 7 | Invoke-WebRequest http://raw.github.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "..\appveyor-tool.ps1" 8 | Import-Module '..\appveyor-tool.ps1' 9 | 10 | install: 11 | ps: Bootstrap 12 | 13 | cache: 14 | - C:\RLibrary 15 | 16 | # Adapt as necessary starting from here 17 | 18 | build_script: 19 | - travis-tool.sh install_deps 20 | 21 | test_script: 22 | - travis-tool.sh run_tests 23 | 24 | on_failure: 25 | - 7z a failure.zip *.Rcheck\* 26 | - appveyor PushArtifact failure.zip 27 | 28 | artifacts: 29 | - path: '*.Rcheck\**\*.log' 30 | name: Logs 31 | 32 | - path: '*.Rcheck\**\*.out' 33 | name: Logs 34 | 35 | - path: '*.Rcheck\**\*.fail' 36 | name: Logs 37 | 38 | - path: '*.Rcheck\**\*.Rout' 39 | name: Logs 40 | 41 | - path: '\*_*.tar.gz' 42 | name: Bits 43 | 44 | - path: '\*_*.zip' 45 | name: Bits 46 | -------------------------------------------------------------------------------- /tests/testthat/test-liwcalike.R: -------------------------------------------------------------------------------- 1 | context('test liwcalike.R') 2 | 3 | test_that("test dictionary count etc.", { 4 | 5 | txt <- c("The red-shirted lawyer gave her ex-boyfriend $300 out of pity :(.", 6 | "The green-shirted lawyer gave her $300 out of pity :(.") 7 | myDict <- dictionary(list(people = c("lawyer", "boyfriend"), 8 | colorFixed = "red", 9 | colorGlob = "red*", 10 | mwe = "out of")) 11 | myCount <- liwcalike(txt, myDict, what = "word") 12 | 13 | toks <- tokens(txt[1], remove_hyphens = TRUE) 14 | num_words_txt1 <- ntoken(toks) 15 | 16 | # dictionary count 17 | num_people <- sum(toks$text1 == "lawyer") + sum(toks$text1 == "boyfriend") 18 | expect_equivalent(round(as.numeric(myCount$people[1]), 2), round(100*num_people/num_words_txt1, 2)) 19 | 20 | # period count 21 | num_Period <- stringi::stri_count_fixed(txt[1], ".") / num_words_txt1 * 100 22 | expect_equivalent(round(as.numeric(myCount$Period[1]), 2), round(num_Period, 2)) 23 | }) 24 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: LIWCalike 2 | Type: Package 3 | Title: Text analysis similar to the Linguistic Inquiry and Word Count (LIWC) 4 | Version: 0.3.2 5 | Date: 2018-01-14 6 | Author: Kenneth Benoit 7 | Maintainer: Kenneth Benoit 8 | Description: Built on the quanteda package for text analysis, LIWCalike 9 | provides a simple interface to the analysis of text by counting words and other 10 | textual features, including the application of a dictionary to produce a tabular 11 | report of percentages. This provides similar functionality to the LIWC stand- 12 | alone software. The user must a dictionary, which can include one of the custom 13 | LIWC dictionaries if these have been purchased from http://liwc.wpengine.com. 
14 | License: GPL-3 15 | LazyData: TRUE 16 | Depends: 17 | R (>= 3.2.2) 18 | Imports: 19 | stringi, 20 | quanteda (>= 0.99) 21 | Suggests: 22 | testthat, 23 | covr 24 | Remotes: 25 | quanteda/quanteda 26 | URL: http://github.com/kbenoit/LIWCalike 27 | Encoding: UTF-8 28 | BugReports: https://github.com/kbenoit/LIWCalike/issues 29 | RoxygenNote: 6.0.1 30 | -------------------------------------------------------------------------------- /CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Code of Conduct 2 | 3 | As contributors and maintainers of this project, we pledge to respect all people who 4 | contribute through reporting issues, posting feature requests, updating documentation, 5 | submitting pull requests or patches, and other activities. 6 | 7 | We are committed to making participation in this project a harassment-free experience for 8 | everyone, regardless of level of experience, gender, gender identity and expression, 9 | sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion. 10 | 11 | Examples of unacceptable behavior by participants include the use of sexual language or 12 | imagery, derogatory comments or personal attacks, trolling, public or private harassment, 13 | insults, or other unprofessional conduct. 14 | 15 | Project maintainers have the right and responsibility to remove, edit, or reject comments, 16 | commits, code, wiki edits, issues, and other contributions that are not aligned to this 17 | Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed 18 | from the project team. 19 | 20 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by 21 | opening an issue or contacting one or more of the project maintainers. 
22 | 23 | This Code of Conduct is adapted from the Contributor Covenant 24 | (http:contributor-covenant.org), version 1.0.0, available at 25 | http://contributor-covenant.org/version/1/0/0/ 26 | -------------------------------------------------------------------------------- /tests/data/LIWC2015_Results_Washington.csv: -------------------------------------------------------------------------------- 1 | Filename,Segment,WC,Analytic,Clout,Authentic,Tone,WPS,Sixltr,Dic,function,pronoun,ppron,i,we,you,shehe,they,ipron,article,prep,auxverb,adverb,conj,negate,verb,adj,compare,interrog,number,quant,affect,posemo,negemo,anx,anger,sad,social,family,friend,female,male,cogproc,insight,cause,discrep,tentat,certain,differ,percept,see,hear,feel,bio,body,health,sexual,ingest,drives,affiliation,achieve,power,reward,risk,focuspast,focuspresent,focusfuture,relativ,motion,space,time,work,leisure,home,money,relig,death,informal,swear,netspeak,assent,nonflu,filler,AllPunc,Period,Comma,Colon,SemiC,QMark,Exclam,Dash,Quote,Apostro,Parenth,OtherP 2 | 1789-Washington.txt,1,1431,91.76,52.52,24.77,96.34,62.22,26.21,84.63,56.39,12.93,6.43,3.98,0.21,0.98,0.56,0.70,6.50,9.78,18.80,7.69,2.03,5.66,1.33,9.29,4.89,3.21,3.07,0.56,2.73,6.78,5.73,0.84,0.28,0.21,0.14,6.85,0.07,0.28,0.07,0.77,10.34,2.17,1.89,1.19,2.10,2.24,2.10,1.26,0.42,0.28,0.42,0.84,0.35,0.28,0.07,0.07,7.97,1.19,1.54,4.05,1.75,0.77,1.75,6.22,2.10,10.41,0.63,6.22,3.56,3.28,0.21,0.14,0.35,0.70,0.07,0.14,0.00,0.00,0.00,0.14,0.00,7.62,1.61,4.89,0.07,0.56,0.00,0.00,0.21,0.14,0.00,0.14,0.00 3 | 1793-Washington.txt,1,135,92.84,38.33,48.27,25.77,33.75,27.41,83.70,57.78,13.33,7.41,6.67,0.00,0.74,0.00,0.00,5.93,10.37,19.26,8.89,3.70,3.70,0.00,11.85,0.74,0.00,2.22,0.00,2.22,4.44,2.22,2.22,0.00,1.48,0.74,5.19,0.00,0.74,0.00,0.74,8.15,1.48,0.00,1.48,3.70,1.48,2.22,1.48,0.00,0.74,0.00,0.74,0.00,0.00,0.00,0.74,8.15,0.00,2.22,6.67,1.48,0.00,2.96,8.89,2.96,11.85,0.74,4.44,6.67,3.70,1.48,0.00,0.00,0.00,0.74,0.00,0.00,0.00,0.00,0.00,0.00,8.89,2.96,3.70,0.74,0.00,0.00,0.00,0.00,0.00,0.00,1.48,0.00 -------------------------------------------------------------------------------- /man/liwcalike.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/liwcalike.R 3 | \name{liwcalike} 4 | \alias{liwcalike} 5 | \alias{liwcalike.corpus} 6 | \alias{liwcalike.character} 7 | \title{analyze text in a LIWC-alike fashion} 8 | \usage{ 9 | liwcalike(x, ...) 10 | 11 | \method{liwcalike}{corpus}(x, ...) 12 | 13 | \method{liwcalike}{character}(x, dictionary = NULL, tolower = TRUE, 14 | verbose = TRUE, ...) 
15 | } 16 | \arguments{ 17 | \item{x}{input object, a \pkg{quanteda} \link[quanteda]{corpus} or character 18 | vector for analysis} 19 | 20 | \item{...}{options passed to \code{\link[quanteda]{tokens}} offering 21 | finer-grained control over how "words" are defined} 22 | 23 | \item{dictionary}{a \pkg{quanteda} \link[quanteda]{dictionary} object 24 | supplied for analysis} 25 | 26 | \item{tolower}{convert to common (lowser) case before tokenizing} 27 | 28 | \item{verbose}{if \code{TRUE} print status messages during processing} 29 | } 30 | \value{ 31 | a data.frame object containing the analytic results, one row per 32 | document supplied 33 | } 34 | \description{ 35 | Analyze a set of texts to produce a dataset of percentages and other 36 | quantities describing the text, similar to the functionality supplied by the 37 | Linguistic Inquiry and Word Count standalone software distributed at 38 | \url{http://liwc.wpengine.com}. 39 | } 40 | \section{Segmentation}{ 41 | The LIWC standalone software has many options for 42 | segmenting the text. While this function does not supply segmentation 43 | options, you can easily achieve the same effect by converting the input 44 | object into a corpus (if it is not already a corpus) and using 45 | \link[quanteda]{tokens} to split the input 46 | texts into smaller units based on user-supplied tags, sentence, or 47 | paragraph boundaries. 48 | } 49 | 50 | \examples{ 51 | liwcalike(data_char_testphrases) 52 | 53 | # examples for comparison 54 | txt <- c("The red-shirted lawyer gave her yellow-haired, red nose ex-boyfriend $300 55 | out of pity:(.") 56 | myDict <- quanteda::dictionary(list(people = c("lawyer", "boyfriend"), 57 | colorFixed = "red", 58 | colorGlob = c("red*", "yellow*", "green*"), 59 | mwe = "out of")) 60 | liwcalike(txt, myDict, what = "word") 61 | liwcalike(txt, myDict, what = "fasterword") 62 | (toks <- quanteda::tokens(txt, what = "fasterword", removeHyphens = TRUE)) 63 | length(toks[[1]]) 64 | # LIWC says 12 words 65 | 66 | \dontrun{# works with LIWC 2015 dictionary too 67 | liwc2015dict <- dictionary(file = "~/Dropbox/QUANTESS/dictionaries/LIWC/LIWC2015_English_Flat.dic", 68 | format = "LIWC") 69 | inaugLIWCanalysis <- liwcalike(data_corpus_inaugural, liwc2015dict) 70 | inaugLIWCanalysis[1:6, 1:10] 71 | ## docname Segment WC WPS Sixltr Dic function article relativ motion 72 | ## 1 1789-Washington 1 1540 62.21739 24.35 253.1 52.403 9.0909 101.361 0.3483 73 | ## 2 1793-Washington 2 147 33.75000 25.17 250.3 5.065 0.9091 10.884 0.0387 74 | ## 3 1797-Adams 3 2584 62.72973 24.61 237.5 82.403 15.0649 163.946 0.3096 75 | ## 4 1801-Jefferson 4 1935 42.19512 20.36 253.2 62.143 10.0000 105.442 0.7353 76 | ## 5 1805-Jefferson 5 2381 48.13333 22.97 255.8 79.221 10.9091 151.701 0.6966 77 | ## 6 1809-Madison 6 1267 56.04762 24.78 258.2 42.987 8.3117 83.673 0.3870 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /README_old.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | md_document: 4 | variant: markdown_github 5 | --- 6 | 7 | ```{r, echo = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "README-" 12 | ) 13 | ``` 14 | 15 | [![CRAN Version](http://www.r-pkg.org/badges/version/LIWCalike)](http://cran.r-project.org/package=LIWCalike) 16 | ![Downloads](http://cranlogs.r-pkg.org/badges/LIWCalike) 17 | [![Travis-CI Build 
Status](https://travis-ci.org/kbenoit/LIWCalike.svg?branch=master)](https://travis-ci.org/kbenoit/LIWCalike)
18 | [![Appveyor Build status](https://ci.appveyor.com/api/projects/status/kn31ca24tnnrbwth/branch/master?svg=true)](https://ci.appveyor.com/project/kbenoit/liwcalike/branch/master)
19 | [![codecov.io](https://codecov.io/github/kbenoit/LIWCalike/LIWCalike.svg?branch=master)](https://codecov.io/github/kbenoit/LIWCalike/coverage.svg?branch=master)
20 | 
21 | ## LIWCalike: an R implementation of the Linguistic Inquiry and Word Count
22 | 
23 | Built on the quanteda package for text analysis, LIWCalike provides a simple interface to the analysis of text by counting words and other textual features, including the application of a dictionary to produce a tabular report of percentages. This provides similar functionality to the LIWC stand-alone software.
24 | 
25 | The user must supply a dictionary, which can include one of the custom LIWC dictionaries if these have been purchased from http://liwc.wpengine.com, or any other dictionary supplied by the user. The `dictionary()` constructor of the **quanteda** package, on which **LIWCalike** is built, can read both LIWC and Wordstat-formatted dictionary files, or you can use it to create a dictionary from an R list object (a named list of character vectors, where each character vector is a set of dictionary match patterns and its associated name is the dictionary key).
26 | 
27 | ### Differences from the LIWC standalone software
28 | 
29 | This package is designed for R users and those wishing to build functionality by extending the [**quanteda**](https://github.com/kbenoit/quanteda) package for text analysis. If you prefer to have a complete, stand-alone user interface, then you should purchase and use the [LIWC standalone software](http://liwc.wpengine.com). The standalone software has several advantages:
30 | 
31 | * LIWC allows direct importing of files, including binary (Word, pdf, etc.) formats. To use
32 | **LIWCalike**, you will need to import these into the **quanteda** package first.
33 | **LIWCalike** also works fine with simple character vectors, if you prefer to use
34 | standard R methods to create your input object (e.g. `readLines()`, `read.csv()`, etc.)
35 | 
36 | * LIWC provides direct outputs in the form of csv, Excel files, etc. By contrast, **LIWCalike** returns a `data.frame`, which you have to export yourself (e.g. using `write.csv()`).
37 | 
38 | * LIWC provides easy segmentation, through a GUI. By contrast, with **LIWCalike** you will
39 | have to segment the texts yourself. (**quanteda** provides easy ways to do this using
40 | `corpus_segment()` and `corpus_reshape()`.)
41 | 
42 | * LIWC color codes the dictionary value matches in your texts and displays these in a nice graphical window.
43 | 
44 | * LIWC provides four composite measures that are not included in **LIWCalike**: "Analytic", "Clout", "Authentic", and "Tone". These are based on proprietary algorithms, as described and referenced in [Pennebaker, J.W., Boyd, R.L., Jordan, K., & Blackburn, K. (2015). The development and psychometric properties of LIWC2015. Austin, TX: University of Texas at Austin. DOI: 10.15781/T29G6Z](http://liwc.wpengine.com/wp-content/uploads/2015/11/LIWC2015_LanguageManual.pdf).
45 | 
46 | 
47 | ## Using dictionaries with LIWCalike
48 | 
49 | No dictionaries are supplied with **LIWCalike**; it is up to you to supply one. With the **quanteda** functions for creating or importing dictionaries, however, this is quite easy.
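For example, a small dictionary can be built directly from a named list and passed to `liwcalike()`. This is only an illustrative sketch: the `mydict` object and its keys are invented for the example and are not shipped with the package.

```
library("LIWCalike")
library("quanteda")

# each list element becomes a dictionary key; the character vectors are the
# match patterns, and glob wildcards such as "red*" are allowed
mydict <- dictionary(list(people = c("lawyer", "boyfriend", "friend*"),
                          colors = c("red", "green*", "yellow*")))

# one row per input text, one column per dictionary key (as percentages)
liwcalike(data_char_testphrases, dictionary = mydict)
```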
50 | 
51 | With LIWC 2007, external dictionaries were distributed with the software that could be used in the format read by Provalis Research's [*Wordstat*](http://provalisresearch.com/products/content-analysis-software/). Because I purchased a license for this product, I have that file and can use it with **LIWCalike**.
52 | 
53 | Using it is quite straightforward:
54 | 
55 | ```{r}
56 | library("LIWCalike")
57 | library("quanteda")
58 | 
59 | # read in the dictionary
60 | liwc2007dict <- dictionary(file = "~/Dropbox/QUANTESS/dictionaries/LIWC/LIWC2007.cat",
61 |                            format = "wordstat")
62 | tail(liwc2007dict, 1)
63 | 
64 | # our test data
65 | data_char_testphrases
66 | 
67 | # call LIWCalike
68 | output <- liwcalike(data_char_testphrases, liwc2007dict)
69 | 
70 | # view some results
71 | output[, c(1:7, ncol(output)-2)]
72 | ```
73 | 
74 | 
75 | ## How to Install
76 | 
77 | **LIWCalike** is currently only available on GitHub, not on CRAN. The best method of installing it is through the **devtools** package:
78 | 
79 | ```
80 | devtools::install_github("kbenoit/LIWCalike")
81 | ```
82 | 
83 | This will also automatically install the **quanteda** package on which **LIWCalike** is built.
84 | 
85 | 
86 | ## Comments, feedback, and code of conduct
87 | 
88 | I welcome your comments and feedback. Please file issues on the issues page, and/or send me comments at kbenoit@lse.ac.uk.
89 | 
90 | Please note that this project is released with a [Contributor Code of Conduct](CONDUCT.md). By participating in this project you agree to abide by its terms.
91 | 
92 | 
--------------------------------------------------------------------------------
/R/liwcalike.R:
--------------------------------------------------------------------------------
1 | #' analyze text in a LIWC-alike fashion
2 | #'
3 | #' Analyze a set of texts to produce a dataset of percentages and other
4 | #' quantities describing the text, similar to the functionality supplied by the
5 | #' Linguistic Inquiry and Word Count standalone software distributed at
6 | #' \url{http://liwc.wpengine.com}.
7 | #' @param x input object, a \pkg{quanteda} \link[quanteda]{corpus} or character
8 | #' vector for analysis
9 | #' @param dictionary a \pkg{quanteda} \link[quanteda]{dictionary} object
10 | #' supplied for analysis
11 | #' @param tolower convert to common (lower) case before tokenizing
12 | #' @param verbose if \code{TRUE} print status messages during processing
13 | #' @param ... options passed to \code{\link[quanteda]{tokens}} offering
14 | #' finer-grained control over how "words" are defined
15 | #' @return a data.frame object containing the analytic results, one row per
16 | #' document supplied
17 | #' @section Segmentation: The LIWC standalone software has many options for
18 | #' segmenting the text. While this function does not supply segmentation
19 | #' options, you can easily achieve the same effect by converting the input
20 | #' object into a corpus (if it is not already a corpus) and using
21 | #' \link[quanteda]{corpus_segment} or \link[quanteda]{corpus_reshape} to split the input
22 | #' texts into smaller units based on user-supplied tags, sentence, or
23 | #' paragraph boundaries.
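#'
#' For instance, a minimal sketch of sentence-level segmentation before analysis
#' (the corpus \code{crp} and dictionary \code{mydict} are placeholders, not
#' objects supplied by this package):
#' \preformatted{
#' crp_sent <- quanteda::corpus_reshape(crp, to = "sentences")
#' liwcalike(crp_sent, dictionary = mydict)
#' }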
24 | #' @examples 25 | #' liwcalike(data_char_testphrases) 26 | #' 27 | #' # examples for comparison 28 | #' txt <- c("The red-shirted lawyer gave her yellow-haired, red nose ex-boyfriend $300 29 | #' out of pity:(.") 30 | #' myDict <- quanteda::dictionary(list(people = c("lawyer", "boyfriend"), 31 | #' colorFixed = "red", 32 | #' colorGlob = c("red*", "yellow*", "green*"), 33 | #' mwe = "out of")) 34 | #' liwcalike(txt, myDict, what = "word") 35 | #' liwcalike(txt, myDict, what = "fasterword") 36 | #' (toks <- quanteda::tokens(txt, what = "fasterword", removeHyphens = TRUE)) 37 | #' length(toks[[1]]) 38 | #' # LIWC says 12 words 39 | #' 40 | #' \dontrun{# works with LIWC 2015 dictionary too 41 | #' liwc2015dict <- dictionary(file = "~/Dropbox/QUANTESS/dictionaries/LIWC/LIWC2015_English_Flat.dic", 42 | #' format = "LIWC") 43 | #' inaugLIWCanalysis <- liwcalike(data_corpus_inaugural, liwc2015dict) 44 | #' inaugLIWCanalysis[1:6, 1:10] 45 | #' ## docname Segment WC WPS Sixltr Dic function article relativ motion 46 | #' ## 1 1789-Washington 1 1540 62.21739 24.35 253.1 52.403 9.0909 101.361 0.3483 47 | #' ## 2 1793-Washington 2 147 33.75000 25.17 250.3 5.065 0.9091 10.884 0.0387 48 | #' ## 3 1797-Adams 3 2584 62.72973 24.61 237.5 82.403 15.0649 163.946 0.3096 49 | #' ## 4 1801-Jefferson 4 1935 42.19512 20.36 253.2 62.143 10.0000 105.442 0.7353 50 | #' ## 5 1805-Jefferson 5 2381 48.13333 22.97 255.8 79.221 10.9091 151.701 0.6966 51 | #' ## 6 1809-Madison 6 1267 56.04762 24.78 258.2 42.987 8.3117 83.673 0.3870 52 | #' } 53 | #' @export 54 | #' @import quanteda 55 | liwcalike <- function(x, ...) { 56 | UseMethod("liwcalike") 57 | } 58 | 59 | 60 | #' @rdname liwcalike 61 | #' @export 62 | liwcalike.corpus <- function(x, ...) { 63 | liwcalike(texts(x), ...) 64 | } 65 | 66 | #' @rdname liwcalike 67 | #' @export 68 | liwcalike.character <- function(x, dictionary = NULL, tolower = TRUE, verbose = TRUE, ...) { 69 | 70 | ## initialize results data.frame 71 | ## similar to "Filename" and Segment 72 | result <- 73 | data.frame(docname = if (is.null(names(x))) paste0("text", 1:length(x)) else names(x), 74 | Segment = 1:length(x), row.names = NULL, stringsAsFactors = FALSE) 75 | 76 | ## get readability before lowercasing 77 | WPS <- quanteda::textstat_readability(x, "meanSentenceLength") #, ...) 78 | 79 | 80 | # ## if a dictionary is supplied, apply it to the dfm 81 | # # first pre-process the text for multi-word dictionary values 82 | # if (!is.null(dictionary)) { 83 | # x <- tokens_compound(x, dictionary, case_insensitive = tolower) 84 | # if (dictionary@concatenator != "_") 85 | # dictionary <- lapply(dictionary, stringi::stri_replace_all_fixed, dictionary@concatenator, "_") 86 | # } 87 | 88 | ## tokenize and form the dfm 89 | toks <- quanteda::tokens(x, remove_hyphens = TRUE) 90 | 91 | ## lower case the texts if required 92 | if (tolower) 93 | toks <- quanteda::tokens_tolower(toks) 94 | 95 | ## form the dfm 96 | dfmDict <- quanteda::dfm(toks, dictionary = dictionary, verbose = FALSE) 97 | 98 | ## WC 99 | result[["WC"]] <- quanteda::ntoken(toks) 100 | # maybe this should be ntoken(dfmAll) - does LIWC count punctuation?? 
101 | 
102 |     ## no implementation for: Analytic Clout Authentic Tone
103 | 
104 |     ## WPS (mean words per sentence); textstat_readability() may return a data.frame
105 |     result[["WPS"]] <- if (is.data.frame(WPS)) WPS[["meanSentenceLength"]] else as.numeric(WPS)
106 | 
107 |     ## Sixltr
108 |     result[["Sixltr"]] <- sapply(toks, function(y) sum(stringi::stri_length(y) > 6)) / result[["WC"]] * 100
109 | 
110 |     ## Dic (percentage of words in the dictionary)
111 |     comp_toks <- tokens_compound(toks, dictionary)
112 |     comp_match <- tokens_select(comp_toks, dictionary)
113 |     result[["Dic"]] <- if (!is.null(dictionary)) ntoken(comp_match) / ntoken(comp_toks) * 100 else NA
114 | 
115 |     ## add the dictionary counts, transformed to percentages of total words
116 |     if (!is.null(dictionary))
117 |         result <- cbind(result,
118 |                         as.data.frame(as.matrix(dfmDict) / matrix(rep(result[["WC"]], each = nfeat(dfmDict)), ncol = nfeat(dfmDict), byrow = TRUE),
119 |                                       row.names = 1:nrow(result)) * 100)
120 | 
121 |     ## punctuation counts
122 |     # AllPunc
123 |     result[["AllPunc"]] <- stringi::stri_count_charclass(x, "\\p{P}") / result[["WC"]] * 100
124 | 
125 |     # Period
126 |     result[["Period"]] <- stringi::stri_count_fixed(x, ".") / result[["WC"]] * 100
127 | 
128 |     # Comma
129 |     result[["Comma"]] <- stringi::stri_count_fixed(x, ",") / result[["WC"]] * 100
130 | 
131 |     # Colon
132 |     result[["Colon"]] <- stringi::stri_count_fixed(x, ":") / result[["WC"]] * 100
133 | 
134 |     # SemiC
135 |     result[["SemiC"]] <- stringi::stri_count_fixed(x, ";") / result[["WC"]] * 100
136 | 
137 |     # QMark
138 |     result[["QMark"]] <- stringi::stri_count_fixed(x, "?") / result[["WC"]] * 100
139 | 
140 |     # Exclam
141 |     result[["Exclam"]] <- stringi::stri_count_fixed(x, "!") / result[["WC"]] * 100
142 | 
143 |     # Dash
144 |     result[["Dash"]] <- stringi::stri_count_charclass(x, "\\p{Pd}") / result[["WC"]] * 100
145 | 
146 |     # Quote
147 |     result[["Quote"]] <- stringi::stri_count_charclass(x, "[:QUOTATION_MARK:]") / result[["WC"]] * 100
148 | 
149 |     # Apostro
150 |     result[["Apostro"]] <- stringi::stri_count_charclass(x, "['\\u2019]") / result[["WC"]] * 100
151 | 
152 |     # Parenth -- note this is specified as "pairs of parentheses", counted per document
153 |     result[["Parenth"]] <- pmin(stringi::stri_count_fixed(x, "("),
154 |                                 stringi::stri_count_fixed(x, ")")) / result[["WC"]] * 100
155 | 
156 |     # OtherP
157 |     result[["OtherP"]] <- stringi::stri_count_charclass(x, "\\p{Po}") / result[["WC"]] * 100
158 | 
159 |     # format the result
160 |     result[, which(names(result)=="Sixltr") : ncol(result)] <-
161 |         format(result[, which(names(result)=="Sixltr") : ncol(result)],
162 |                digits = 4, trim = TRUE)
163 | 
164 |     result
165 | }
166 | 
167 | 
168 | 
--------------------------------------------------------------------------------
/README_old.md:
--------------------------------------------------------------------------------
1 | [![CRAN
2 | Version](http://www.r-pkg.org/badges/version/LIWCalike)](http://cran.r-project.org/package=LIWCalike)
3 | ![Downloads](http://cranlogs.r-pkg.org/badges/LIWCalike) [![Travis-CI
4 | Build
5 | Status](https://travis-ci.org/kbenoit/LIWCalike.svg?branch=master)](https://travis-ci.org/kbenoit/LIWCalike)
6 | [![Appveyor Build
7 | status](https://ci.appveyor.com/api/projects/status/kn31ca24tnnrbwth/branch/master?svg=true)](https://ci.appveyor.com/project/kbenoit/liwcalike/branch/master)
8 | [![codecov.io](https://codecov.io/github/kbenoit/LIWCalike/LIWCalike.svg?branch=master)](https://codecov.io/github/kbenoit/LIWCalike/coverage.svg?branch=master)
9 | 
10 | LIWCalike: an R implementation of the Linguistic Inquiry and Word Count
-----------------------------------------------------------------------
12 | 
13 | Built on the quanteda package for text analysis, LIWCalike provides a
14 | simple interface to the analysis of text by counting words and other
15 | textual features, including the application of a dictionary to produce a
16 | tabular report of percentages. This provides similar functionality to
17 | the LIWC stand-alone software.
18 | 
19 | The user must supply a dictionary, which can include one of the custom
20 | LIWC dictionaries if these have been purchased from
21 | <http://liwc.wpengine.com>, or any other dictionary supplied by the
22 | user. The `dictionary()` constructor of the **quanteda** package, on
23 | which **LIWCalike** is built, can read both LIWC and Wordstat-formatted
24 | dictionary files, or you can use it to create a dictionary from an R
25 | list object (a named list of character vectors, where each character
26 | vector is a set of dictionary match patterns and its associated name is
27 | the dictionary key).
28 | 
29 | ### Differences from the LIWC standalone software
30 | 
31 | This package is designed for R users and those wishing to build
32 | functionality by extending the
33 | [**quanteda**](https://github.com/kbenoit/quanteda) package for text
34 | analysis. If you prefer to have a complete, stand-alone user interface,
35 | then you should purchase and use the [LIWC standalone
36 | software](http://liwc.wpengine.com). The standalone software has several advantages:
37 | 
38 | -   LIWC allows direct importing of files, including binary (Word, pdf,
39 |     etc.) formats. To use **LIWCalike**, you will need to import these
40 |     into the **quanteda** package first.
41 |     **LIWCalike** also works fine with simple character vectors, if you
42 |     prefer to use standard R methods to create your input object (e.g.
43 |     `readLines()`, `read.csv()`, etc.)
44 | 
45 | -   LIWC provides direct outputs in the form of csv, Excel files, etc.
46 |     By contrast, **LIWCalike** returns a `data.frame`, which you have to
47 |     export yourself (e.g. using `write.csv()`).
48 | 
49 | -   LIWC provides easy segmentation, through a GUI. By contrast, with
50 |     **LIWCalike** you will have to segment the texts yourself.
51 |     (**quanteda** provides easy ways to do this using `corpus_segment()` and
52 |     `corpus_reshape()`.)
53 | 
54 | -   LIWC color codes the dictionary value matches in your texts and
55 |     displays these in a nice graphical window.
56 | 
57 | -   LIWC provides four composite measures that are not included in
58 |     **LIWCalike**: “Analytic”, “Clout”, “Authentic”, and “Tone”. These
59 |     are based on proprietary algorithms, as described and referenced in
60 |     [Pennebaker, J.W., Boyd, R.L., Jordan, K., & Blackburn, K. (2015).
61 |     The development and psychometric properties of LIWC2015. Austin, TX:
62 |     University of Texas at Austin. DOI:
63 |     10.15781/T29G6Z](http://liwc.wpengine.com/wp-content/uploads/2015/11/LIWC2015_LanguageManual.pdf).
64 | 
65 | Using dictionaries with LIWCalike
66 | ---------------------------------
67 | 
68 | No dictionaries are supplied with **LIWCalike**; it is up to you to
69 | supply one. With the **quanteda** functions for creating or importing
70 | dictionaries, however, this is quite easy.
71 | 
72 | With LIWC 2007, external dictionaries were distributed with the
73 | software that could be used in the format read by Provalis Research’s
74 | [*Wordstat*](http://provalisresearch.com/products/content-analysis-software/).
75 | Because I purchased a license for this product, I have that file and can
76 | use it with **LIWCalike**.
77 | 78 | Using it is quite straightforward: 79 | 80 | ``` r 81 | library("LIWCalike") 82 | library("quanteda") 83 | #> Package version: 1.2.1 84 | #> Parallel computing: 4 of 8 threads used. 85 | #> See https://quanteda.io for tutorials and examples. 86 | #> 87 | #> Attaching package: 'quanteda' 88 | #> The following object is masked from 'package:utils': 89 | #> 90 | #> View 91 | 92 | # read in the dictionary 93 | liwc2007dict <- dictionary(file = "~/Dropbox/QUANTESS/dictionaries/LIWC/LIWC2007.cat", 94 | format = "wordstat") 95 | tail(liwc2007dict, 1) 96 | #> Dictionary object with 1 primary key entry and 2 nested levels. 97 | #> - [SPOKEN CATEGORIES]: 98 | #> - [ASSENT]: 99 | #> - absolutely, agree, ah, alright*, aok, aw, awesome, cool, duh, ha, hah, haha*, heh*, hm*, huh, lol, mm*, oh, ok, okay, okey*, rofl, uhhu*, uhuh, yah, yay, yea, yeah, yep*, yes, yup 100 | #> - [NON-FLUENCIES]: 101 | #> - er, hm*, sigh, uh, um, umm*, well, zz* 102 | #> - [FILLERS]: 103 | #> - blah, idon'tknow, idontknow, imean, ohwell, oranything*, orsomething*, orwhatever*, rr*, yakn*, ykn*, youknow* 104 | 105 | # our test data 106 | data_char_testphrases 107 | #> [1] "Test sentence for LIWCalike. Second sentence." 108 | #> [2] "Each row is a document." 109 | #> [3] "Comma, period." 110 | #> [4] "The red-shirted lawyer gave her ex-boyfriend $300 out of pity :(." 111 | #> [5] "LOL :)." 112 | #> [6] "(Parentheses) for $100." 113 | #> [7] "Say \"what\" again!!" 114 | #> [8] "Why are we here?" 115 | #> [9] "Other punctation: ^; %, &." 116 | #> [10] "Sentence one. Sentence two! :-)" 117 | 118 | # call LIWCalike 119 | output <- liwcalike(data_char_testphrases, liwc2007dict) 120 | #> Warning: 'nfeature' is deprecated. 121 | #> Use 'nfeat' instead. 122 | #> See help("Deprecated") 123 | 124 | #> Warning: 'nfeature' is deprecated. 125 | #> Use 'nfeat' instead. 126 | #> See help("Deprecated") 127 | 128 | # view some results 129 | output[, c(1:7, ncol(output)-2)] 130 | #> docname Segment WC WPS.document WPS.meanSentenceLength Sixltr Dic 131 | #> 1 text1 1 8 text1 3 37.50 37.50 132 | #> 2 text2 2 6 text2 5 16.67 50.00 133 | #> 3 text3 3 4 text3 2 0.00 25.00 134 | #> 4 text4 4 18 text4 12 11.11 61.11 135 | #> 5 text5 5 4 text5 1 0.00 25.00 136 | #> 6 text6 6 7 text6 3 14.29 28.57 137 | #> 7 text7 7 7 text7 3 0.00 42.86 138 | #> 8 text8 8 5 text8 4 0.00 80.00 139 | #> 9 text9 9 9 text9 2 11.11 11.11 140 | #> 10 text10 10 9 text10 2 22.22 22.22 141 | #> LINGUISTIC PROCESSES.FUNCTION WORDS Apostro 142 | #> 1 25.00 0 143 | #> 2 50.00 0 144 | #> 3 0.00 0 145 | #> 4 22.22 0 146 | #> 5 0.00 0 147 | #> 6 14.29 0 148 | #> 7 28.57 0 149 | #> 8 60.00 0 150 | #> 9 11.11 0 151 | #> 10 22.22 0 152 | ``` 153 | 154 | How to Install 155 | -------------- 156 | 157 | **LIWCalike** is currently only available on GitHub, not on CRAN. The 158 | best method of installing it is through the **devtools** package: 159 | 160 | devtools::install_github("kbenoit/LIWCalike") 161 | 162 | This will also automatically install the **quanteda** package on which 163 | **LIWCalike** is built. 164 | 165 | Comments, feedback, and code of conduct 166 | --------------------------------------- 167 | 168 | I welcome your comments and feedback. Please file issues on the issues 169 | page, and/or send me comments at . 170 | 171 | Please note that this project is released with a [Contributor Code of 172 | Conduct](CONDUCT.md). By participating in this project you agree to 173 | abide by its terms. 
174 | --------------------------------------------------------------------------------