├── simple.docx ├── data ├── data.rds ├── data.RData ├── ListeningTourResponses.csv └── ListeningTourResponses.xlsx ├── .gitignore ├── data-raw ├── ITDEI.docx ├── NW, Alaska, CA.docx ├── Operations II.docx ├── ListeningTour_CCI.docx ├── SE DEIJ Listening Tour.docx ├── SouthWestListeningTour.docx ├── ListeningTour_Marketing.docx ├── DEIJ LISTENING TOUR_comms.docx ├── LandscapeDept_ListeningTour.docx ├── Listening Tour - Development.pdf ├── Listening Tour - Marketing.pdf ├── ListeningTour_Development.docx ├── DEIJ LISTENING TOUR_cons law.docx ├── 1-3-2020 DEIJ Listening Session - Governmnet Relations.docx └── 12-17-19 DEIJ Listening Session - Rockies and Plains CCattelino.docx ├── DEIJ_NLP.Rproj ├── R ├── cleaning.R ├── functions.R └── analysis.R ├── simple.Rmd └── index.Rmd /simple.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjevans26/DEIJ_NLP/master/simple.docx -------------------------------------------------------------------------------- /data/data.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjevans26/DEIJ_NLP/master/data/data.rds -------------------------------------------------------------------------------- /data/data.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjevans26/DEIJ_NLP/master/data/data.RData -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | 6 | /data* 7 | /data-raw* 8 | -------------------------------------------------------------------------------- /data-raw/ITDEI.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjevans26/DEIJ_NLP/master/data-raw/ITDEI.docx -------------------------------------------------------------------------------- /data-raw/NW, Alaska, CA.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjevans26/DEIJ_NLP/master/data-raw/NW, Alaska, CA.docx -------------------------------------------------------------------------------- /data-raw/Operations II.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjevans26/DEIJ_NLP/master/data-raw/Operations II.docx -------------------------------------------------------------------------------- /data-raw/ListeningTour_CCI.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjevans26/DEIJ_NLP/master/data-raw/ListeningTour_CCI.docx -------------------------------------------------------------------------------- /data/ListeningTourResponses.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjevans26/DEIJ_NLP/master/data/ListeningTourResponses.csv -------------------------------------------------------------------------------- /data/ListeningTourResponses.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjevans26/DEIJ_NLP/master/data/ListeningTourResponses.xlsx -------------------------------------------------------------------------------- /data-raw/SE DEIJ Listening Tour.docx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjevans26/DEIJ_NLP/master/data-raw/SE DEIJ Listening Tour.docx -------------------------------------------------------------------------------- /data-raw/SouthWestListeningTour.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjevans26/DEIJ_NLP/master/data-raw/SouthWestListeningTour.docx -------------------------------------------------------------------------------- /data-raw/ListeningTour_Marketing.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjevans26/DEIJ_NLP/master/data-raw/ListeningTour_Marketing.docx -------------------------------------------------------------------------------- /data-raw/DEIJ LISTENING TOUR_comms.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjevans26/DEIJ_NLP/master/data-raw/DEIJ LISTENING TOUR_comms.docx -------------------------------------------------------------------------------- /data-raw/LandscapeDept_ListeningTour.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjevans26/DEIJ_NLP/master/data-raw/LandscapeDept_ListeningTour.docx -------------------------------------------------------------------------------- /data-raw/Listening Tour - Development.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjevans26/DEIJ_NLP/master/data-raw/Listening Tour - Development.pdf -------------------------------------------------------------------------------- /data-raw/Listening Tour - Marketing.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjevans26/DEIJ_NLP/master/data-raw/Listening Tour - Marketing.pdf -------------------------------------------------------------------------------- /data-raw/ListeningTour_Development.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjevans26/DEIJ_NLP/master/data-raw/ListeningTour_Development.docx -------------------------------------------------------------------------------- /data-raw/DEIJ LISTENING TOUR_cons law.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjevans26/DEIJ_NLP/master/data-raw/DEIJ LISTENING TOUR_cons law.docx -------------------------------------------------------------------------------- /data-raw/1-3-2020 DEIJ Listening Session - Governmnet Relations.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjevans26/DEIJ_NLP/master/data-raw/1-3-2020 DEIJ Listening Session - Governmnet Relations.docx -------------------------------------------------------------------------------- /data-raw/12-17-19 DEIJ Listening Session - Rockies and Plains CCattelino.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjevans26/DEIJ_NLP/master/data-raw/12-17-19 DEIJ Listening Session - Rockies and Plains CCattelino.docx -------------------------------------------------------------------------------- /DEIJ_NLP.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 
| 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /R/cleaning.R: -------------------------------------------------------------------------------- 1 | library(dplyr) 2 | library(stringr) 3 | library(tidyr) 4 | 5 | dat <- read.csv(file = 'data/ListeningTourResponses.csv', header = TRUE, stringsAsFactors = FALSE) 6 | source(file = 'R/functions.R') 7 | 8 | # TEXT CLEANING 9 | 10 | # eliminate/expand contractions 11 | dat$Statement <- sapply(dat$Statement, fix_contractions) 12 | 13 | #' function to remove special characters 14 | remove_spChars <- function(x) gsub("[^a-zA-Z0-9 ]", " ", x) 15 | 16 | # remove special characters 17 | dat$Statement <- sapply(dat$Statement, remove_spChars) 18 | 19 | # convert everything to lower case 20 | dat$Statement <- sapply(dat$Statement, tolower) 21 | 22 | # collapse by-product repeated spaces and trim leading/trailing whitespace 23 | dat$Statement <- str_squish(dat$Statement) 24 | 25 | # we will need a row number column to ID individual statements later 26 | dat <- mutate(dat, text = Statement, Statement = row_number()) 27 | # data is clean, save the data 28 | saveRDS(dat, file = 'data/data.rds') 29 | 30 |
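# Illustrative example (a hypothetical response, not drawn from the survey data):
# the steps above turn
#   "We can't reach younger audiences; we're missing them on social-media!"
# into
#   "we can not reach younger audiences we are missing them on social media"
# i.e. contractions expanded, punctuation stripped, text lower-cased, and
# leftover whitespace collapsed.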
-------------------------------------------------------------------------------- /R/functions.R: -------------------------------------------------------------------------------- 1 | #' remove special characters from a text string 2 | #' 3 | #' @param x the input string 4 | #' @return string 5 | #' 6 | remove_spChars <- function(x){ 7 | gsub("[^a-zA-Z0-9 ]", " ", x) 8 | } 9 | 10 | #' function to expand contractions in an English-language source 11 | #' 12 | #' @param doc the input string 13 | #' @return input string with contractions substituted 14 | fix_contractions <- function(doc) { 15 | # "won't" is a special case as it does not expand to "wo not" 16 | doc <- gsub("won't", "will not", doc) 17 | doc <- gsub("can't", "can not", doc) 18 | doc <- gsub("n't", " not", doc) 19 | doc <- gsub("'ll", " will", doc) 20 | doc <- gsub("'re", " are", doc) 21 | doc <- gsub("'ve", " have", doc) 22 | doc <- gsub("'m", " am", doc) 23 | doc <- gsub("'d", " would", doc) 24 | # 's could be 'is' or could be possessive: it has no expansion 25 | doc <- gsub("'s", "", doc) 26 | return(doc) 27 | } 28 | 29 | #' function that accepts the lda model and number of words to display 30 | #' 31 | #' @param lda_model output from topicmodels::LDA 32 | #' @param num_words limit output to n most frequent words 33 | #' 34 | #' @return tibble with columns 'topic', 'term', 'beta', and 'row' 35 | top_terms_per_topic <- function(lda_model, num_words) { 36 | 37 | #tidy LDA object to get word, topic, and probability (beta) 38 | topics_tidy <- tidy(lda_model, matrix = "beta") 39 | 40 | 41 | top_terms <- topics_tidy %>% 42 | group_by(topic) %>% 43 | arrange(topic, desc(beta)) %>% 44 | #get the top num_words PER topic 45 | slice(seq_len(num_words)) %>% 46 | arrange(topic, beta) %>% 47 | #row is required for the word_chart() function 48 | mutate(row = row_number()) %>% 49 | ungroup() %>% 50 | #add the word Topic to the topic labels 51 | mutate(topic = paste("Topic", topic, sep = " ")) 52 | #create a title to pass to word_chart 53 | title <- paste("LDA Top Terms for", lda_model@k, "Topics") 54 | return(top_terms) 55 | } 56 | 57 | #' function to perform the silhouette test on kmeans output 58 | #' @param k number of clusters to test 59 | #' @param d document term matrix 60 | #' @return mean silhouette score 61 | silhouette_test <- function(k, d){ 62 | km <- kmeans(d, k) 63 | ss <- silhouette(km$cluster, dist(d)) 64 | return(mean(ss[,3])) 65 | } 66 | 67 | #' return the frequency and number of departments for a term 68 | #' @param dt data frame containing textual data 69 | #' @param term character string of term to quantify 70 | #' @return vector with the number of responses containing the term (x) and 71 | #' the number of different departments providing those responses (y) 72 | dept_importance <- function(dt, term){ 73 | filt <- filter(dt, Category != "Efforts", grepl(term, text)) 74 | x <- nrow(filt) 75 | y <- length(unique(filt$Department)) 76 | return(c(x,y)) 77 | } 78 | -------------------------------------------------------------------------------- /simple.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "DEIJ Listening Tour Themes" 3 | author: "Michael Evans" 4 | date: "March 30, 2020" 5 | output: word_document 6 | always_allow_html: yes 7 | --- 8 | 9 | ```{r setup, include=FALSE} 10 | library(cleanNLP) 11 | library(cluster) 12 | library(dplyr) 13 | library(DT) 14 | library(plotly) 15 | library(tidytext) 16 | library(tidyr) 17 | library(tm) 18 | library(topicmodels) 19 | 20 | source(file = 'R/functions.R') 21 | load('data/data.RData') 22 | #dat <- readRDS(file = 'data/data.rds') 23 | 24 | # define some styling parameters for graphs 25 | tlf <- list(color = 'black', size = 14) 26 | tkf <- list(color = 'black', size = 12) 27 | 28 | # set seed for repeatability 29 | seed <- 1024 30 | ``` 31 | 32 | ## Background 33 | 34 | The DEIJ working group previously conducted an organizational 'listening tour' in which members spoke with staff in each of Defenders' departments and field offices to understand the current landscape of ongoing efforts related to Diversity, Equity, Inclusion, and Justice and identify where staff saw opportunities for growth in the organization. Defenders' staff had a lot of constructive thoughts, and distilling these ideas into actionable themes quickly became a challenge. 35 | 36 | To help extract cohesive themes from these responses, we decided to try methods from a field of machine learning called 'Natural Language Processing' (NLP). The discipline of NLP covers a collection of techniques to automate the extraction of themes, sentiments, and associations from text. The working group's goal is to use these insights to identify focal areas in which Defenders can improve our DEIJ practices and to inform the initiatives we will focus on in the coming year. 37 | 38 | ## Our Findings 39 | 40 | Altogether, we collected `r nrow(dat)` ‘statements’ from Defenders staff over the course of the listening tour. After processing and standardizing these statements, we were able to distill four focal areas: 41 | 42 | 1. Initiating a paid internship program 43 | 2. Engaging and working with tribal governments 44 | 3. Using social media to engage more diverse audiences 45 | 4.
Defining equitable hiring protocols and providing training 46 | 47 | While there were several analyses and some interpretation needed to arrive at this list, the process can best be visualized by the frequency of two-word phrases, or ‘bigrams’: 48 | 49 | ```{r bigram_counts, warnings = FALSE, echo = FALSE} 50 | # lets see the most frequently used bigrams 51 | bigram_counts%>% 52 | plot_ly(type = 'bar', y = ~bigram, x = ~n, 53 | orientation = 'h', 54 | marker = list(color = '#005596'))%>% 55 | layout( 56 | xaxis = list( 57 | title = 'Frequency', 58 | tickfont = tkf, 59 | titlefont = tlf, 60 | showgrid = FALSE, 61 | linewidth = 1 62 | ), 63 | yaxis = list( 64 | title = 'Bigram', 65 | titlefont = tlf, 66 | tickfont = tkf, 67 | showticklabels = FALSE 68 | ), 69 | title = 'Top 10 Most Common Bigrams', 70 | titlefont = list(color = 'black', size = 16) 71 | )%>% 72 | add_annotations( 73 | text = ~bigram, 74 | showarrow = FALSE, 75 | x = ~n, 76 | xanchor = 'right', 77 | font = list(color = 'white') 78 | ) 79 | ``` 80 | 81 | These were the most frequently occurring two-word phrases (that consisted of meaningful words) that appeared across all the responses gathered during the listening tour. These phrases are helpful because each represents an intuitive concept. To get an even better sense of the relative importance of these concepts we measured not only how often each bigram was mentioned overall, but how many different departments mentioned them. 82 | 83 | ```{r bigram_deptfreq, echo = FALSE, warning = FALSE, error = FALSE} 84 | lapply(bigram_counts$bigram, function(term){ 85 | vec <- dept_importance(dat, term) 86 | return(data.frame(bigram = term, total = vec[1], depts = vec[2])) 87 | })%>% 88 | bind_rows()%>% 89 | plot_ly(type = 'scatter', mode = 'markers', 90 | marker = list(color = '00000000'), 91 | x = ~total, y = ~depts)%>% 92 | add_annotations(text = bigram_counts$bigram, 93 | font = list(size = 12, color= 'black'), 94 | showarrow = TRUE, 95 | arrowcolor = 'white', 96 | xanchor = 'center', 97 | arrowhead = 0, 98 | ax = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 99 | ay = c(20, -20, 0, 30, 0, 0, -30, 0, 10, -10 ))%>% 100 | layout( 101 | xaxis = list(title = 'Term frequency', 102 | titlefont = tlf, 103 | tickfont = tkf, 104 | showgrid = FALSE, 105 | range = c(0,5)), 106 | yaxis = list(title = '# Departments', 107 | titlefont = tlf, 108 | tickfont = tkf, 109 | showgrid = FALSE, 110 | range = c(0, 5)) 111 | ) 112 | ``` 113 | 114 | Taking the bigrams that appear in the top right portion of this plot provided us with a solid definition of major domains in which Defenders could focus our DEIJ efforts: paid internships; social media, tribal governments/indigenous communities; hiring practices. 115 | 116 | ## Conclusion 117 | 118 | We were able to use Natural Language Processing to quickly and objectively filter through thoughts provided by Defenders' staff during the DEIJ listening tour to extract the most salient terms, sentiments, and themes. Using automated processes provides a replicable method for identifying themes and priorities in the future, should Defenders conduct similar or more extended social surveys. The four themes that we identified will help guide the DEIJ working group as we craft a strategy and list of objectives for our work. 
-------------------------------------------------------------------------------- /R/analysis.R: -------------------------------------------------------------------------------- 1 | ## ANALYSIS 2 | library(cleanNLP) 3 | library(cluster) 4 | library(dplyr) 5 | library(plotly) 6 | library(tidytext) 7 | library(tidyr) 8 | library(tm) 9 | library(topicmodels) 10 | 11 | source(file = 'R/functions.R') 12 | dat <- readRDS(file = 'data/data.rds') 13 | 14 | # some words will come up frequently that are not helpful 15 | unhelpful_words <- c("defenders", 16 | "deij", 17 | "diversity", 18 | "wildlife", 19 | "diverse") 20 | 21 | # TOKENIZE DATA 22 | # single words 23 | dat_tokenized <- dat %>% 24 | tidytext::unnest_tokens(word, text) %>% 25 | anti_join(stop_words) %>% 26 | distinct() %>% 27 | filter(!word %in% unhelpful_words) %>% 28 | filter(nchar(word) > 3) 29 | 30 | # lets see the most frequently used words 31 | dat_tokenized%>% 32 | filter(Category != 'Efforts')%>% 33 | count(word, sort = TRUE)%>% 34 | top_n(10)%>% 35 | mutate(word = reorder(word, n))%>% 36 | plot_ly(type = 'bar', y = ~word, x = ~n, 37 | orientation = 'h')%>% 38 | layout(barmode = 'stack') 39 | 40 | # create document-term matrix 41 | dt_mat <- dat_tokenized%>% 42 | filter(Category != 'Efforts')%>% 43 | mutate(Statement = as.character(Statement))%>% 44 | count(Statement, word, sort = TRUE)%>% 45 | ungroup()%>% 46 | tidytext::cast_dtm(document = Statement, term = word, value = n) 47 | 48 | inspect(dt_mat[1:4, 1:8]) 49 | 50 | seed <- 1024 51 | # run the LDA analysis 52 | 53 | # test a series of k's for likelihood minima 54 | # ks <- sapply(seq(2,20,1), function(k){LDA(dt_mat, k = k, method = "GIBBS", control = list(seed = seed))@loglikelihood}) 55 | # 56 | # plot(seq(2,20,1), ks) 57 | # 58 | # k <- 9 59 | 60 | # calculate average silhouette value for a series of ks 61 | avg_sil <- sapply(2:20, function(x){silhouette_test(x, dt_mat)}) 62 | plot_ly(type = 'scatter', mode = 'lines', x = 2:20, y = avg_sil)%>% 63 | layout(yaxis = list(title = 'Silhouette score', range = c(0, 0.5))) 64 | 65 | k <- 5 66 | 67 | lda <- topicmodels::LDA(dt_mat, k = k, method = "GIBBS", control = list(seed = seed)) 68 | 69 | # return the probability per term of inclusion in each topic 70 | betas <- tidy(lda, matrix = 'beta') 71 | 72 | # get the top terms per topic 73 | topic_terms <- top_terms_per_topic(lda, 7) 74 | 75 | # plot the top terms per topic 76 | subplot(lapply(unique(topic_terms$topic), function(x){ 77 | plot_ly(data = filter(topic_terms, topic == x)%>% 78 | mutate(term = reorder(term, beta)), 79 | type = 'bar', y = ~term, x = ~beta, 80 | orientation = 'h', 81 | name = x)%>% 82 | layout(xaxis = list(range = c(0, max(betas$beta))), 83 | yaxis = list(tickfont = list(size = 8), 84 | showticklabels = FALSE))%>% 85 | add_annotations( 86 | text = ~term, 87 | x = ~beta, xanchor = 'right', 88 | font = list(color = 'white'), 89 | showarrow = FALSE 90 | ) 91 | }), 92 | nrows = (k%/%2)+1)%>% 93 | layout(title = paste('Terms (k = ', k, ')', sep = "")) 94 | 95 | # Calculate TF-IDF 96 | dat_tokenized%>% 97 | filter(Category != 'Efforts')%>% 98 | count(Statement, word)%>% 99 | ungroup()%>% 100 | bind_tf_idf(term = word, document = Statement, n = n)%>% 101 | ungroup()%>% 102 | arrange(desc(tf_idf))%>% 103 | top_n(10, tf_idf)%>% 104 | mutate(word = reorder(word, tf_idf))%>% 105 | plot_ly(type = 'bar', orientation = 'h', x = ~tf_idf, y = ~word) 106 | 107 | # BIGRAMS 108 | # pull out 2-word phrases from all statements 109 | bigrams <- dat %>% 110 | unnest_tokens(bigram, 
text, token = "ngrams", n = 2) 111 | 112 | # separate word pairs so we can filter out unwanted words 113 | bigrams_separated <- bigrams %>% 114 | separate(bigram, c("word1", "word2"), sep = " ") 115 | 116 | # filter stop words from each half and pairs with the same word 117 | bigrams_filtered <- bigrams_separated %>% 118 | filter(!word1 %in% stop_words$word, !word2 %in% stop_words$word) %>% 119 | filter(!word1 %in% unhelpful_words, !word2 %in% unhelpful_words)%>% 120 | filter(word1 != word2, Category != 'Efforts') 121 | 122 | # rejoin bigrams and tally by department 123 | bigram_counts <- bigrams_filtered %>% 124 | # recombine the bigrams 125 | unite(bigram, word1, word2, sep = " ") %>% 126 | # filter out current efforts 127 | filter(Category != 'Efforts')%>% 128 | # group by bigram and department then add count and sort 129 | count(bigram, sort = TRUE) %>% 130 | # count ungroups, so re-group by department (optional) 131 | #group_by(Department) %>% 132 | slice(seq_len(10)) %>% 133 | #ungroup() %>% 134 | arrange(n) %>% 135 | mutate(row = row_number(), bigram = reorder(bigram, n)) 136 | 137 | # lets see the most frequently used bigrams 138 | bigram_counts%>% 139 | plot_ly(type = 'bar', y = ~bigram, x = ~n, 140 | orientation = 'h')%>% 141 | layout(barmode = 'stack') 142 | 143 | # Bigram LDA 144 | bigram_dtm <- bigrams_filtered%>% 145 | unite(bigram, word1, word2, sep = " ")%>% 146 | mutate(Statement = as.character(Statement))%>% 147 | count(Statement, bigram, sort = TRUE)%>% 148 | ungroup()%>% 149 | cast_dtm(document = Statement, term = bigram, value = n) 150 | 151 | inspect(bigram_dtm[1:4, 1:8]) 152 | 153 | # test a series of k's for likelihood minima 154 | seed <- 1024 155 | 156 | # ks <- sapply(seq(2,20,1), function(k){LDA(bigram_dtm, k = k, method = "GIBBS", control = list(seed = seed))@loglikelihood}) 157 | # 158 | # plot(seq(2,20,1), ks) 159 | # 160 | # k <- 12 161 | 162 | # calculate average silhouette value for a series of ks 163 | avg_sil <- sapply(2:20, function(x){silhouette_test(x, bigram_dtm)}) 164 | plot_ly(type = 'scatter', mode = 'lines', x = 2:20, y = avg_sil)%>% 165 | layout(yaxis = list(title = 'Silhouette score', range = c(0, 0.5))) 166 | #plot(2:20, type='b', avg_sil, xlab='Number of clusters', ylab='Average Silhouette Scores', frame=FALSE) 167 | k <- 5 168 | 169 | bigram_lda <- LDA(bigram_dtm, k = k, method = "GIBBS", control = list(seed = seed)) 170 | 171 | bigram_betas <- tidy(bigram_lda, matrix = 'beta') 172 | 173 | bigram_topic_terms <- top_terms_per_topic(bigram_lda, 7)%>% 174 | group_by(topic)%>%arrange(sort(beta)) 175 | 176 | subplot(lapply(unique(bigram_topic_terms$topic), function(x){ 177 | plot_ly(data = filter(bigram_topic_terms, topic == x)%>% 178 | mutate(term = reorder(term, beta)), 179 | type = 'bar', y = ~term, x = ~beta, 180 | orientation = 'h', 181 | name = x)%>% 182 | layout(xaxis = list(range = c(0, max(bigram_betas$beta))), 183 | yaxis = list(tickfont = list(size = 10))) 184 | }),nrows = (k%/%2) + (k%%2))%>% 185 | layout(title = paste('Bigrams (k = ', k, ')', sep = "")) 186 | 187 | # STEM ANALYSIS. 
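# Lemmatization collapses inflected word forms onto a common base form before we
# build the document-term matrix, so forms such as 'communities'/'community' or
# 'hiring'/'hire' get counted as one term. Rough illustration of the annotation
# output used below (assumes the udpipe English model; lemmas are approximate):
#   toy <- data.frame(doc_id = 1, text = 'engaging local communities')
#   cnlp_annotate(toy)$token[, c('token', 'lemma')]
#   # tokens 'engaging', 'local', 'communities' map to lemmas roughly
#   # 'engage', 'local', 'community'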
188 | # use cleanNLP tools to create stems 189 | cnlp_init_udpipe() 190 | 191 | anno <- dat%>% 192 | filter(Category != 'Efforts')%>% 193 | cnlp_annotate(verbose=FALSE) 194 | 195 | # create document term matrix on word stems 196 | anno_mat <- anno$token%>% 197 | filter(nchar(token)>3, !token %in% unhelpful_words)%>%#, upos == 'VERB' | upos == 'NOUN')%>% 198 | anti_join(stop_words, by = c('token' = 'word'))%>% 199 | count(doc_id, lemma, sort = TRUE)%>% 200 | ungroup()%>% 201 | cast_dtm(document = doc_id, term = lemma, value = n) 202 | 203 | # test a series of k's for likelihood minima 204 | avg_sil <- sapply(2:20, function(x){silhouette_test(x, anno_mat)}) 205 | plot_ly(type = 'scatter', mode = 'lines', x = 2:20, y = avg_sil)%>% 206 | layout(yaxis = list(title = 'Silhouette score', range = c(0, 0.5))) 207 | k <- 7 208 | 209 | # ks <- sapply(seq(2,20,1), function(k){LDA(anno_mat, k = k, method = "GIBBS", control = list(seed = seed))@loglikelihood}) 210 | # 211 | # plot(seq(2,20,1),ks) 212 | # k <- 8 213 | 214 | # run lda analysis 215 | anno_lda <- LDA(anno_mat, k = k, method = "GIBBS", control = list(seed = seed)) 216 | 217 | # return the probability per term of inclusion in each topic 218 | anno_betas <- tidy(anno_lda, matrix = 'beta') 219 | 220 | # get the top terms per topic 221 | anno_topic_terms <- top_terms_per_topic(anno_lda, 7) 222 | 223 | # plot the top terms per topic 224 | subplot(lapply(unique(anno_topic_terms$topic), function(x){ 225 | plot_ly(data = filter(anno_topic_terms, topic == x)%>% 226 | mutate(term = reorder(term, beta)), 227 | type = 'bar', y = ~term, x = ~beta, 228 | orientation = 'h', 229 | name = x)%>% 230 | layout(xaxis = list(range = c(0, max(anno_betas$beta))), 231 | yaxis = list(tickfont = list(size = 8))) 232 | }),nrows = (k%/%2)+1)%>% 233 | layout(title = paste('Stem (k = ', k, ')', sep = "")) 234 | 235 | 236 | # K-MEANS CLUSTERING 237 | # One word tokens 238 | 239 | # calculate average silhouette value for a series of ks 240 | avg_sil <- sapply(2:20, function(x){silhouette_test(x, dt_mat)}) 241 | plot(2:20, type='b', avg_sil, xlab='Number of clusters', ylab='Average Silhouette Scores', frame=FALSE) 242 | k <- 3 243 | # cluster terms according to k 244 | km <- kmeans(dt_mat, k) 245 | # grab the top 10 words most strongly associated with each topic 246 | kmeans_topics <- as.data.frame(t(km$centers)) 247 | colnames(kmeans_topics) <- paste("Topic", 1:k) 248 | kmeans_topics$term <- rownames(kmeans_topics) 249 | kmeans_topics <- gather(kmeans_topics, topic, score, -term)%>% 250 | group_by(topic)%>% 251 | arrange(topic, desc(score))%>% 252 | slice(seq_len(10))%>% 253 | arrange(topic, score) 254 | 255 | subplot(lapply(unique(kmeans_topics$topic), function(x){ 256 | plot_ly(data = filter(kmeans_topics, topic == x), 257 | y = ~term, x = ~score, 258 | type = 'bar', orientation = 'h') 259 | }), nrows = 3)%>% 260 | layout(title = 'Annotation (k = 8)') 261 | -------------------------------------------------------------------------------- /index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Applying machine learning to advance Diversity, Equity, Inclusion & Justice at Defenders" 3 | author: "Michael Evans" 4 | date: "`r Sys.Date()`" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | library(cleanNLP) 10 | library(cluster) 11 | library(dplyr) 12 | library(DT) 13 | library(plotly) 14 | library(tidytext) 15 | library(tidyr) 16 | library(tm) 17 | library(topicmodels) 18 | 19 | source(file = 
'R/functions.R') 20 | load('data/data.RData') 21 | #dat <- readRDS(file = 'data/data.rds') 22 | 23 | # define some styling parameters for graphs 24 | tlf <- list(color = 'black', size = 14) 25 | tkf <- list(color = 'black', size = 12) 26 | 27 | # set seed for repeatability 28 | seed <- 1024 29 | ``` 30 | 31 | ## Diversity, Equity, Inclusion & Justice 32 | 33 | Defenders of Wildlife is committed to becoming a more diverse, equitable, and inclusive organization and strives to be a leader in these efforts among environmental NGOs. The diversity, equity, inclusion and justice (DEIJ) working group was formed to help guide Defenders' efforts along this path. 34 | 35 | As a first step, the DEIJ working group conducted an organizational 'listening tour.' DEIJ Working Group members spoke with staff in each of Defenders’ departments and field offices to catalog ongoing efforts related to DEIJ and identify where staff saw opportunities for growth in the organization. Defenders' staff had many constructive thoughts, and distilling these ideas into actionable themes quickly became a challenge. 36 | 37 | To help extract common themes from these responses, we decided to use a field of machine learning called ‘Natural Language Processing’ (NLP). The discipline of NLP covers a collection of techniques to automate the extraction of themes, sentiments, and associations from text. Our goal was to use this kind of analysis to help identify focal areas in which Defenders can advance our DEIJ practices in the coming year. This document will: 38 | 39 | 1. Describe how we processed and analyzed the listening tour responses 40 | 2. Present and interpret outputs of different NLP analyses 41 | 3. Identify the final set of common themes extracted from the listening tour 42 | 43 | ## Data processing 44 | 45 | Statements were divided into efforts currently underway in the department, and opportunities that staff members saw for the organization to improve its DEIJ efforts. Because we're most interested in setting goals for future work, in these analyses we focused only on the 'Opportunity' statements. 46 | 47 | For these statements to be useful, we have to pre-process all the raw responses. We standardized language by making all characters lowercase, removing punctuation, and fixing contractions. 48 | 49 | ```{r input_data, echo = FALSE} 50 | datatable(dat[,c(1,2,4)], 51 | rownames = FALSE, 52 | colnames = c('Dept.', 'Category', 'Statement'), 53 | caption = 'Table 1. Example responses from listening tour', 54 | options = list( 55 | dom = 't' 56 | )) 57 | ``` 58 | 59 | Because the goal of NLP is to identify themes and sentiments using the frequency of terms and their associations, it is helpful to remove common words that (usually) do not add meaning. There is a standard list of these 'stop words', and we added a few that also fit these criteria (e.g. 'Defenders', 'Wildlife', etc.). Finally, we 'tokenize' the data, breaking each statement into individual words or phrases. This resulted in `r nrow(dat_tokenized)` single words. 60 | 61 | ```{r tokenization, echo = FALSE} 62 | datatable(dat_tokenized[, c(1,3,4)], 63 | rownames = FALSE, 64 | colnames = c('Dept.', 'Statement', 'Term'), 65 | caption = 'Table 2. Example terms from listening tour data.', 66 | options = list( 67 | dom = 't' 68 | ) 69 | ) 70 | ```
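For readers curious about the mechanics, this tokenization step is only a few lines of `tidytext` code. The sketch below uses a made-up response rather than our data; the real pipeline additionally drops duplicate terms within a statement, very short words, and a handful of organization-specific terms:

```r
library(dplyr)
library(tidytext)

example <- data.frame(
  Statement = 1,
  text = "we could partner with tribal governments on habitat projects"
)

example %>%
  unnest_tokens(word, text) %>%        # one row per word
  anti_join(stop_words, by = "word")   # drop common words like "we", "could", "with"
#> leaves roughly: partner, tribal, governments, habitat, projects
```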
71 | 72 | ## Term frequency 73 | 74 | Now that we have our listening tour statements broken down into individual (hopefully) meaningful tokens, the first thing we can do is look at the most common terms in these statements. Here are the top 10 single words. 75 | 76 | ```{r term_counts, warnings = FALSE, echo = FALSE} 77 | # lets see the most frequently used words 78 | dat_tokenized%>% 79 | filter(Category != 'Efforts')%>% 80 | count(word, sort = TRUE)%>% 81 | top_n(10)%>% 82 | mutate(word = reorder(word, n))%>% 83 | plot_ly(type = 'bar', y = ~word, x = ~n, 84 | orientation = 'h', 85 | marker = list(color = '#005596'))%>% 86 | layout( 87 | xaxis = list( 88 | title = 'Frequency', 89 | titlefont = tlf, 90 | tickfont = tkf, 91 | showgrid = FALSE, 92 | linewidth = 1 93 | ), 94 | yaxis = list( 95 | title = 'Term', 96 | titlefont = tlf, 97 | tickfont = tkf, 98 | showticklabels = FALSE 99 | ), 100 | title = 'Top 10 Most Common Terms', 101 | titlefont = list(color = 'black', size = 16) 102 | )%>% 103 | add_annotations( 104 | text = ~word, 105 | showarrow = FALSE, 106 | x = ~n, 107 | xanchor = 'right', 108 | font = list(color = 'white') 109 | ) 110 | ``` 111 | 112 | These terms give us some idea of the common themes expressed in the listening tour, such as engagement with communities and training. 113 | 114 | ## Cluster analysis 115 | 116 | The frequencies of terms in listening tour responses have given us a good idea of where opportunities for DEIJ improvements across Defenders might exist. A more sophisticated analysis uses the same data to automatically identify clusters of terms. There are several techniques available for this task, and each uses the frequency of co-occurrence of different terms across responses to cluster terms together. We can then interpret the theme represented by each cluster by examining the words they contain. 117 | 118 | *Note: these methods usually require large (>10,000 records) datasets, and so our use here with `r nrow(dat_tokenized)` data points is somewhat experimental.* 119 | 120 | ### How many themes? 121 | 122 | But how many themes are present in the responses? The first step in this analysis is to group our data into a progressively greater number of clusters, and use a measure of cohesion to see which number is most supported. Here we use the silhouette test, which compares the dissimilarity within clusters to that between clusters. Higher scores indicate stronger support, and we choose the cluster number (k) beyond which adding more clusters yields little improvement. This provides a balance between fit and simplicity of interpretation. 123 | 124 | ```{r k, warnings = FALSE, echo = FALSE} 125 | # calculate average silhouette value for a series of ks 126 | avg_sil <- sapply(2:20, function(x){silhouette_test(x, dt_mat)}) 127 | 128 | plot_ly(type = 'scatter', mode = 'lines', x = 2:20, y = avg_sil)%>% 129 | layout(yaxis = list(title = 'Silhouette score', range = c(0, 0.5))) 130 | 131 | k <- 7 132 | ``` 133 | 134 | This graph tells us that the most likely number of thematic clusters for single terms is `r k`. Assuming `r k` groups, we can use an algorithm called 'Latent Dirichlet Allocation' to assign terms to each of `r k` clusters based on how frequently different terms appear together. Plotting the terms most strongly associated with each cluster gives us an idea of the 'themes' captured in each cluster.
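For reference, the model fit in the (hidden) chunk below comes down to a single call to `topicmodels::LDA()` on the document-term matrix of statements; `tidytext::tidy()` can then pull out either the per-topic word probabilities plotted below or the per-statement topic weights. This is just a sketch of that call, not an additional analysis:

```r
lda <- topicmodels::LDA(dt_mat, k = k, method = "GIBBS", control = list(seed = seed))

tidy(lda, matrix = "beta")    # probability of each term within each topic
tidy(lda, matrix = "gamma")   # weight of each topic within each statement
```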
135 | 136 | ```{r lda, warnings = FALSE, echo = FALSE} 137 | lda <- topicmodels::LDA(dt_mat, k = k, method = "GIBBS", control = list(seed = seed)) 138 | 139 | # return the probability per term of inclusion in each topic 140 | betas <- tidy(lda, matrix = 'beta') 141 | 142 | # get the top terms per topic 143 | topic_terms <- top_terms_per_topic(lda, 7) 144 | 145 | subplot(lapply(unique(topic_terms$topic), function(i){ 146 | if(i == 'Topic 1'){ 147 | col <- c(rep('black', 6), 'white') 148 | anchor <- c(rep('left', 6), 'right') 149 | }else{ 150 | col <- 'black' 151 | anchor <- 'left' 152 | } 153 | plot_ly(data = filter(topic_terms, topic == i)%>% 154 | mutate(term = reorder(term, beta)), 155 | type = 'bar', y = ~term, x = ~beta, 156 | orientation = 'h', 157 | name = i)%>% 158 | layout(xaxis = list(range = c(0, max(betas$beta)), 159 | showgrid = FALSE, 160 | linewidth = 1), 161 | yaxis = list(showticklabels = FALSE), 162 | margin = list(r = 100))%>% 163 | add_annotations( 164 | text = ~term, 165 | x = ~beta, 166 | xanchor = anchor, 167 | font = list(color = c(rep('black', 6), 'white')), 168 | showarrow = FALSE 169 | ) 170 | }), 171 | nrows = (k%/%2) + (k%%2))%>% 172 | layout(title = paste('Terms (k = ', k, ')', sep = "")) 173 | ``` 174 | 175 | Clustering analyses require human interpretation to make sense of the clusters that emerge based on the words they contain. In this small dataset, some of the clusters are more ambiguous than others - however there appear to be several clear themes. For instance, Topic 1 contains terms related to community outreach and engagement. Topic 7 seems to include words related to personnel practices, and Topic 6 encapsulates expanding Defenders' audience. Seeing the terms that co-occur frequently in staff responses is a starting point for understanding the common themes that were expressed. 176 | 177 | ## Two-word phrases 178 | 179 | Thus far, we've only looked at single terms to understand the topics present in the listening tour responses. It can also be informative to examine two word phrases, or bigrams. In the listening tour dataset, there were `r nrow(bigrams_filtered)` instances of meaningful bigrams. 180 | 181 | ```{r bigrams, echo = FALSE} 182 | datatable(bigrams_filtered[, c(1,3,4,5)], 183 | rownames = FALSE, 184 | colnames = c('Dept.', 'Statement', 'Word 1', 'Word 2'), 185 | caption = 'Table 3. Example bigrams from listening tour data.', 186 | options = list(dom = 't') 187 | ) 188 | ``` 189 | 190 | Looking at the most used two-word bigrams might provide some additional clarity around important topics. 191 | 192 | ```{r bigram_counts, warnings = FALSE, echo = FALSE} 193 | # lets see the most frequently used bigrams 194 | bigram_counts%>% 195 | plot_ly(type = 'bar', y = ~bigram, x = ~n, 196 | orientation = 'h', 197 | marker = list(color = '#005596'))%>% 198 | layout( 199 | xaxis = list( 200 | title = 'Frequency', 201 | tickfont = tkf, 202 | titlefont = tlf, 203 | showgrid = FALSE, 204 | linewidth = 1 205 | ), 206 | yaxis = list( 207 | title = 'Bigram', 208 | titlefont = tlf, 209 | tickfont = tkf, 210 | showticklabels = FALSE 211 | ), 212 | title = 'Top 10 Most Common Bigrams', 213 | titlefont = list(color = 'black', size = 16) 214 | )%>% 215 | add_annotations( 216 | text = ~bigram, 217 | showarrow = FALSE, 218 | x = ~n, 219 | xanchor = 'right', 220 | font = list(color = 'white') 221 | ) 222 | ``` 223 | 224 | The themes emerging based on the most common two-word phrases used during the listening tour are clearer. 
The working group will likely only take up 2 or 3 areas around which we will develop projects and initiatives, so how might we prioritize this list? 225 | 226 | In addition to the raw frequency, it could be informative to see how these terms were distributed among departments. We can do this by plotting the number of departments that used a term against the overall frequency of that term. Ideas that Defenders may want to prioritize will be mentioned frequently and by many departments. 227 | 228 | ```{r bigram_deptfreq, echo = FALSE, warning = FALSE, error = FALSE} 229 | lapply(bigram_counts$bigram, function(term){ 230 | vec <- dept_importance(dat, term) 231 | return(data.frame(bigram = term, total = vec[1], depts = vec[2])) 232 | })%>% 233 | bind_rows()%>% 234 | plot_ly(type = 'scatter', mode = 'markers', 235 | marker = list(color = '00000000'), 236 | x = ~total, y = ~depts)%>% 237 | add_annotations(text = bigram_counts$bigram, 238 | font = list(size = 12, color= 'black'), 239 | showarrow = TRUE, 240 | arrowcolor = 'white', 241 | xanchor = 'center', 242 | arrowhead = 0, 243 | ax = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 244 | ay = c(20, -20, 0, 30, 0, 0, -30, 0, 10, -10 ))%>% 245 | layout( 246 | xaxis = list(title = 'Term frequency', 247 | titlefont = tlf, 248 | tickfont = tkf, 249 | showgrid = FALSE, 250 | range = c(0,5)), 251 | yaxis = list(title = '# Departments', 252 | titlefont = tlf, 253 | tickfont = tkf, 254 | showgrid = FALSE, 255 | range = c(0, 5)) 256 | ) 257 | ``` 258 | 259 | In fact, taking the bigrams that appear in the top right portion of this plot would provide common themes on which Defenders could focus efforts to expand and improve DEIJ practices. Let's examine the responses from which these phrases originated to get some context. For example: 260 | 261 | ```{r bigram_phrases, echo = FALSE, eval = TRUE, warning = FALSE, error = FALSE} 262 | filter(dat, grepl("social media", text))%>% 263 | select(Department, text)%>% 264 | datatable( 265 | rownames = FALSE, 266 | colnames = c('Dept.', 'Statement'), 267 | caption = 'Table 4. Responses including "social media"', 268 | options = list( 269 | dom = 't' 270 | )) 271 | ``` 272 | 273 | In theory, we can perform a clustering analysis for bigrams as well. Unfortunately, these clustering algorithms failed to converge using bigrams. This is likely because these kinds of analyses usually require large data sets, and the `r nrow(bigrams_filtered)` bigrams recorded were insufficient. 274 | 275 | However, just by examining the statements associated with the most important bigrams, we identified: 276 | 277 | 1. Initiating a paid internship program 278 | 2. Engaging and working with tribal governments 279 | 3. Using social media to engage more diverse audiences 280 | 4. Defining equitable hiring protocols and providing training 281 | 282 | ## Conclusion 283 | 284 | Natural Language Processing is a useful tool for making sense of written text. These techniques still require some human interpretation, but we were able to use them to quickly filter through a large amount of text and extract the most salient terms, sentiments, and themes. Using automated processes also provides a replicable method for identifying themes and priorities in the future, should Defenders collect similar or more extensive social response data.
285 | 286 | ```{r bigram_k, warnings = FALSE, echo = FALSE} 287 | # avg_sil <- sapply(2:20, function(x){silhouette_test(x, bigram_dtm)}) 288 | # 289 | # plot_ly(type = 'scatter', mode = 'lines', x = 2:20, y = avg_sil)%>% 290 | # layout(yaxis = list(title = 'Silhouette score', range = c(0, 0.5))) 291 | # 292 | # k <- 9 293 | ``` 294 | 295 | ```{r bigram_lda, warnings = FALSE, echo = FALSE} 296 | # bigram_lda <- LDA(bigram_dtm, k = k, method = "GIBBS", control = list(seed = seed)) 297 | # 298 | # bigram_betas <- tidy(bigram_lda, matrix = 'beta') 299 | # 300 | # bigram_topic_terms <- top_terms_per_topic(bigram_lda, 7)%>% 301 | # group_by(topic)%>%arrange(sort(beta)) 302 | # 303 | # subplot(lapply(unique(bigram_topic_terms$topic), function(x){ 304 | # plot_ly(data = filter(bigram_topic_terms, topic == x)%>% 305 | # mutate(term = reorder(term, beta)), 306 | # type = 'bar', 307 | # orientation = 'h', 308 | # y = ~term, x = ~beta, 309 | # name = x)%>% 310 | # layout(xaxis = list(showgrid = FALSE, 311 | # linewidth = 1, 312 | # range = c(0, max(bigram_betas$beta)), 313 | # showgrid = FALSE, 314 | # linewidth = 1), 315 | # yaxis = list(showticklabels = FALSE))%>% 316 | # add_annotations( 317 | # text = ~term, 318 | # x = ~beta, xanchor = 'right', 319 | # showarrow = FALSE, 320 | # font = list(color = 'white') 321 | # ) 322 | # }), 323 | # nrows = (k%/%2) + (k%%2))%>% 324 | # layout(title = paste('Bigrams (k = ', k, ')', sep = "")) 325 | 326 | ``` 327 | --------------------------------------------------------------------------------