├── .gitignore
├── README.md
└── schemaNLP_clean.R

/.gitignore:
--------------------------------------------------------------------------------
# History files
.Rhistory
.Rapp.history

# Session Data files
.RData

# User-specific files
.Ruserdata

# Example code in package build process
*-Ex.R

# Output files from R CMD build
/*.tar.gz

# Output files from R CMD check
/*.Rcheck/

# RStudio files
.Rproj.user/

# produced vignettes
vignettes/*.html
vignettes/*.pdf

# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
.httr-oauth

# knitr and R markdown default cache directories
*_cache/
/cache/

# Temporary files created by R markdown
*.utf8.md
*.knit.md

# R Environment Variables
.Renviron
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Automated Scoring of Schematic Content Using Natural Language Processing

Remembering past events and imagining future events often requires individuals to draw upon schematic knowledge, or knowledge about what typically happens in situations. To enable researchers to study schemas, we developed a measure of typical content in narratives. This script automates the scoring of schematic content in narratives by:

- Using GloVe embeddings to create dictionaries of words related to specific cues.
- Counting how many words in each narrative match the relevant dictionary.

A dictionary is simply a list of relevant words. For example, a dictionary for the cue 'beach' would contain words like 'sand' and 'waves'.

The code provided here produces scores according to the methods described in [Wynn et al. (2022)](https://www.sciencedirect.com/science/article/pii/S1053810022000344?casa_token=x0LIK_gDaRsAAAAA:6LItAH6udi70-SEGwkJ3i3QAlHiqvzMIz9cPwRVPGzZch0Wgb-Ucf49ktBYPjMs4mdY9lSv-mQ). Work is ongoing to develop and extensively validate this approach.

Feel free to reach out to the authors with any questions. We can be contacted at r.vangenugten@northeastern.edu and jordwynn@uvic.ca.


## Data Analysis Workflow

Below, we provide some information on how to use this code.

### Prerequisites

Before running the script, ensure you have:
- R and RStudio installed on your computer.
- GloVe embeddings downloaded (glove.Rdata). You can find the .Rdata file on Google Drive [here](https://drive.google.com/file/d/13huoIUVwwvOMr-pRAAI81hMzBnhL93rF/view).
- Enough space on your computer for the 3 GB GloVe download.

### Data Preparation

Your input data should be a CSV file named transcriptions_all.csv, containing the following columns:

- Subject: Numeric identifier for the subject.
- Trial: Numeric identifier for the trial.
- Cue: The cue word or phrase (e.g., "beach").
- Transcript: The text narrative to be analyzed.

Note: Ensure that the cue words are single words compatible with GloVe embeddings. If your cues are multi-word phrases (e.g., "thanksgiving dinner"), simplify them to single words (e.g., "thanksgiving").
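For illustration, a minimal input file could be created like this (the subject IDs, cues, and transcripts below are hypothetical; substitute your own data):

```r
# Hypothetical example of the expected transcriptions_all.csv structure
example_data <- data.frame(
  Subject    = c(1, 1, 2),
  Trial      = c(1, 2, 1),
  Cue        = c("beach", "thanksgiving", "beach"),
  Transcript = c(
    "I remember walking along the sand and listening to the waves crash ...",
    "The whole family gathered around the table and we ate far too much ...",
    "We built a sandcastle near the water and collected shells all afternoon ..."
  )
)
write.csv(example_data, "transcriptions_all.csv", row.names = FALSE)
```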
### Setting Up the Environment

- Move GloVe Embeddings: Place the glove.Rdata file you downloaded in the same folder as the code script.
- Move Your Data: Place transcriptions_all.csv in the same folder as the code script. Alternatively, update the code to point to its location.


### Running the Script

- Double-Check That All Files Are in the Right Place: The code, transcriptions_all.csv, and glove.Rdata should be in the same folder.
- Open the Script in RStudio.
- Review the Script: Before running, read through the comments. You should be able to use it without making code changes.
- Run the Script: Execute the script in RStudio. This may take some time, especially loading the GloVe embeddings and computing similarities.

### Interpreting the Output

Output Files:
- schema_dictionaries.csv: Contains the lists of words similar to each cue.
- narratives_scores.csv: A data frame with the schema scores for each narrative.
- Key columns:
  - nonStopword_wordCount: Total number of words in the narrative after removing stopwords. Stopwords are common or filler words (e.g., 'the', 'and', etc.).
  - SchemaWordsCount: Number of words matching the schema dictionary.
  - SchemaWordsIdentified: The actual words identified as schema-related.
  - SchemaMismatchCount: Baseline score using the other cues for comparison (e.g., using all non-beach dictionaries to calculate schema scores for a beach narrative).


Understanding Schema Scores:
- Schema Words Count: Higher counts indicate that the narrative contains more words related to the cue.
- Mismatch Count: Provides a baseline to compare against, representing chance levels of schema word occurrence.

### Customization and Tips

- Adjusting the Number of Similar Words: You can change num_similar_words_to_use to include more or fewer words in the schema dictionaries.
- Reviewing Cue Words: After inspecting schema_dictionaries.csv, you may find that certain cue words don't produce the expected related words.
- Consider Changing the Cue: If a cue word doesn't capture the intended meaning (e.g., "stream" leading to Netflix-related words), replace it with a more appropriate word (e.g., "river" or "forest").
- Updating Stopwords: Add any irrelevant but frequent words to myStopwords to exclude them from analysis.


### Troubleshooting

- Slow Performance: The script may take a long time to run. This is normal.
- Cue Words Not Found: If a cue word is not in the GloVe vocabulary, choose an alternative word with a similar meaning. Verify that the cue word is correctly spelled and in lowercase.
- Error in Working with Data: Ensure that the data types are correct in your input .csv file. For example, ensure that Subject is a number rather than a string (e.g., 32 vs '32c4sx'); a quick check is sketched below.
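For example, a quick way to check your input before running the full script (a minimal sketch; it assumes transcriptions_all.csv is in your working directory):

```r
# Inspect the column types of the input file
story <- read.csv('transcriptions_all.csv')
str(story)                  # Subject and Trial should be numeric/integer; Transcript should be character
is.numeric(story$Subject)   # should return TRUE
```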
--------------------------------------------------------------------------------
/schemaNLP_clean.R:
--------------------------------------------------------------------------------
#---
# Author: Ruben van Genugten
# Title: "Automated Scoring of Schematic Content using Natural Language Processing"
# Code for use in Wynn et al. (2022)
# Updated on 11/27/24 to improve readability and usability
#---

# Data Setup:
# Input file: 'transcriptions_all.csv' with the following columns:
#   - Subject: Numeric identifier for the subject
#   - Trial: Numeric identifier for the trial
#   - Cue: The cue word or phrase
#   - Transcript: The text transcript to be analyzed


# Overview of the Code:
# - Load required libraries and GloVe data
# - Read in the data to score
# - Perform basic data cleaning, including removal of stopwords
# - Define a function to calculate similarities between words using GloVe embeddings
# - Use the similarity function to identify similar words for each cue (e.g., 'sand' for 'beach')
# - Process narratives. For each word in a narrative, identify whether it belongs to the schema based on similarity

# User Checks:
# 1. Rename cues if necessary. GloVe works with individual words, so replace multi-word cues (e.g., "thanksgiving dinner") with single words (e.g., "thanksgiving").
# 2. After running the code, check the word lists used (written out in 'schema_dictionaries.csv') to ensure the words align with the intended meaning.
#    - Sometimes you will need to change your cue word ('stream' -> 'river' or 'forest') to ensure you are capturing the anticipated meaning (e.g., 'stream' may produce a list of Netflix-related words instead).


#### ---- Set Working Directory ---- ####

# Set working directory to the location of this script
currentPath <- dirname(rstudioapi::getActiveDocumentContext()$path)
setwd(currentPath)

#### ---- Load in Packages ---- ####

# Install and load the 'pacman' package if not already installed
if (!require("pacman")) {
  install.packages("pacman")
  library('pacman')
}

# Load necessary packages using 'pacman'
pacman::p_load(
  openxlsx, dplyr, ggplot2, ggpubr, stringr, glue, qdap, #hablar,
  textstem, text2vec, qdapDictionaries, tm, wordcloud, lexicon,
  textclean, tidyverse, tidytext, caret
)

# Note: Conflicts may occur between packages (e.g., 'dplyr' and others).
# If unexpected errors arise, check for conflicting functions.

#### ---- Read in data ---- ####

# Read the transcriptions data (ensure the file is in the working directory);
# that is, place your data in the same folder as your code, or specify the full path
story <- read.csv('transcriptions_all.csv')

#### ----- Load GloVe Embedding Matrix ----- ####

# Load GloVe data (this may take a while).
# glove.Rdata should be in the same folder as the code, or specify the full path/location
load('glove.Rdata')
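
# Optional sanity check (a suggestion, not part of the original pipeline): glove.Rdata is
# expected to provide an object named 'embedding_matrix' with words as row names, as used
# below. If a cue is missing from the GloVe vocabulary, the similarity lookup will fail,
# so it can help to confirm that every cue (spelled correctly, in lowercase) is present.
# missing_cues <- setdiff(unique(as.character(story$Cue)), rownames(embedding_matrix))
# if (length(missing_cues) > 0) {
#   warning("Cues not found in the GloVe vocabulary: ", paste(missing_cues, collapse = ", "))
# }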

#### ---- Define Additional Stopwords to Remove ---- ####

# Stopwords are words that don't carry much meaning, or words that we don't want to count.
# Removing these reduces the noise in our analyses.

# Custom stopwords to exclude from analysis
myStopwords <- c(
  "like", "just", "can", "people", "around", "yeah", "see", "uh", "really", "um",
  "kind", "one", "lot", "it's", "i'm", "nice", "there's", "get", "time", "also",
  "know", "hear", "smell", "touch"
)

# Include cue words to prevent them from being counted as details
cueNames <- unique(as.character(story$Cue))
cueNames <- c(cueNames, tolower(cueNames)) # Include lowercase versions

# Combine with stopwords from various sources
# tm stopwords, snowball & SMART stopwords are already used by tidytext
quanteda_stopwords1 <- stopwords::stopwords(language = 'en', source = 'nltk')
quanteda_stopwords2 <- stopwords::stopwords(language = 'en', source = 'stopwords-iso')

# Load additional stopwords from the 'lexicon' package
data(sw_loughran_mcdonald_long)
lexicon_stopwords1 <- sw_loughran_mcdonald_long

# Merge all stopwords into one vector and remove duplicates
myStopwords <- unique(c(
  myStopwords, quanteda_stopwords1, quanteda_stopwords2,
  lexicon_stopwords1, cueNames
))

myStopwordsTibble <- tibble(word = myStopwords)

#### ----- Basic Data Cleaning ----- ####

# Cast all words to lowercase and strip additional whitespace (e.g., two spaces between words),
# then lemmatize, which involves turning words into their base form (e.g., 'cars' -> 'car').
# We are using the tidy text format (https://www.tidytextmining.com/) for further analysis.
# This format involves one token per row, where a token is a unit of text such as a word.
# This involves modifying the data format. Let's start:

# Add an index column to keep track of each narrative
story$index <- 1:nrow(story)

# Convert the story data frame to a tibble and rename 'Transcript' to 'text'
story_tibble <- tibble(story)
story_tibble <- dplyr::rename(story_tibble,
                              text = Transcript)

story_tibble <- story_tibble %>%
  mutate(
    Trial = as.integer(Trial),
    index = as.integer(index),
    Cue = as.factor(Cue),
    Subject = as.factor(Subject),
    text = as.character(text)
  )

# Split the text into one-word-per-row format (tokenize)
tidy_story <- story_tibble %>%
  unnest_tokens(word, text)

# Remove stopwords and then lemmatize words (reduce to base form; running -> run)
tidy_story_clean <- tidy_story %>%
  anti_join(get_stopwords()) %>%
  anti_join(myStopwordsTibble) %>%
  mutate(word_lemma = textstem::lemmatize_words(word)) # Automatically lowercases words

head(tidy_story_clean) # inspect the data

#### ----- Define function for similarity of words ----- ####

# Function to find the top N most similar words to a given word using GloVe embeddings
find_similar_words <- function(word, embedding_matrix, n = 5) {
  similarities <- sim2(
    embedding_matrix,
    embedding_matrix[word, , drop = FALSE],
    method = "cosine"
  )
  similarities[, 1] %>%
    sort(decreasing = TRUE) %>%
    head(n)
}
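
# Illustrative usage (optional; the neighbors you get depend on the embeddings):
# inspecting a handful of nearest neighbors is a quick way to confirm that the
# embedding matrix and a given cue behave as expected.
# find_similar_words("beach", embedding_matrix, n = 10)
# # returns a named numeric vector of cosine similarities, most similar first
# # (e.g., words like 'sand' and 'waves' for the cue 'beach')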


#### ----- Compute Similar Words for Each Cue ----- ####

# This is a preparatory step. For each cue, find the 50,000 most similar words.
# We do this so that we can later access the stored similar words.
# If we only want to use, say, the 10,000 most similar words, the functions below let us pare the list down later.
# This prep step means we don't have to re-compute a time-consuming step every time.

# Retrieve all unique cues
allCues <- unique(as.character(tidy_story_clean$Cue))

# Initialize a list to store the top similar words for each cue
cue_topSimilarities <- list()
numSimilarWords <- 50000 # Number of similar words to retrieve

# Find similar words for each cue
for (cue in allCues) {
  cue_topSimilarities[[cue]] <- find_similar_words(
    cue,
    embedding_matrix,
    numSimilarWords
  )
}

# Optionally save the computed similarities to avoid rerunning
# save(cue_topSimilarities, file = 'cue_topSimilarities.RData')

# Optionally load the computed similarities
# load('cue_topSimilarities.RData')

# Save the word lists so that we can examine them for sanity checking
cue_dictionaryWords <- lapply(cue_topSimilarities, names)
cue_dictionaryWords_df <- as.data.frame(cue_dictionaryWords)
write.csv(cue_dictionaryWords_df, 'schema_dictionaries.csv', row.names = FALSE)


#### ----- Define function for calculating schema scores ----- ####


# Function to compute the schema score for a given narrative based on a specific cue.
# It counts the number of words in the narrative that are among the top 'numWords' words most similar to the cue,
# and also returns the subset of the narrative containing those words (i.e., the schema words).

get_schema_score <- function(cue, narrative, numWords){
  # Look up the precomputed similar-word list for this cue and keep the top 'numWords' entries
  # (the exhaustive lists are 50,000 words long; numWords controls the dictionary size)
  mostSimilarWords <- names(cue_topSimilarities[[as.character(cue)]])
  mostSimilarWords <- mostSimilarWords[1:numWords]

  # Words that are both highly similar to the cue and present in the narrative
  # (equivalent to intersect(mostSimilarWords, as.character(narrative$word)))
  words_inNarrative <- mostSimilarWords[mostSimilarWords %in% as.character(narrative$word)]

  # Keep only the narrative rows whose word is in the schema word list, and count them
  narrativeDf_onlyTopXWords <- narrative[narrative$word %in% words_inNarrative, ]
  totalWords_withTopX <- nrow(narrativeDf_onlyTopXWords)

  # Return a list: [[1]] the count of matching words, [[2]] the data frame of matching words
  return(list(totalWords_withTopX, narrativeDf_onlyTopXWords))
}
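
# Illustrative usage (optional; assumes the objects above have been created):
# score a single narrative against its own cue with a 2,000-word dictionary,
# mirroring what the annotation loop below does for every narrative.
# one_story <- tidy_story_clean[tidy_story_clean$index == 1, ]
# get_schema_score(as.character(one_story$Cue[1]), one_story, 2000)
# # [[1]] is the number of schema words found; [[2]] contains the matching narrative rows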

# Function to compute the mismatch schema score using the other cues.
# Loop through all cues that are not the relevant cue (e.g., all cues except 'beach' for a beach narrative)
# and calculate schema scores with those mismatched dictionaries to establish a baseline chance level.

get_schema_mismatch_score <- function(cue, narrative, numWords){
  mismatchCues <- allCues[allCues != as.character(cue)]
  detailCounts <- c()
  for(myCue in mismatchCues){
    detailCounts <- c(detailCounts, get_schema_score(myCue, narrative, numWords)[[1]])
  }
  mean_misMatchDetails <- mean(detailCounts)
  return(mean_misMatchDetails)
}

#### ----- Annotate All Narratives ----- ####

# Prepare a data frame to store the results.
# Take the existing data frame and add empty columns to fill in below.
story_scores <- story
story_scores$nonStopword_wordCount <- NA
story_scores$SchemaWordsCount <- NA
story_scores$SchemaWordsIdentified <- NA
story_scores$SchemaMismatchCount <- NA

# How big are we going to make our word lists/dictionaries?
num_similar_words_to_use <- 2000

# Process each narrative to compute schema scores.
for(i in unique(tidy_story_clean$index)) {
  thisStory <- tidy_story_clean[tidy_story_clean$index == i, ]
  whichCue <- as.character(thisStory$Cue[1])

  schema_out <- get_schema_score(whichCue, thisStory, num_similar_words_to_use)
  story_scores[story_scores$index == i, "nonStopword_wordCount"] <- nrow(thisStory) # total number of words (excluding stopwords) in the narrative
  story_scores[story_scores$index == i, "SchemaWordsCount"] <- schema_out[[1]]
  story_scores[story_scores$index == i, "SchemaWordsIdentified"] <- schema_out[[2]]$word %>% paste(collapse = " ")
  story_scores[story_scores$index == i, "SchemaMismatchCount"] <- get_schema_mismatch_score(whichCue, thisStory, num_similar_words_to_use)
}

# Write the annotated narratives to a CSV file
story_scores
write.csv(story_scores, 'narratives_scores.csv')


--------------------------------------------------------------------------------