├── HowToLoadBotDetector.R
├── README.md
├── flagsuspects.R
├── 180816mTurkLowQualityResponseDetection.R
└── 180816mTurkLowQualityResponseDetection_Example.R


/HowToLoadBotDetector.R:
--------------------------------------------------------------------------------
 1 | source_github <- function(u) {
 2 |   # load package
 3 |   require(RCurl)
 4 |   
 5 |   # read script lines from website
 6 |   script <- getURL(u, ssl.verifypeer = FALSE)
 7 |   
 8 |   # parase lines and evaluate in the global environment
 9 |   eval(parse(text = script))
10 | }
11 | 
# Load the detection script straight from the repository; base source()
# accepts a URL as its `file` argument.
source(
  file = "https://raw.githubusercontent.com/SICLab/detecting-bots/master/180816mTurkLowQualityResponseDetection.R"
)
13 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # bot.detector() Version 1.3
 2 | This is a function designed for Qualtrics surveys to detect responses that may be from bots and survey-farmers.
 3 | This function creates a vector that you can save as a new column in your dataset that assigns a "score" to each response. 
 4 | The score is intended to count the number of features in each response that are associated with bots and survey-farmers. 
 5 | We recommend that you examine responses with high scores manually before excluding them. This function cannot replace the human eye- it can only guide it. 
 6 | Suggestions for new features to add to the function should be emailed to jprims2@uic.edu
 7 | 
 8 | Suggested Citation: 
 9 | Prims, J., Motyl, M. (2018). A tool for detecting low quality data in internet research. GitHub: https://github.com/SICLab/detecting-bots
10 | 
11 | Files: 
12 |  - Function: 180816mTurkLowQualityResponseDetection.R
13 |  - Example: 180816mTurkLowQualityResponseDetection_Example.R
14 |  - Load function from GitHub: HowToLoadBotDetector.R
15 |  
16 | This function assigns a score to each response. The higher the score, the more features associated with bots or survey-farmers. 
17 | It is best to examine each response with a high score manually. This function cannot replace the human eye- it can only guide it. 
18 | 
 19 | bot.detector(Latitude, Longitude, Time, Threshold = .01, Comments, Comments2, Comments3)
 20 | 
 21 | Function arguments: 
 22 |   1. Latitude - A column with latitude coordinates for your respondent. 
 23 |   2. Longitude - A column with longitude coordinates for your respondent. 
 24 |   3. Time - An optional column with Qualtrics-formatted date and time stamps. (MM/DD/YYYY HH:MM)
 25 |   4. Threshold -  If a single latitude/longitude pair exceeds this proportion of the sample, it is considered suspicious. (Default is .01.)
26 |   5. Comments - An optional free-response field. 
27 |   6. Comments2 - A second, optional free-response field. 
28 |   7. Comments3 - A third, optional free-response field. 
29 | 
30 | Scoring: 
31 |   Scores can go as high as 8 if you have three free-response fields. 
32 |   - Having a latitude and longitude that appears in more than the specified threshold adds 1 point. (Default threshold is .01.)
 33 |   - Having a duplicate latitude and longitude, AND responding within 10 minutes of the other responses from the same latitude and longitude adds 1 point. (I recommend using the StartedDate column, but any column in Qualtrics date-time format [MM/DD/YYYY HH:MM] will do.)
34 |   - Comments consisting solely of phrases typically attributed to bots/duplicate responses/survey farmers adds 1 point. (Send new suggestions for phrases to jprims2@uic.edu.)
 35 |   - Duplicate comments that other respondents have already made in response to the same question add 1 point. 
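 36 |   - Comments containing the word "very" add 1 point.  (See https://www.maxhuibai.com/blog/a-proposed-procedure-for-testing-the-evidentiary-value-of-responses-from-duplicated-gps-sources-comments-invited) Note: this check does not appear in 180816mTurkLowQualityResponseDetection.R, and the maximum scores listed below do not include it.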
37 |   
38 |   - Max score for only latitude and longitude: 1
39 |   - Max score for latitude, longitude, and time: 2
40 |   - Max score for latitude, longitude, and one free-response: 3
41 |   - Max score for latitude, longitude, time, and one free-response: 4
42 |   - Max score for latitude, longitude, and two free-responses: 5
43 |   - Max score for latitude, longitude, time, and two free-responses: 6
44 |   - Max score for latitude, longitude, and three free-responses: 7
45 |   - Max score for latitude, longitude, time, and three free-responses: 8
46 |   
47 | Upcoming changes: 
 48 |  Adding a built-in list of suspicious locations using the method described in this post: https://www.facebook.com/groups/psychmap/permalink/670236310019961/ (Suggested by @NivReggev)
49 | 


--------------------------------------------------------------------------------
/flagsuspects.R:
--------------------------------------------------------------------------------
  1 | ################################
  2 | ### VSP & Geolocation Method ###
  3 | ################################
  4 | 
  5 |   # Code by JP Prims
  6 | 
  7 | ## Format: 
  8 |   # flag.suspects(Latitude, Longitude, IP)
  9 | 
 10 | ## Arguments: 
 11 |   # Latitude: A column of latitudes. (Optional)
 12 |   # Longitude: A column of longitudes. (Optional)
 13 |   # IP: A column of IP addresses. (Optional)
 14 | 
 15 |   # You must enter either LATITUDE AND LONGITUDE or IP for the function to run. 
 16 | 
 17 | ## Notes: 
 18 |   # If you enter ONLY LATITUDE AND LONGITUDE: 
 19 |     # The function returns a 1 if that latitude and longitude is suspicious, and a 0 if it is not. 
 20 | 
 21 |   # If you enter ONLY IP ADDRESS: 
 22 |     # The function returns a 1 if the IP address is from a suspicious ISP, and a 0 if it is not. 
 23 | 
 24 | flag.suspects <- function(Latitude, Longitude, IP){
 25 |   
 26 |   # Makes sure that necessary packages are installed and loaded
 27 |   ifelse(!"ipapi" %in% installed.packages(), devtools::install_github("hrbrmstr/ipapi"), library(ipapi))
 28 |   require(maps)
 29 |   require(leaflet)
 30 |   ifelse(!"package:ipapi" %in% search(), library(ipapi), NA) # Only necessary if you're installing ipapi for the first time. 
 31 |   
 32 |   # Reading in list of suspicious locations
 33 |   urlfile <- 'https://raw.githubusercontent.com/jprims/flag.suspects/master/suspiciousthings.csv'
 34 |   datsus<-read.csv(urlfile)
 35 |   
 36 |   if(missing(Latitude)) {
 37 |     NULL
 38 |   } else {
 39 |     
 40 |     
 41 |     ##### Duplicate GPS coordinates #####
 42 |     # Creating empty vector
 43 |     bot.susp <- bot.susp <- rep(0, length(Latitude))
 44 |     
 45 |     # Creating an object combining those two into one column 
 46 |     latlong <- ifelse(!is.na(Latitude), paste(Latitude,Longitude), NA)
 47 |     
 48 |     
 49 |     # Creating a list of "bad" GPS locations
 50 |     badgps <- ifelse(!is.na(Latitude), paste(datsus$badlat, datsus$badlong), NA)
 51 |     
 52 |     # This checks if the coordinates are duplicated. If so, it adds a point. 
 53 |     bot.susp <- ifelse(!is.na(latlong), ifelse(latlong %in% badgps, bot.susp + 1,  bot.susp), bot.susp) 
 54 |     
 55 |   }
 56 |   
 57 |   #### ISP check ####
 58 |   # This part makes this argument optional for the function. 
 59 |   if(missing(IP)) {
 60 |     NULL
 61 |   } else {
 62 |     if(missing(Latitude)) {
 63 |       # Creating empty vector
 64 |       bot.susp <- bot.susp <- rep(0, length(IP))
 65 |     } else {
 66 |       NULL
 67 |     }
 68 |     
 69 |     # First, we create a list of suspicious VSPs. 
 70 |     # vsps <- c("B2 Net Solutions Inc.", "Cogent Communications", "ColoCrossing","Corporate Colocation Inc.",
 71 |     #           "Hostwinds LLC.", "Joe's Datacenter LLC", "Kamatera Inc.", "DigitalOcean LLC", "SECURED SERVERS LLC", "NA", 
 72 |     #           "Leaseweb USA Inc.", "QuadraNet Inc", "Total Server Solutions L.L.C.", "ZSCALER INC.", "SoftLayer Technologies Inc.",
 73 |     #           "PODOJIL CONTRACTING  S.A.", "Nobis Technology Group  LLC", "Linode, LLC", "KVCHOSTING.COM LLC",
 74 |     #           "tzulo  inc.", "Micfo  LLC.", "Airtek Solutions C.A.", "Contina")
 75 |     # 
 76 |     # I'd like to remove punctuation, and make all of the suspicious ISPs lowercase, just to make matches easier.
 77 |     vsps <- tolower(datsus$badisp)
 78 |     vsps <- gsub("[[:punct:]]", "", vsps)
 79 |     
 80 |     # Now, let's get the isps. 
 81 |     locations <- geolocate(IP)
 82 |     
 83 |     # Cleaning that up too, so it's lowercase, and missing punctuation. 
 84 |     locations$isp <- tolower(locations$isp)
 85 |     locations$isp <- gsub("[[:punct:]]", "", locations$isp)
 86 |     
 87 |     # Now, returning 1 or 0. 
 88 |     bot.susp <- ifelse(!is.na(locations$isp), ifelse(pmatch(locations$isp, vsps, nomatch = 0, duplicates.ok = TRUE) > 0, bot.susp + 1,  bot.susp), bot.susp)
 89 |   } 
 90 |   
 91 |   # this sets the threshold for a bot warning, depending if they entered the IP argument or not. 
 92 |   if(missing(IP)) {
 93 |     outputs <- ifelse(bot.susp == 1, 1, 0)
 94 |   } else if(missing(Latitude)) {
 95 |     outputs <- ifelse(bot.susp == 1, 1, 0)
 96 |   }  else{
 97 |     outputs <- ifelse(bot.susp >= 1, 1, 0)
 98 |   }  
 99 |   
100 |   return(outputs)
101 |   
102 | } 
103 | 


--------------------------------------------------------------------------------
/180816mTurkLowQualityResponseDetection.R:
--------------------------------------------------------------------------------
  1 | ####################################
  2 | #### Testing mTurk Bot Function ####
  3 | ####################################
  4 | 
  5 | # This file defines bot.detector(), a function designed to identify low-quality mTurk responses. 
  6 | # This function assigns a score to each response. The higher the score, the higher the probability that the respondent is a bot or survey-farmer. 
  7 | # It is best to examine each response with a high score manually. This function cannot replace the human eye; it can only guide it. 
  8 | 
  9 | # Function arguments: 
 10 |   # Latitude - A column with latitude coordinates for your respondent. 
 11 |   # Longitude - A column with longitude coordinates for your respondent. 
 12 |   # Time - An optional column with Qualtrics-formatted date and time stamps. 
 13 |   # Threshold - Proportion of the sample above which a repeated latitude/longitude pair counts as suspicious. (Default is .01.)
 14 |   # Comments - An optional free-response field. 
 15 |   # Comments2 - A second, optional free-response field. 
 16 |   # Comments3 - A third, optional free-response field. 
 17 | 
 18 | # Scoring: 
 19 |   # Scores can go as high as 8 if you have a time column and three free-response fields. 
 20 |   # Having a latitude and longitude that appears in more than 1% of responses adds 1 point. (I recommend changing the percentage depending on the size of your dataset.)
 21 |   # Having a duplicate latitude and longitude, AND responding within 10 minutes of the other responses from the same latitude and longitude adds 1 point. (I recommend StartedDate.)
 22 |   # Comments consisting solely of phrases typically attributed to bots/duplicate responses/survey farmers adds 1 point. (Send new suggestions for phrases to jprims2@uic.edu.)
 23 |   # Duplicate comments that other respondants have already made in response to the same question add 1 point. 
 24 |   # Max score for only latitude and longitude: 1
 25 |   # Max score for latitude, longitude, and time: 2
 26 |   # Max score for latitude, longitude, and one free-response: 3
 27 |   # Max score for latitude, longitude, time, and one free-response: 4
 28 |   # Max score for latitude, longitude, and two free-responses: 5
 29 |   # Max score for latitude, longitude, time, and two free-responses: 6
 30 |   # Max score for latitude, longitude, and three free-responses: 7
 31 |   # Max score for latitude, longitude, time, and three free-responses: 8
 32 | 
 33 | 
 34 | # Loading in the function
 35 |   
 36 | bot.detector <- function(Latitude, Longitude, Time,  Threshold = .01, Comments, Comments2, Comments3){
 37 |   
 38 |   # This loads in required packages. (Mostly for the Time argument.)
 39 |   require(tidyr)
 40 |   require(dplyr)
 41 |   require(zoo)
 42 |   
 43 |   
 44 |   # This creates a new column to store our bot suspicion score. 
 45 |   bot.susp <- rep(0, length(Latitude))
 46 |   
 47 |   # First, let's work on detecting if there are some coordinates that appear in more than 1% of the a. 
 48 |   # With Qualtrics, the columns we want to look at are Latitude and Longitude. 
 49 |   
 50 |   # Creating an object combining those two into one column 
 51 |   latlong <- paste(Latitude,Longitude)
 52 |   
 53 |   # This counts the number of times each coordinate appears in the aset. 
 54 |   llcount <- summary(as.factor(latlong))
 55 |   
 56 |   # This determines if a certain latitude and longitude appears in more than 1% of responses.
 57 |   lllots <- llcount > length(Latitude) * Threshold # You can change the .01 to change the % of the sample. 
 58 |   
 59 |   # Pulls out the coordinates that make up more than 1% of the sample.   
 60 |   llmany <- names(lllots[lllots == TRUE]) 
 61 |   
 62 |   # Adds a 1 to the bot suspicion column if the coordinates appear in more than 1% of the sample
 63 |   bot.susp <- ifelse(latlong %in% llmany, 1,  0)
 64 |   
 65 |   # Now, let's check if their free response contains "good" or "NICE!"
 66 |   suswords <- c("good","NICE!")
 67 |   
 68 |   # Transform vector of phrases to lowercase
 69 |   suswords <- tolower(suswords) # See https://www.maxhuibai.com/blog/evidence-that-responses-from-repeating-gps-are-random for illustration
 70 |   
 71 |   
 72 |   # Check if person specified a column of times. If so, run.
 73 |   if(missing(Time)) {
 74 |     NULL
 75 |   } else {
 76 |     # First, converting time to a format R can use. Using the typical Qualtrics organization.
 77 |     Time <- as.POSIXct(Time, tz = "", format = "%m/%d/%Y %H:%M", optional = FALSE)
 78 |     Time <- as.numeric(Time)
 79 |     
 80 |     # I'd like to make a dataframe so I can filter things.
 81 |     tempdat <- data.frame(latlong, Time)
 82 |     # Now, adding an ID
 83 |     tempdat$id <- 1:(nrow(tempdat))
 84 |     
 85 |     # This filters it so the dataframe only keeps rows with suspicious coordinates, and moves it to long format.
 86 |     tempdatw <- spread(subset(tempdat, tempdat$latlong %in% llmany), latlong, Time)
 87 |     
 88 |     # Fill in NAs with 0s
 89 |     tempdatw[is.na(tempdatw)] <- 0
 90 |     
 91 |     # Check if time difference between a duplicate and the previous duplicate response is between 1 and 600 seconds (10 minutes)
 92 |     # Code for 1 duplicate and more duplicates
 93 |     ifelse(ncol(tempdatw) == 2,
 94 |            # If one repeating coordinate
 95 |            ifelse(abs(tempdatw[,2] - lag(tempdatw[,2], n = 1L)) < 600 & abs(tempdatw[,2] - lag(tempdatw[,2], n = 1L)) > 1, TRUE, FALSE),
 96 |            # If multiple coordinates
 97 |            tempdatw[,-1] <- lapply(tempdatw[,-1], function(x) ifelse(abs(x - lag(x, n = 1L)) < 600 & abs(x - lag(x, n = 1L)) > 1, TRUE, FALSE))
 98 |     )
 99 |     
100 |     # I think I need to sum the two columns into one. 
101 |     ifelse(ncol(tempdatw) == 2, 
102 |            tempdatw$sum <- tempdatw[,2],
103 |            tempdatw$sum <- rowSums(tempdatw[,-1]))
104 |     
105 |     # Putting it back in long format, so I can merge it back in with our temporary data frame
106 |     tempdatl <- tempdatw[,c("id","sum")]
107 |     
108 |     # Merge back in to tempdat
109 |     
110 |     findat <- merge(tempdat, tempdatl[,c("id","sum")], by = "id", all.x = TRUE)
111 |     
112 |     findat$sum <- ifelse(is.na(findat$sum), 0, findat$sum)
113 |     
114 |     
115 |     # Now, let's add that suspicion!
116 |     
117 |     bot.susp <- ifelse(findat$sum >= 1, bot.susp + 1, bot.susp) 
118 |   }
119 |   
120 |   
121 |   # Check if person specified a free-response. If so, run. 
122 |   if(missing(Comments)) {
123 |     NULL
124 |   } else {
125 |     
126 |     # Adds 1 to the bot suspicion column if suspicous phrases appear in the responses.
127 |     
128 |     # Encoding
129 |     Comments <- enc2utf8(as.character(Comments))
130 |     
131 |     # Transform comment vectors to lowercase
132 |     Comments <- tolower(Comments)
133 |     
134 |     # Putting the arguments in this order makes sure it won't flag comments that contain the word "good," but also have other content.
135 |     bot.susp <- ifelse(Comments %in% suswords, bot.susp + 1, bot.susp)
136 |     
137 |     # Now, check if any free responses are 100% matches to other free responses. 
138 |     bot.susp <- ifelse(Comments%in%Comments[which(duplicated(Comments, incomparables = c('',NA)))], bot.susp + 1, bot.susp)
139 |   }
140 |   
141 |   # Check if person specified second free-response. If so, run. 
142 |   if(missing(Comments2)) {
143 |     NULL
144 |   } else {
145 |     # Encoding
146 |     Comments2 <- enc2utf8(as.character(Comments2))
147 |     
148 |     # Transform comment vectors to lowercase
149 |     Comments2 <- tolower(Comments2)
150 |     
151 |     # Adds 1 to the bot suspicion column if suspicous phrases appear in the responses.
152 |     # Putting the arguments in this order makes sure it won't flag comments that contain the word "good," but also have other content.
153 |     bot.susp <- ifelse(Comments2 %in% suswords, bot.susp + 1, bot.susp)
154 |     
155 |     # Now, check if any free responses are 100% matches to other free responses. 
156 |     bot.susp <- ifelse(Comments2%in%Comments2[which(duplicated(Comments2, incomparables = c('',NA)))], bot.susp + 1, bot.susp)
157 |   }
158 |   
159 |   # Check if person specified third free-response. If so, run. 
160 |   if(missing(Comments3)) {
161 |     NULL
162 |   } else {
163 |     
164 |     # Encoding
165 |     Comments3 <- enc2utf8(as.character(Comments3))
166 |     
167 |     # Transform comment vectors to lowercase
168 |     Comments3 <- tolower(Comments3)
169 |     
170 |     # Adds 1 to the bot suspicion column if suspicous phrases appear in the responses.
171 |     # Putting the arguments in this order makes sure it won't flag comments that contain the word "good," but also have other content.
172 |     bot.susp <- ifelse(Comments3 %in% suswords, bot.susp + 1, bot.susp)
173 |     
174 |     # Now, check if any free responses are 100% matches to other free responses. 
175 |     bot.susp <- ifelse(Comments3%in%Comments3[which(duplicated(Comments3, incomparables = c('',NA)))], bot.susp + 1, bot.susp)
176 |   }
177 |   
178 |   # Outputting results
179 |   return(bot.susp)
180 |   
181 | }
182 | 
183 | 
184 | 
185 | 


--------------------------------------------------------------------------------
/180816mTurkLowQualityResponseDetection_Example.R:
--------------------------------------------------------------------------------
  1 | ####################################
  2 | #### Testing mTurk Bot Function ####
  3 | ####################################
  4 | 
  5 | # This is an example of how to use this function. This function is designed to identify low-quality mTurk responses. 
  6 | # This function assigns a score to each response. The higher the score, the higher the probability that the respondent is a bot or survey-farmer. 
  7 | # It is best to examine each response with a high score manually. This function cannot replace the human eye- it can only guide it. 
  8 | 
  9 | # Function arguments: 
 10 |   # Latitude - A column with latitude coordinates for your respondent. 
 11 |   # Longitude - A column with longitude coordinates for your respondent. 
 12 |   # Time - An optional column with Qualtrics-formatted date and time stamps. 
 13 |   # Threshold - Proportion of the sample above which a repeated latitude/longitude pair counts as suspicious. (Default is .01.)
 14 |   # Comments - An optional free-response field. 
 15 |   # Comments2 - A second, optional free-response field. 
 16 |   # Comments3 - A third, optional free-response field. 
 17 | 
 18 | # Scoring: 
 19 |   # Scores can go as high as 8 if you have a time column and three free-response fields. 
 20 |   # Having a latitude and longitude that appears in more than 1% of responses adds 1 point. (I recommend changing the percentage depending on the size of your dataset.)
 21 |   # Having a duplicate latitude and longitude, AND responding within 10 minutes of the other responses from the same latitude and longitude adds 1 point. (I recommend StartedDate.)
 22 |   # Comments consisting solely of phrases typically attributed to bots/duplicate responses/survey farmers adds 1 point. (Send new suggestions for phrases to jprims2@uic.edu.)
 23 |   # Duplicate comments that other respondants have already made in response to the same question add 1 point. 
 24 |   # Max score for only latitude and longitude: 1
 25 |   # Max score for latitude, longitude, and time: 2
 26 |   # Max score for latitude, longitude, and one free-response: 3
 27 |   # Max score for latitude, longitude, time, and one free-response: 4
 28 |   # Max score for latitude, longitude, and two free-responses: 5
 29 |   # Max score for latitude, longitude, time, and two free-responses: 6
 30 |   # Max score for latitude, longitude, and three free-responses: 7
 31 |   # Max score for latitude, longitude, time, and three free-responses: 8
 32 | 
 33 | # Creating a dataset with suspected bots. 
 34 |   LocationLatitude <-  c(1:100, 9, 9, 10, 10)
 35 |   LocationLongitude <-  c(-1:-100, 9, 9, 10, 10)
 36 |   time <- c(seq(c(ISOdate(2018,8,17)), by = "10 min", length.out = 100), seq(c(ISOdate(2018,8,17)), by = "1 min", length.out = 4))
 37 |   comments <- c(rep("blep",92),"good","NICE!", "yeet","yeet","Yeet","good","blah","boop","cheese","jumprope","good","NICE!")
 38 |   comments2 <- c(rep("boom", 92), "hey","NICE!","zoop","yeet","loop","good","heck","doggo","jumprope","nominal","good","NICE!")
 39 |   dat <- data.frame(LocationLatitude, LocationLongitude, time, comments, comments2)
 40 |   
 41 |   # Previewing dataset
 42 |   head(dat)
 43 | 
 44 |   bot.detector <- function(Latitude, Longitude, Time,  Threshold = .01, Comments, Comments2, Comments3){
 45 |     
 46 |     # This loads in required packages. (Mostly for the Time argument.)
 47 |     require(tidyr)
 48 |     require(dplyr)
 49 |     require(zoo)
 50 |     
 51 |     
 52 |     # This creates a new column to store our bot suspicion score. 
 53 |     bot.susp <- rep(0, length(Latitude))
 54 |     
 55 |     # First, let's work on detecting if there are some coordinates that appear in more than 1% of the a. 
 56 |     # With Qualtrics, the columns we want to look at are Latitude and Longitude. 
 57 |     
 58 |     # Creating an object combining those two into one column 
 59 |     latlong <- paste(Latitude,Longitude)
 60 |     
 61 |     # This counts the number of times each coordinate appears in the aset. 
 62 |     llcount <- summary(as.factor(latlong))
 63 |     
 64 |     # This determines if a certain latitude and longitude appears in more than 1% of responses.
 65 |     lllots <- llcount > length(Latitude) * Threshold # You can change the .01 to change the % of the sample. 
 66 |     
 67 |     # Pulls out the coordinates that make up more than 1% of the sample.   
 68 |     llmany <- names(lllots[lllots == TRUE]) 
 69 |     
 70 |     # Adds a 1 to the bot suspicion column if the coordinates appear in more than 1% of the sample
 71 |     bot.susp <- ifelse(latlong %in% llmany, 1,  0)
 72 |     
 73 |     # Now, let's check if their free response contains "good" or "NICE!"
 74 |     suswords <- c("good","NICE!")
 75 |     
 76 |     # Transform vector of phrases to lowercase
 77 |     suswords <- tolower(suswords) # See https://www.maxhuibai.com/blog/evidence-that-responses-from-repeating-gps-are-random for illustration
 78 |     
 79 |     
 80 |     # Check if person specified a column of times. If so, run.
 81 |     if(missing(Time)) {
 82 |       NULL
 83 |     } else {
 84 |       # First, converting time to a format R can use. Using the typical Qualtrics organization.
 85 |       Time <- as.POSIXct(Time, tz = "", format = "%m/%d/%Y %H:%M", optional = FALSE)
 86 |       Time <- as.numeric(Time)
 87 |       
 88 |       # I'd like to make a dataframe so I can filter things.
 89 |       tempdat <- data.frame(latlong, Time)
 90 |       # Now, adding an ID
 91 |       tempdat$id <- 1:(nrow(tempdat))
 92 |       
 93 |       # This filters it so the dataframe only keeps rows with suspicious coordinates, and moves it to long format.
 94 |       tempdatw <- spread(subset(tempdat, tempdat$latlong %in% llmany), latlong, Time)
 95 |       
 96 |       # Fill in NAs with 0s
 97 |       tempdatw[is.na(tempdatw)] <- 0
 98 |       
 99 |       # Check if time difference between a duplicate and the previous duplicate response is between 1 and 600 seconds (10 minutes)
100 |       # Code for 1 duplicate and more duplicates
101 |       ifelse(ncol(tempdatw) == 2,
102 |              # If one repeating coordinate
103 |              ifelse(abs(tempdatw[,2] - lag(tempdatw[,2], n = 1L)) < 600 & abs(tempdatw[,2] - lag(tempdatw[,2], n = 1L)) > 1, TRUE, FALSE),
104 |              # If multiple coordinates
105 |              tempdatw[,-1] <- lapply(tempdatw[,-1], function(x) ifelse(abs(x - lag(x, n = 1L)) < 600 & abs(x - lag(x, n = 1L)) > 1, TRUE, FALSE))
106 |       )
107 |       
108 |       # I think I need to sum the two columns into one. 
109 |       ifelse(ncol(tempdatw) == 2, 
110 |              tempdatw$sum <- tempdatw[,2],
111 |              tempdatw$sum <- rowSums(tempdatw[,-1]))
112 |       
113 |       # Putting it back in long format, so I can merge it back in with our temporary data frame
114 |       tempdatl <- tempdatw[,c("id","sum")]
115 |       
116 |       # Merge back in to tempdat
117 |       
118 |       findat <- merge(tempdat, tempdatl[,c("id","sum")], by = "id", all.x = TRUE)
119 |       
120 |       findat$sum <- ifelse(is.na(findat$sum), 0, findat$sum)
121 |       
122 |       
123 |       # Now, let's add that suspicion!
124 |       
125 |       bot.susp <- ifelse(findat$sum >= 1, bot.susp + 1, bot.susp) 
126 |     }
127 |     
128 |     
129 |     # Check if person specified a free-response. If so, run. 
130 |     if(missing(Comments)) {
131 |       NULL
132 |     } else {
133 |       
134 |       # Adds 1 to the bot suspicion column if suspicous phrases appear in the responses.
135 |       
136 |       # Encoding
137 |       Comments <- enc2utf8(as.character(Comments))
138 |       
139 |       # Transform comment vectors to lowercase
140 |       Comments <- tolower(Comments)
141 |       
142 |       # Putting the arguments in this order makes sure it won't flag comments that contain the word "good," but also have other content.
143 |       bot.susp <- ifelse(Comments %in% suswords, bot.susp + 1, bot.susp)
144 |       
145 |       # Now, check if any free responses are 100% matches to other free responses. 
146 |       bot.susp <- ifelse(Comments%in%Comments[which(duplicated(Comments, incomparables = c('',NA)))], bot.susp + 1, bot.susp)
147 |     }
148 |     
149 |     # Check if person specified second free-response. If so, run. 
150 |     if(missing(Comments2)) {
151 |       NULL
152 |     } else {
153 |       # Encoding
154 |       Comments2 <- enc2utf8(as.character(Comments2))
155 |       
156 |       # Transform comment vectors to lowercase
157 |       Comments2 <- tolower(Comments2)
158 |       
159 |       # Adds 1 to the bot suspicion column if suspicous phrases appear in the responses.
160 |       # Putting the arguments in this order makes sure it won't flag comments that contain the word "good," but also have other content.
161 |       bot.susp <- ifelse(Comments2 %in% suswords, bot.susp + 1, bot.susp)
162 |       
163 |       # Now, check if any free responses are 100% matches to other free responses. 
164 |       bot.susp <- ifelse(Comments2%in%Comments2[which(duplicated(Comments2, incomparables = c('',NA)))], bot.susp + 1, bot.susp)
165 |     }
166 |     
167 |     # Check if person specified third free-response. If so, run. 
168 |     if(missing(Comments3)) {
169 |       NULL
170 |     } else {
171 |       
172 |       # Encoding
173 |       Comments3 <- enc2utf8(as.character(Comments3))
174 |       
175 |       # Transform comment vectors to lowercase
176 |       Comments3 <- tolower(Comments3)
177 |       
178 |       # Adds 1 to the bot suspicion column if suspicous phrases appear in the responses.
179 |       # Putting the arguments in this order makes sure it won't flag comments that contain the word "good," but also have other content.
180 |       bot.susp <- ifelse(Comments3 %in% suswords, bot.susp + 1, bot.susp)
181 |       
182 |       # Now, check if any free responses are 100% matches to other free responses. 
183 |       bot.susp <- ifelse(Comments3%in%Comments3[which(duplicated(Comments3, incomparables = c('',NA)))], bot.susp + 1, bot.susp)
184 |     }
185 |     
186 |     # Outputting results
187 |     return(bot.susp)
188 |     
189 |   }
190 |   
191 |   
192 | 
193 | 
194 | # Testing the function
195 |   
196 |  dat$bot.susp <- bot.detector(dat$LocationLatitude, dat$LocationLongitude, Threshold = .01, Time = dat$time, Comments = dat$comments, Comments2 = dat$comments2)
197 |   
198 |  summary(dat$bot.susp)
199 |  


--------------------------------------------------------------------------------