├── Class Projects
│   ├── Code for Class.R
│   ├── Collecting Data for Github Project.R
│   └── Sankey.html
├── Class R Code
│   ├── Class #1 R Code.R
│   ├── Class #2 R Code.R
│   ├── Class #3 R Code.R
│   ├── Class #4 R Code.R
│   ├── Class #5 R Code.R
│   ├── Class #6 R Code.R
│   ├── Class #7 R Code.R
│   ├── Class #8 R Code.R
│   └── Class #9 R Code.R
├── Class Slides
│   ├── Class #1 Slides.html
│   ├── Class #2 Slides.html
│   ├── Class #3 Slides.html
│   ├── Class #4 Slides.html
│   ├── Class #5 Slides.html
│   ├── Class #6 Slides.html
│   ├── Class #7 Slides.html
│   ├── Class #8 Slides.html
│   └── Class #9.Slides.html
└── README.md

/Class Projects/Code for Class.R:
--------------------------------------------------------------------------------
 1 | 
 2 | # read in document from Google Docs
 3 | 
 4 | our_interests<-read.csv("https://docs.google.com/spreadsheets/d/1zIeZ-9fbnCM1AQ3vt_OT8Kqwdqup2VA90b2m0jnkzkM/pub?gid=0&single=true&output=csv", row.names=1)
 5 | 
 6 | # cluster our interests
 7 | 
 8 | 
 9 | # Ward Hierarchical Clustering
10 | # create distance matrix
11 | distance_matrix <- dist(our_interests, method = "euclidean")
12 | fit <- hclust(distance_matrix, method="ward.D") # note: "ward" was renamed "ward.D" in newer versions of R (see also "ward.D2")
13 | # display dendrogram
14 | plot(fit)
15 | groups <- cutree(fit, k=5) # cut tree into 5 clusters
16 | # draw dendrogram with red borders around the 5 clusters
17 | rect.hclust(fit, k=5, border="red")
18 | 
19 | 
20 | # Non-hierarchical cluster analysis
21 | 
22 | kmeans_clusters <- kmeans(our_interests, 5)
23 | library(cluster)
24 | clusplot(our_interests, kmeans_clusters$cluster, color=TRUE, shade=TRUE,
25 |          labels=2, lines=0)
26 | 
27 | 
28 | # Now where are the good puzzles?
29 | 
30 | # http://www.unc.edu/~ncaren/cite_network_full/cites.html
31 | 
--------------------------------------------------------------------------------
/Class Projects/Collecting Data for Github Project.R:
--------------------------------------------------------------------------------
1 | # Collecting data for group project
2 | 
3 | print("hello world")
4 | 
--------------------------------------------------------------------------------
/Class R Code/Class #1 R Code.R:
--------------------------------------------------------------------------------
 1 | # SCRIPT FOR CLASS #1, COMPUTATIONAL SOCIOLOGY
 2 | # Instructor: Chris Bail
 3 | # Copyright, Chris Bail
 4 | 
 5 | 
 6 | # GETTING STARTED WITH YOUR WORKING DIRECTORY
 7 | 
 8 | # Setting Your Working Directory
 9 | 
10 | # First let's identify your "working directory," or the
11 | # place where the files you want to work with are located.
12 | # At first they were online in our class's Dropbox folder,
13 | # but you have since downloaded them onto your computer.
14 | # Identifying the working directory is important because
15 | # you will need to know it in order to load files, import
16 | # data, and export graphs or other types of analysis.
17 | 
18 | # In order to identify your working directory, highlight
19 | # the line below and then press "Return" while holding down
20 | # "Control." This tells RStudio that you want to "run" or
21 | # execute whatever line you are working on. You can also
22 | # use the "Run" button in the upper right-hand side of this
23 | # pane of RStudio.
24 | 
25 | getwd()
26 | 
27 | # You should now see the output of this command below. By
28 | # default, R sets the working directory to the "home" folder
29 | # on your computer, or the folder that contains the file you
30 | # double clicked on.
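# (An optional aside, not part of the original class script: it can be handy
# to save the default working directory in a variable before you change it,
# so you can return to it later. The object name below is just illustrative.)

starting_directory <- getwd()
# ... change the working directory and do some work ...
# setwd(starting_directory)   # run this line to go back to where you started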
31 | 32 | # Often you will want to change the working directory, either 33 | # because you want to work with data in a new folder, or 34 | # because you want to tell R to save your work to a folder 35 | # that is more convenient for your work flow 36 | 37 | # the command below will set your working directory to be 38 | # your desktop 39 | 40 | setwd("~/Desktop") 41 | 42 | # The ~ sign here replaces the more detailed name of your 43 | # computer for example, if I were to use the complete name 44 | # of my desktop folder is: setwd("/Users/christopherandrewbail/Desktop) 45 | # I am going to set my home folder as follows: 46 | 47 | setwd("/Users/christopherandrewbail/Desktop/Dropbox/ODUM R COURSES/Intro to R Class Dropbox/") 48 | 49 | 50 | # Next, let's take a look at what documents are in your 51 | # home folder. 52 | 53 | list.files() 54 | 55 | # Basic Operations in R 56 | 57 | # Perhaps the most basic thing one can do is use 58 | # R as a calculator 59 | 60 | 1+1 61 | 62 | # Now let's create our first object or variable in R 63 | # To do this, you need to use the "<-" operator 64 | 65 | my_number<-2 66 | 67 | # we have now created a numeric variable whose value 68 | # is 2. Note that you can also use the "=" sign if you 69 | # prefer (my_number=2). 70 | 71 | # Notice in the top right hand pane of Rstudio there is 72 | # now a value for my_number. 73 | 74 | # now lets try some basic operations 75 | 2*my_number 76 | 2+my_number 77 | 2-my_number 78 | my_number/3 79 | my_number^3 80 | 81 | # if we want to store the results of these basic 82 | # operations, we could use the "<-" operator again 83 | 84 | my_new_number<-2*my_number 85 | 86 | # when naming variables or objects in r, try to 87 | # avoid terms that may confuse r because they are 88 | # similar to commands. For example, don't name a 89 | # variable "mean" or "median." Also, keep in mind 90 | # that R is case sensitive. If one letter is 91 | # accidentally capitalized, your command won't 92 | # work. 93 | 94 | # We can also create character or "string" variables 95 | # by using either double or single quotation marks. 96 | 97 | my_name<-"Georg Simmel" 98 | 99 | # If we want to see the variable, we can use this 100 | # command 101 | 102 | print(my_name) 103 | 104 | # -------------------------------------------------------------------------------- /Class R Code/Class #2 R Code.R: -------------------------------------------------------------------------------- 1 | # SCRIPT FOR CLASS #2, COMPUTATIONAL SOCIOLOGY 2 | # Instructor: Chris Bail 3 | # Copyright, Chris Bail 4 | 5 | 6 | 7 | Vectors 8 | 9 | # Many objects in R are vectors. These are sequences 10 | # of multiple variables. We define a vector as follows 11 | 12 | my_vector<-c(1, 3, 4, 9) 13 | 14 | 15 | # Next, Let's try out some basic operations on 16 | # numeric vectors: 17 | 18 | mean(my_vector) 19 | median(my_vector) 20 | max(my_vector) 21 | min(my_vector) 22 | summary(my_vector) 23 | 24 | # Note that vectors can also be sequences of strings 25 | 26 | my_word_vector<-c("Roy Williams","Is","The Best") 27 | 28 | # often, you will want to grab one variable within a 29 | # vector. This command, for example, selects the third 30 | # number in my_word_vector 31 | 32 | my_word_vector[3] 33 | 34 | # Let's pause to try this out. 
Here's an excercise: 35 | # 1) create your own vector of numbers; 36 | # 2) create a new variable that is the mean of 37 | # your vector 38 | 39 | # Example Solution: 40 | 41 | my_new_vector<-c(100,200, 549) 42 | average_vector<-mean(my_new_vector) 43 | average_vector 44 | 45 | 46 | #Matrices 47 | 48 | # Vectors are a basic building block of matrices, 49 | # another critical type of object in R. To create 50 | # a matrix, we use the "matrix()" function. 51 | 52 | my_matrix <- matrix(c(1,2,1,2, 64000,38000,100000,200000, 53 | 1,5,17,21 ), nrow = 4, ncol = 3) 54 | 55 | # the first value required by this function is a 56 | # vector of numbers or characters. We use nrow and ncol 57 | # to specify the number of rows and columns. 58 | 59 | # to look at our matrix, you can run this line: 60 | my_matrix 61 | 62 | # or, you can click on "my_matrix" in the upper-right 63 | # pane of RStudio. 64 | 65 | # often, we will need to grab one row of a matrix, or 66 | # one column. To do this, we use the "," operator: 67 | 68 | my_matrix[1,] 69 | 70 | # The "," operator specifies whether you are requesting 71 | # the rows or the columns of the matrix. To request 72 | # the first column, we would run 73 | 74 | my_matrix[,1] 75 | 76 | # To get the value of a cell within a matrix, we need 77 | # to tell R about both the row and the column: 78 | 79 | my_matrix[1,2] 80 | 81 | # 64,000 is the number that is in the second column of the 82 | # first row 83 | 84 | Lists 85 | 86 | # A third important type of R object 87 | # is a list. Lists are like vectors, but unique 88 | # in that they may contain multiple types of 89 | # data (e.g. strings, numbers, or even matrices) 90 | 91 | # Let's create a list 92 | 93 | my_list<-list(9, "Roy Williams", my_matrix) 94 | 95 | # Let's take a look 96 | my_list 97 | 98 | # Let's say we wanted to grab "Roy Williams" from 99 | # our list. We can just write: 100 | 101 | my_list[2] 102 | 103 | 104 | # "Why are we spending so much time with Matrices and Lists?" 105 | # you may ask. It is because many forms of programming 106 | # require a basic familarity with matrices and lists, and 107 | # if you get into working with big data you will almost 108 | # surely need o know how to work with them. 109 | 110 | #Data Frames 111 | 112 | # Matrices and lists are also important because they are 113 | # the building blocks of what may be the most important 114 | # type of object in R: data frames. 115 | 116 | # Data frames are very similar to datasets you might load 117 | # into Stata/SPSS/SAS in that they have rows, columns, and 118 | # column names, etc. 119 | 120 | # In order to create a data frame, we can use the 121 | # following command on our matrix: 122 | 123 | my_data_frame<-as.data.frame(my_matrix) 124 | 125 | # Note that there is now a new object in the upper 126 | # right "Environment" pane of RStudio. If we click 127 | # up there, we see that R has already chosen some 128 | # arbitrary names for our columns (V1, V2, V3). 129 | 130 | # R uses some clunky syntax to change column names. 131 | # This is worth our time, however, because column 132 | # names often change when you are manipulating 133 | # data 134 | 135 | # lets change "V1" to "Sex" 136 | 137 | colnames(my_data_frame)[colnames(my_data_frame)=="V1"]<-"Sex" 138 | 139 | # But let's say we want to use words instead of numbers to 140 | # describe sex. 
In this case, we need to change the 141 | # contents of the data frame as follows: 142 | 143 | my_data_frame$Sex[my_data_frame$Sex==1]<-"Female" 144 | my_data_frame$Sex[my_data_frame$Sex==2]<-"Male" 145 | 146 | # That was a mouthful, huh? The "$" operator is 147 | # how you tell R that you are looking for a specific 148 | # variable within the data frame. 149 | 150 | # now lets look at our data frame 151 | 152 | my_data_frame 153 | 154 | # Now let's figure out the sex breakdown of our 155 | # data using the "table" command. 156 | 157 | table(my_data_frame$Sex) 158 | 159 | # Ok, let's step back again so that you can try 160 | # this out on your own: 161 | # 1) Change the name of the Second column in 162 | # my_data_frame to "Income"; 163 | # 2) Calculate the median of the Income variable 164 | 165 | # MANIPULATING DATA 166 | 167 | # Until now, we have been working at a very 168 | # abstract level. This is because I needed 169 | # to teach you some basic concepts before we 170 | # can start to work with real data. 171 | 172 | # R Data Files have the extension .Rdata 173 | # We will work with these soon, but let's 174 | # begin by pulling in other types of data 175 | # files, because it's unlikely that you 176 | # will be working with an .Rdata file if 177 | # you are coming from another program 178 | # such as STATA. 179 | 180 | # Importing Spreadsheets 181 | 182 | # R has a variety of ways of importing data. 183 | # For example, data often comes in .csv 184 | # format. To read this, we use the read.csv 185 | # command 186 | 187 | sample_csv_data<-read.csv("Sample_CSV_Data.csv") 188 | 189 | # As the upper right hand pane of RSTudio 190 | # now shows, these data have 9909 observations 191 | # and 406 variables. 192 | 193 | # By default, R has assumed that the first 194 | # line of these data are the variable names. 195 | # to list all of the variable names, we can 196 | # write 197 | 198 | colnames(sample_csv_data) 199 | 200 | # We do not have the dictionary for these data, 201 | # so we can only guess what these codes mean. 202 | 203 | # R also treats any strings as factors. This can 204 | # become problematic later if you try to perform 205 | # operations on string variables that are actually 206 | # factor variables 207 | 208 | # In order to see the "class" of a variable- 209 | # or whether it is a numeric, character, or 210 | # factor variable, we can use the class() command 211 | 212 | class(sample_csv_data$Institution_Name) 213 | 214 | # Yep, it's a factor. If we want to prevent R 215 | # from defaulting to this behavior, we can add 216 | # an option to our read.csv command. Options 217 | # for most commands are specified by a comma 218 | # after the name of the object you want to apply 219 | # the command to. 220 | 221 | # to illustrate this point a bit better, let's 222 | # look at the "help" file for read.csv. Earlier 223 | # I said there is no manual for R. The "help" 224 | # file is the closest thing we've got, and it's 225 | # not always great. 226 | 227 | ?read.csv 228 | 229 | # now we can see that there are many different 230 | # types of options that can be specified. Let's 231 | # try: 232 | 233 | sample_csv_data<-read.csv("Sample_CSV_Data.csv", 234 | stringsAsFactors=FALSE) 235 | 236 | # this tells R not to import strings as factors. In many 237 | # cases, you will want to add lots of different options 238 | # to an R command. We will get to these cases soon. 239 | 240 | # But before we do, let's try to import some other types 241 | # of data. 
For example, what if you are a STATA user 242 | # trying to make the transition to R so that you can 243 | # analyze some Stata Data using a technique that is 244 | # only available in R? 245 | 246 | #Installing Packages and Importing Data 247 | 248 | # To do this, we need to install a new package in R. 249 | # Until now, we have been using "Base R" which refers 250 | # to all of the standard commands that come when you 251 | # download R. But most users will want to take advantage 252 | # of the rapidly expanding number of packages available. 253 | # Indeed, some of these have become so instrumental for 254 | # computational sociology that I cannot imagine life without them. 255 | 256 | # To open Stata data we are going to use the "Haven" 257 | # package written by a fellow named Hadley Wickham. He 258 | # is one of the most prolific authors of R packages for 259 | # computational social science and is very well respected 260 | # within the R community. 261 | 262 | # To add a package onto R, we use the install.packages 263 | # command 264 | 265 | install.packages("haven") 266 | 267 | # Though you only need to install a package once you 268 | # must "call" it within individual R scripts as follows: 269 | 270 | library(haven) 271 | 272 | # You can also do this by writing require(haven) 273 | 274 | # Here is where things can get messy. There is a group 275 | # called the R Core Development Team which oversees and 276 | # approves R packages in order to make them more useable. 277 | # in order to get your package approved you have to write 278 | # a help file, so we can write. 279 | 280 | # to find these help packages, you can either navigate 281 | # to the "packages" pane of RStudio on the lower right 282 | # pane, or you can google the name of the package to 283 | # find the CRAN site (This stands for the Comprehensive 284 | # R Archive Network). 285 | 286 | # Often you can also find a "vignette" or a pdf document 287 | # that not only explains some of the commands in the 288 | # package but applies them to real data. These are often 289 | # easier to follow then the help files themselves. 290 | 291 | # In this case, I know we want the "read_stata" command: 292 | 293 | sample_stata_data<-read_stata("Sample Stata Data.dta") 294 | 295 | # note that this "Haven" package also allows you to read 296 | # SPSS and SAS files, and write R files into these formats 297 | # as well. 298 | 299 | # If you plan to work with text data or other types of web- 300 | # based data you will probably encounter different types of 301 | # data structures that we do not have time to cover in this 302 | # class, but will be covered in my course on Thursday. For 303 | # example, JSON data, or html data. 304 | 305 | # Subsetting Data Frames 306 | 307 | # Manipulating data is a core task of computational social science. 308 | #A recent New York Times Article suggests 80% of data scientists' 309 | # time is spent cleaning data, while only 20% of their time 310 | # is spent analyzing it. See: 311 | #http://www.nytimes.com/2014/08/18/technology/for-big-data-scientists-hurdle-to-insights-is-janitor-work.html?_r=0 312 | 313 | # This is because many data sets are either unstructured, or semi- 314 | # structured, or because they have large amounts of missing 315 | # data, or because they have to be manipulated in order to 316 | # be analyzed for one reason or another 317 | 318 | # First, let's work on subsetting data. This simply means breaking 319 | # up a data frame into chunks. 
The syntax is similar to that we 320 | # used when we worked with matrices. For example, let's say 321 | # we want to take all respondents from our stata dataset who are 322 | # less than 50. The variable we want is called "age." 323 | 324 | respondents_under_50<-sample_stata_data[sample_stata_data$age<50,] 325 | 326 | # Once again, this is some tricky syntax. We first need to tell R 327 | # which dataset we want to manipulate. Everything inside the 328 | # parentheses is our instructions to R about what subset we want. 329 | # Remember that the "," before the last "]" here is critical. 330 | # We are are telling R that we want all rows that meet the criteria. 331 | # This is also the first time we have used a "logical operator" 332 | # in this case "<" you can also use ">" and "<=" 333 | 334 | # There are also a variety of useful commands to identify missing 335 | # data. This is important because often when one is working with 336 | # big data one cannot simply eyeball the data to identify patterns 337 | # of missing-ness. 338 | 339 | # First, let's drop all rows of the dataset that have any missing 340 | #data. To do this, we use the complete.cases() command: 341 | 342 | no_missing_data<-sample_csv_data[complete.cases(sample_csv_data),] 343 | 344 | # this dropped every single row. This is because of the structure 345 | # of this dataset (certain questions were asked of some respondents 346 | # but not others). 347 | 348 | # more often, you might want to identify all rows that are missing 349 | # data on one single variable in order to identify patterns of 350 | # missingness. Let's load some new data from the Dropbox in 351 | # order to illustrate this 352 | 353 | pewdata<-read.csv("Sample_Pew_Data.csv") 354 | 355 | # working with different datasets is useful because it gives you 356 | # a sense of the range of different problems you might encounter 357 | # with data cleaning. In this data set, for example, missing data 358 | # was coded as 9 instead of "NA" (or empty cells, which R would 359 | # have read in as NA). 360 | 361 | # lets look at missing data on the "pew10" variable, which is about 362 | # whether people supported the construction of the "Ground Zero" 363 | # mosque in New York in 2011. First, let's change the 9's to NAs 364 | 365 | pewdata$pew10[pewdata$pew10==9]<-NA 366 | 367 | missing<-pewdata[is.na(pewdata$pew10),] 368 | 369 | #If we want to take all the values where "pew10" is NOT 370 | #missing, we would do this: 371 | 372 | no_missing<-pewdata[!is.na(pewdata$pew10),] 373 | 374 | # Note that is.na() is a logical operator. If we write 375 | 376 | is.na(pewdata$pew10) 377 | 378 | # we see TRUE/FALSE values for each row of the data frame on this 379 | # variable. 380 | 381 | #Recoding Variables 382 | 383 | # now lets say we wanted to find all of the men with missing data. 384 | # First let's find the variable 385 | 386 | colnames(pewdata) 387 | 388 | # now let's see how the variable is coded 389 | table(pewdata$sex) 390 | 391 | # Looks like 1s and 2s. I happen to know that 1=Male in these 392 | # data, so: 393 | 394 | missing<-pewdata[is.na(pewdata$pew10)& pewdata$sex==1,] 395 | 396 | # Note again that we need the "," because we are telling R 397 | # we want the rows. If we wanted to trim columns from the data 398 | # we would need to put the content we want after the ","- 399 | # we can either use the numbers of the columns or their names. 
Let's 400 | # say we just want the two variables we've been working with so 401 | # far: 402 | 403 | gender_and_mosque<-pewdata[,c("sex","pew10")] 404 | 405 | # remember that the "c()" operator is necessary here because we 406 | # are asking for multiple variables. 407 | 408 | # let's say we wanted everything but the first column in the dataset. 409 | # First we would need to know the number of columns. We can use 410 | # ncol() for this purpose 411 | 412 | ncol(pewdata) 413 | 414 | # Then we simply tell R we want rows 2 to 52 using the ":" operator, 415 | # which indicates a sequence. 416 | 417 | no_first_column<-pewdata[,2:52] 418 | 419 | # I also want to note that we could combine the two steps as follows: 420 | 421 | no_first_column<-pewdata[,2:ncol(pewdata)] 422 | 423 | # I'm noting this because it will be helpful to know that this is 424 | # possible when we discuss programming later in this class. 425 | 426 | # You now know the basics of manipulating a data frame in R. Let's 427 | # pause for another exercise: 428 | # 1) Figure out the age of the oldest man in the dataset 429 | 430 | #Reshaping Data Frames 431 | 432 | # Another very common task in computational sociology is reshaping data. For 433 | # example, suppose we wanted to examine partisanship by race. The 434 | # Patyln variable describes the following question within the Pew Data: 435 | # "As of today do you lean more to the Republican Party" or more to The 436 | # Democratic party" The possible answers are 1: Republican, 2: Democrat; 437 | # 9: Missing. 438 | 439 | # It's annoying that these are not already correctly coded, but this is 440 | # a common task in computational sociology, so first, let's recode the numeric 441 | # data into strings or characters: 442 | 443 | pewdata$partyln[pewdata$partyln==1]<-"Republican" 444 | pewdata$partyln[pewdata$partyln==2]<-"Democrat" 445 | pewdata$partyln[pewdata$partyln==9]<-NA 446 | 447 | # let's check to make sure it worked: 448 | 449 | table(pewdata$partyln) 450 | 451 | # Now we also need to recode the race variables. 452 | 453 | pewdata$race[pewdata$race==1]<-"White" 454 | pewdata$race[pewdata$race==2]<-"African American" 455 | pewdata$race[pewdata$race==3]<-"Asian or Pacific Islander" 456 | pewdata$race[pewdata$race==4]<-"Mixed Race" 457 | pewdata$race[pewdata$race==5]<-"Native American" 458 | pewdata$race[pewdata$race==6]<-"Other" 459 | pewdata$race[pewdata$race==9]<-NA 460 | 461 | table(pewdata$race) 462 | 463 | # we can get a cross tab by doing this: 464 | table(pewdata$partyln, pewdata$race) 465 | 466 | # Just for fun, let's save our cleaned up dataframe 467 | # in R format- we'll use it for some analysis tomorrow 468 | save(pewdata, file="Pew Data.Rdata") 469 | 470 | # but let's say we want the average age by race. As is 471 | # common with R, there are many different ways to do this. 472 | # let's continue using base R. For the record, one could use 473 | # the "plyr" package, the "reshape" package, and the 474 | # "data.frame" package, just to name a few. 475 | 476 | aggregate(pewdata$age, by=list(pewdata$race), FUN=mean) 477 | 478 | # What if we want the average age by both race and party? 479 | aggregate(pewdata$age, by=list(pewdata$race, pewdata$partyln), FUN=mean) 480 | 481 | # And once again we could store these data as follows: 482 | 483 | age_by_race<-aggregate(pewdata$age, by=list(pewdata$race), FUN=mean) 484 | 485 | # Merging Data Frames 486 | 487 | # Another very common task you might face in R is merging multiple 488 | # datasets. 
This is one of the most common tasks you might encounter 489 | # in data cleaning and manipulation precisely because R can have 490 | # so many objects loaded in memory at once. 491 | 492 | # Imagine, for example, that we want to add average income by race 493 | # to our dataset that describes average age. I put a very 494 | # small spreadsheet in the Dropbox that describes average 495 | # income by race. 496 | 497 | race_income_data<-read.csv("Income By Race.xlsx") 498 | 499 | # this gives us an error, because this is an .xlsx 500 | # file, and not a .csv file. This is a total pain. 501 | # Because the file is so small, we might be tempted to 502 | # either a) open Excel and save it as .csv or b) 503 | # just input the data manually into R. 504 | 505 | # But what if this dataset were huge, or had some funky 506 | # character encoding that would be lost if you saved it 507 | # as .csv? This is a common problem when working with 508 | # big data. 509 | 510 | # In this type of situation, you would need to do some 511 | # research. I said earlier that R does not have a manual 512 | # One can, however, search for R help via RSTudio by typing 513 | # ?? before the search term 514 | ??xlsx 515 | 516 | # Bit this is a bit confusing, it does not give us good 517 | # advice on which of these packages might be best, or why 518 | # A better solution is simply to google your problem 519 | # let's try "Import .xlsx into R" 520 | 521 | # The very first result comes from a site called "Stack 522 | # Overflow. In my opinion this is by far the most reliable 523 | # site for information about R, and other programming 524 | # language as well. This is because it boasts an extermely 525 | # large and diverse user base, a sort of "hive mind" 526 | 527 | # Here is the site: 528 | # http://stackoverflow.com/questions/7049272/importing-xlsx-file-into-r 529 | 530 | # At the top of the page is a user who is asking a question 531 | # The question gets "votes" that indicate how important it 532 | # is for the question to be answered. 533 | 534 | # First, there are a variety of comments on the question, asking 535 | # for clarification or recommending other resources. 536 | 537 | # Below, there are answers. First, note that there are eleven 538 | # answers to the question! This shows both the potential and the 539 | # disorganization of R. Which answer is best? Each answer gets 540 | # votes by other users, so we can see here that the best answer 541 | # is the first one. (You should also note that individual users 542 | # have different reputation scores, and you might use those as 543 | # a guide as well). 544 | 545 | # The consensus on this page is to use the XLConnect package. 546 | # first, we need to install it: 547 | 548 | install.packages("XLConnect") 549 | 550 | # let's try it out 551 | race_income_data <- readWorksheet(loadWorkbook("Income By Race.xlsx"),sheet=1) 552 | 553 | # It didn't work. Why? Because we did not call the packages. 554 | # Let's try again: 555 | 556 | library(XLConnect) 557 | race_income_data <- readWorksheet(loadWorkbook("Income By Race.xlsx"),sheet=1) 558 | 559 | # That was a lot of work for a little reward, but again 560 | # the point was to help you learn how to resolve a real- 561 | # world type of situation. 562 | 563 | # Ok, now that we have finally read the data into R 564 | # we can now merge it together with our data frame. 565 | # We could do this within base r using the "merge" 566 | # command, but it is a bit clunky. 
Most folks now 567 | # prefer to use the "plyr" package because it is 568 | # faster and more intuitive. 569 | 570 | install.packages("plyr") 571 | library(plyr) 572 | 573 | #The command for merging datasets is called "join" 574 | merged_data<-join(age_by_race, race_income_data) 575 | 576 | # This looked like it worked, but if we view the 577 | # merged dataset, we see that it added NAs instead 578 | # of the values 579 | 580 | View(merged_data) 581 | 582 | # Why? This particular command from the plyr package 583 | # automatically searches for column names that are 584 | # shared by both files. Let's check things out with 585 | # colnames() 586 | 587 | colnames(age_by_race) 588 | colnames(race_income_data) 589 | 590 | # When we ran the "aggregate" command above, it gave 591 | # the columns new, arbitrary names. We need to fix 592 | # the column names so that they are the same across 593 | # The datasets 594 | 595 | colnames(age_by_race)[colnames(age_by_race)=="Group.1"]<-"race" 596 | colnames(age_by_race)[colnames(age_by_race)=="x"]<-"age" 597 | 598 | # Let's try to merge again 599 | 600 | merged_data<-join(age_by_race, race_income_data) 601 | 602 | # Once again, it looks like it worked. But if we view 603 | # the data again, we see that only the income of 604 | # Whites was added. 605 | 606 | # Note that R did not give us an error message. This 607 | # is because it did exactly what we asked it to do: 608 | # merge all the rows that could be merged. But this 609 | # is the type of easy mistake that can create major 610 | # headaches further down the line. This is why it's 611 | # important to always view or table() your dataframes 612 | # after you manipulate them. 613 | 614 | # In order to diagnose the problem, lets table race 615 | # in both dataframes 616 | 617 | table(age_by_race$race) 618 | table(race_income_data$race) 619 | 620 | # Aha, we can now see that most of the races were not 621 | # merged because they were only in one of the two 622 | # data frames. We ALSO see that the African American 623 | # row in the age_by_race data frame was not merged 624 | # because the race_income_data uses the term "Black." 625 | # The terms for Asians also need to be recoded 626 | # Let's change this so that our merge will work: 627 | 628 | race_income_data[race_income_data=="Black"]<-"African American" 629 | race_income_data[race_income_data=="Asian"]<-"Asian or Pacific Islander" 630 | 631 | # And now let's try the merge again 632 | 633 | merged_data<-join(age_by_race, race_income_data) 634 | 635 | View(merged_data) 636 | 637 | # finally, it worked. 638 | 639 | # This is the conclusion of the first Day of this Class. 640 | # My goal was to help you get R up and Running and master 641 | # some of the basic object types and data manipulation 642 | # commands. These are by far the most frustrating parts 643 | # of learning R. Tomorrow, we will begin to get to analysis, 644 | # visualization, and programming, which is really where 645 | # R begins to shine. 646 | 647 | 648 | 649 | 650 | 651 | 652 | -------------------------------------------------------------------------------- /Class R Code/Class #3 R Code.R: -------------------------------------------------------------------------------- 1 | # SCRIPT FOR CLASS #3, COMPUTATIONAL SOCIOLOGY 2 | # Instructor: Chris Bail 3 | # Copyright, Chris Bail 4 | 5 | 6 | 7 | #1: Introduction to Programming 8 | 9 | # Though STATA, SPSS, and SAS provide basic forms of programming, 10 | # R blows them out of the water. 
This is because R is built upon 11 | # extremely powerful languages such as C++, but also because it has 12 | # the object-oriented and open-source characteristics necessary 13 | # to interface with other programming languages such as Python. 14 | 15 | # At the same time, the syntax of programming in R can be rather 16 | # funky. People who come to R from other languages such as C++ 17 | # or Python often report being frustrated by its clunky syntax. 18 | # There are ways to make R simulate the syntax of other programming 19 | # languages, but I will proceed to teach you the "R way" because 20 | # I assume that most of you are hoping to use R as your main 21 | # programming tool. 22 | 23 | #I should note that many people believe that the first programming 24 | # language you learn is the hardest. Once you get a sense of the 25 | # basic concepts of programming, it is much easier to translate 26 | # what you know from one language to another. 27 | 28 | # 2: Functions 29 | 30 | # The most basic form of programming is a function. We've actually 31 | # already been using them extensively throughout the course. But 32 | # we've been using them without seeing the "source code" or the 33 | # complicated list of instructions that R processes each time 34 | # we run a command such as corrgram, or tableplot 35 | 36 | # A function is simply a set of instructions or tasks that one 37 | # may apply to any type of object in R. Let's take a very basic 38 | #function 39 | 40 | my_function <- function(x) x+2 41 | 42 | # This function takes a number (x) and adds two to the number. 43 | # let's try it: 44 | 45 | my_function(2) 46 | 47 | # functions can get much, much more complicated. 48 | another_function <- function(x, y, z) { 49 | x <- mean(x)^2 50 | y <- cos(y)-5 51 | z <- log(z)*200 52 | c(x,y,z) 53 | } 54 | 55 | # this function requires three inputs (x, y, and z). The part 56 | # between the brackets tells R what we want to do to each of these 57 | # three inputs. The "c()" tells R that we want to display the results 58 | # if we did not include the c(), the function would still run, but 59 | # we would need to type "x,""y," or "z," to see the results for each 60 | # variable. 61 | 62 | # If you are just getting started out in R, you will probably not 63 | # write too many of your own functions, but you will probabbly soon 64 | # begin borrowing functions from others that you find online. It 65 | # is also important that you understand how a function works in 66 | # case you begin borrowing segments of other people's codes. If 67 | # you do not understand why their code works, you probably will 68 | # not be able to modify it to suit your own purposes. 69 | 70 | # 3: Loops 71 | 72 | # Another central type of programming in R is the "for" loop. This 73 | # is one of the oldest types of programming in computer science. We 74 | # might use a for loop when we want to repeat some type of function 75 | # or transformation across a large number of rows in a data frame, or 76 | # a large number of files in a folder. 77 | 78 | #let's begin with a very simple example: 79 | 80 | for (i in 1:6){ 81 | print("Jim Moody is bad-a$$") 82 | } 83 | 84 | # Let's start working with an example to illustrate. Let's say we 85 | # have a folder full of .csv files that describe different health 86 | # indicators from OECD, but we are really only interested in data 87 | # about Korea. 88 | 89 | # Let's build a for loop that opens each file, grabs the data from 90 | # korea, and then makes another data frame. 
We need to begin with 91 | # a few steps that may seem strange or unnecessary, but it will 92 | # soon become clear why we need to do them. 93 | 94 | # first, we need to tell R where the data are. I've placed it in 95 | # the dropbox in a folder entitled "OECD Health Data. Let's use 96 | # list.files to count the number of files. 97 | 98 | list.files("OECD Health Data") 99 | 100 | #The first thing we need to do with a for loop is initialize it, or tell it how 101 | # many times we want it to repeat the action. We therefore need 102 | # to count the number of files " 103 | 104 | filenames<-list.files("OECD Health Data") 105 | number_of_files<-length(filenames) 106 | 107 | # now, let's create an empty data frame to store our data: 108 | koreadata<-as.data.frame(NULL) 109 | 110 | # now, let's loop into each file 111 | 112 | for(i in 1:number_of_files){ 113 | 114 | filepath<-paste("OECD Health Data/", filenames[i], sep="") 115 | data<-read.csv(filepath, stringsAsFactors = FALSE) 116 | newdata<-data[data$Location=="Korea",] 117 | newdata$indicator<-filenames[i] 118 | koreadata<-rbind(koreadata,newdata) 119 | } 120 | 121 | # There is quite a lot to explain here. Let's begin by the first 122 | # line. The "i" here is the variable we are going to loop through. 123 | # so if i=1, then we are looking at the first file in the folder. If 124 | # i=2 we are looking at the second file in the folder, etc. 125 | 126 | # the 1:number_of_files, tells R that we want to repeat the steps 127 | # within the loop for values between 1 and number_of_files. In this 128 | # case our number_of_files variable equals 5, so we are telling R 129 | # to repeat these steps for all five files in our folder. 130 | 131 | # Everything within the brackets is what we want r to do for each 132 | # file. 133 | 134 | # The first thing we want it to do is open the csv file, but to 135 | # do this we need to tell it the full file path of the file. We 136 | # could type filenames[i] but this would just get us the name of 137 | # the file, and not the whole file path. 138 | 139 | # to create the file path, we are using the paste function. 140 | # this function takes two strings and joins them together. The 141 | # sep here refers to what we want R to put in between the two 142 | # strings, in this case, we want nothing, so we put no text or 143 | # spaces in between the quotation marks. 144 | 145 | # the second line in the loop simply reads in the .csv file 146 | #using the file path we just created. 147 | 148 | # The third line selects only the data for Korea, which we can 149 | #find because all of the .csv files we are reading in have used 150 | # the same column names, and the same capitalization for the term 151 | # "Korea." 152 | 153 | # In the fourth line, we are creating a new variable in the data 154 | # frame we created in the preceding line that describes the name 155 | # of the metric. In this case, the names are sloppy because they 156 | # include all of the .csv formatting. We could clean this up 157 | # using a command such as gsub() but let's keep it simple for now 158 | 159 | # The final line in the loop is critical. We are telling R to 160 | # take this new data frame we created and append it to the blank 161 | # data frame we created before we started the loop. With each 162 | # iteration of the loop, the data frame gets one more row. 
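# (An optional follow-up sketch: the comments above note that the messy
# indicator names could be cleaned with gsub(). Assuming each file name
# simply ends in ".csv", one way to strip the extension after the loop
# finishes would be:)

koreadata$indicator <- gsub(".csv", "", koreadata$indicator, fixed = TRUE)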
163 | 164 | # That was a detailed explanation, but my goal was to try and 165 | # work in a few useful commands into a practical example which 166 | # you might encounter in your own work. 167 | 168 | # There are other types of loops in R that we do not have time 169 | # to cover (e.g. "while" loops, and if/else statements). My hope 170 | # is that if you want to learn more about these types of loops 171 | # you now have a base level of knowledge to learn about them. 172 | 173 | # Loops are slow in most languages, but particularly in R. You 174 | # may never care about speed if you are only working with datasets 175 | #<10,000 observations in R, but if you want to get into big data 176 | # you will probably want to look into activities at loops. 177 | 178 | # On the other hand, you can also chose to be a "hack" or a sloppy 179 | # programmer, and simply run your code on a really powerful 180 | # machine. I'll describe how to do this later during this class. 181 | 182 | ======================================================== 183 | # **Now you try it: 184 | 185 | # Write a forloop that goes through each variable in our Pew Dataset and replaces values of 9 with NA. 186 | 187 | #Hint: you may find the `ncol` function useful. 188 | 189 | #SOLUTION 190 | 191 | number_of_columns<-ncol(pewdata) 192 | for (j in 1: number_of_columns){ 193 | pewdata[,j][pewdata[,j]==9]<-NA 194 | } 195 | 196 | 197 | # 4: Vectorized functions 198 | 199 | # One of the reasons that R is slow is that it is not a compiled 200 | # language. In other words, you don't have to run a "set up" type 201 | # of program before you do your analysis. 202 | 203 | # R can access compiled commands through a process called "vectorization" 204 | # It is not really important for you to understand what the difference 205 | # is. The important thing is that you will probably encounter 206 | # other people using vectorized commands because they are faster 207 | # and it is therefore important for you to understand how they 208 | # work. 209 | 210 | # Vectorized functions within R are known as "apply" functions. 211 | # There are different types of apply commands for different 212 | # types of r objects. We are just going to look briefly at the one 213 | # for data frames, though there are also apply commands for lists 214 | # and arrays. 215 | 216 | # let's try to read our OECD Health files into R using apply. Once 217 | # again, we need a list of the names of the files: 218 | 219 | filenames<-list.files("OECD Health Data") 220 | 221 | # And now let's paste the file path into them 222 | filenames<-paste("OECD Health Data/", filenames, sep="") 223 | 224 | # and now let's apply the read.csv command to each file: 225 | data<-lapply(filenames,read.csv) 226 | 227 | # just one line! Note that the data is now in list format 228 | # and we'd have to clean it up to make it comparable to 229 | # the data we created within the for loop. 230 | 231 | # The important thing isn't the usefulness of this command 232 | # in this context, but in other, larger datasets. The apply 233 | # command is particularly powerful because we can apply 234 | # whatever function we want to our filenames- either other 235 | # people's r functions or our own. 236 | 237 | # the syntax for apply commands can become somewhat opaque 238 | # because they do not spell out the functions. Also, one has 239 | # to choose the appropriate apply command for the object in 240 | # question. 
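# (A quick illustrative sketch of that difference: lapply() always returns
# a list, while sapply() tries to simplify its result to a vector or matrix.)

squares_as_list   <- lapply(1:5, function(x) x^2)   # a list with 5 elements
squares_as_vector <- sapply(1:5, function(x) x^2)   # a numeric vector of length 5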
A useful resource on the apply command is this 241 | # blog post: 242 | # http://www.r-bloggers.com/using-apply-sapply-lapply-in-r/ 243 | 244 | # One final note: you can speed up plyr and dplyr commands 245 | # by specifying the "parallel processing" options that allow 246 | # r to take advantage of multiple CPUs that you may have on 247 | # your machine. This can be particularly helpful if you use 248 | # the very powerful Amazon machines- or other cluster computing 249 | # technologies- described in section 4.6 below 250 | 251 | # 5: Piping 252 | 253 | # At the risk of giving you too many different options for 254 | # programming, I'm going to introduce you to one of the newer, 255 | # more cutting edge ways of programming in R. This is called 256 | # piping. 257 | 258 | # Piping is a way of passing data and functions in code without 259 | # initializing or iterating. Many people find it more intuitive 260 | # because it is a) less complex, and b) can be coded in a less 261 | # cluttered manner. 262 | 263 | # let's take a quick peak at the maggritr package 264 | 265 | install.packages("magrittr") 266 | library(magrittr) 267 | 268 | # The key contribution of this package is the `%>% operator. 269 | # Whatever is on the left side of this operator gets passed 270 | # to the right side. 271 | 272 | # Let's look at some data on baby naming from the Social 273 | # Security administration. 274 | 275 | install.packages("babynames") 276 | library(babynames) 277 | 278 | # The real power of %>% comes when you combine it with other 279 | # packages. Let's combine it with the dplyr package for data 280 | # reshaping/manipulation: 281 | 282 | # first, lets take the babynames data and pass it through the 283 | # "filter" command in dplyr which lets us request only names 284 | #where the first three letters start with "Ste." Then we will 285 | # use the group_by function of the same package to reshape 286 | # the data by year and sex. Finally, we will count the totals, 287 | # and plot it using ggplot 288 | 289 | library(dplyr) 290 | library(ggplot2) 291 | 292 | babynames %>% 293 | filter(name %>% substr(1, 3) %>% equals("Ste")) %>% 294 | group_by(year, sex) %>% 295 | summarize(total = sum(n)) %>% 296 | qplot(year, total, color = sex, data = ., geom = "line")%>% 297 | add(ggtitle('Names starting with "Ste"')) %>% 298 | print 299 | 300 | # Notice that we never created a variable, a blank data frame 301 | # or any other object. Once again, for some, this is much easier 302 | # to follow. Regardless of whether you find it more intuitive, 303 | # you would probably agree that it is quicker to write. 304 | 305 | # 6: Debugging your code 306 | 307 | # Whether you are brand-new to coding or whether you've been 308 | # doing it for years, it is extremely easy to make small mistakes 309 | # that can make your code fail. 310 | 311 | # Consider, for example, a for loop that never closes its brackets, 312 | # or a loop that uses the same letter to represent two different 313 | # variables in a model. 314 | 315 | # In order to catch these annoying problems, we need to "de-bug" 316 | # our code. Thankfully, R has a number of built in tools as well 317 | # as user contributed packages that can help us do this. 318 | 319 | # Perhaps the easiest way to debug your code, however, is right 320 | # here in RStudio. 321 | 322 | # You've probably noticed by now that RStudio will try to complete 323 | # the code you write. Once you define a data frame, for example 324 | # it can help you write variable names, etc. 
It can also help 325 | # you find options within a function. 326 | 327 | # You may have also noticed a red dot to the left of your code 328 | # or "Script" window. This describes some type of error. Usually 329 | # it is a syntax error, or some type of code that would result in 330 | # an error message in R. 331 | 332 | # This is particularly useful if you are looking at a very large 333 | # amount of code. It may be something as simple as realizing that 334 | # you did not load a package before calling a function. 335 | 336 | # RStudio also helps you find where brackets and parentheses 337 | #begin and end in your code. 338 | 339 | # RSTudio also has more sophisticated debugging tools that are 340 | # described in detail here: 341 | # https://support.rstudio.com/hc/en-us/articles/205612627-Debugging-with-RStudio 342 | 343 | # One final note on programming. If you want to get into more 344 | # advanced programming in R, I highly suggest the following 345 | # site: http://adv-r.had.co.nz authored by Hadley Wickham 346 | 347 | -------------------------------------------------------------------------------- /Class R Code/Class #4 R Code.R: -------------------------------------------------------------------------------- 1 | # SCRIPT FOR CLASS #4, COMPUTATIONAL SOCIOLOGY 2 | # Instructor: Chris Bail 3 | # Copyright, Chris Bail 4 | 5 | 6 | # INTRODUCTION 7 | 8 | # This class is designed to introduce you to the basic techniques 9 | # for collecting large corpora-- or text-based data-- using R. 10 | 11 | # For example, the techniques you will learn in this class can be used 12 | # to scrape text data from websites, extract social media messages or 13 | # other types of texts from sites such as Twitter, Facebook, or Google, 14 | # or automate the collection of text using other internet-based tools. 15 | 16 | # When people think about large text-based data sets, they tend to think 17 | # immediately of social media sites or blogs. Yet one of the most 18 | # exciting things about recent years is that we are witnessing vast 19 | # archive of historical archives as well. 20 | 21 | # Consider, for example, Google's nGram dataset, which is based upon 22 | # digital copies of nearly every book in the English language, and many 23 | # other languages as well. It is also increasingly easy to get historical 24 | # newspaper data or television transcripts. Librarians across the world 25 | # are rapidly digitizing hand-written texts from across the ages. 26 | 27 | # These new wellsprings of data present unprecedented possibilities for 28 | # academics, yet they also raise a number of new challenges. Fortunately, 29 | # the fields of computer science and computational linguistics have 30 | # jointly produced a suite of new tools that make our job easier. 31 | 32 | # Though we once had to hire teams of research assistants to collect, 33 | # standardize, and analyze large corpora, a single computer or group of 34 | # computers can now do this to text-based datasets that are so large that 35 | # human coders could never analyze them all. 36 | 37 | # But these new techniques will be unfamiliar to you if you come from a 38 | # conventional statistics background. Text-based datasets do not come 39 | # prepackaged. Instead, they are unstructured and usually very messy. 40 | 41 | # This is often because automated collection of texts often produces 42 | # texts that are formateed or structured differently. 
The first task 43 | # we will discuss in this class is simply how to automate collection 44 | # of texts via the internet. 45 | 46 | # COLLECTING TEXT-BASED DATA 47 | 48 | # Before I describe automated techniques for collecting text-based data 49 | # I'd like to point out that there is already a vast amount of data 50 | # out there that has already been compiled. For example, the New York 51 | # Times offers a large dataset of its articles, as does Reuters. Google 52 | # also makes is nGrams data available to the public. There are also a 53 | # variety of archives of Wikipedia data. 54 | 55 | # It is important to ask yourself whether you might be able to take 56 | # advantage of text-based datasets that someone else has collected 57 | # because you may be underestimated the amount of time it takes to collect 58 | # vast amounts of data. On the one hand, new technologies make this 59 | # easier than ever, but on the other hand the inherent messiness of 60 | # automated text collection-- from inconsistent file formats to spelling 61 | # differences to character encoding problems-- can make collecting 62 | # your own text-based datasets quite a hassle. 63 | 64 | # But if you are here, it is probably because you want to learn how 65 | # to build your own datasets. And this is probably where the greatest 66 | # value added is given that this is really a new frontier. 67 | 68 | # 1.1 Screen-Scraping. 69 | 70 | # Unfortunately, however, we are no longer in the "Wild Wild West" of big 71 | # data. Only several years ago one could easily mine or "scrape" vast amounts 72 | # of data from giant archives of information such as Google or Amazon. 73 | 74 | # Yet major corporations have become wise to the value of their data, and the 75 | # vast majority of sites now prevent you from scraping large amounts of data. 76 | # There are some important exceptions to this, but by in large, sites such as 77 | # Facebook, Twitter, or JSTOR will shut you down if you try to grab too 78 | # much text in an automated fashion. 79 | 80 | # I should also warn you that automated collection of text-based is also 81 | # often not only discouraged, but illegal. Years ago, several academics 82 | # got into considerable trouble with Facebook and Google for trying to 83 | # scrape data from these sites. To determine whether you can safely 84 | # automate data collection from a site, you need to visit its "Terms 85 | # of Service," which is a legally binding document that describes how 86 | # developers (in this case, you!) may interface with a site. 87 | 88 | # Despite all of these issues, the first technique I want to teach you today is something called "screen-scraping." 89 | 90 | # Screen scraping refers to a type of computer program 91 | # that reads in a web page, finds some information on it, grabs the 92 | # information, and puts it into a spreadsheet or other type of data 93 | # storage format. 94 | 95 | # When we look at a web page, we typically see something that is very easy 96 | # to digest. There is some combination of text and images in a relatively 97 | # small number of formats that we have been taught to digest easily. 98 | 99 | # But this is not what a webpage looks like to a computer. And if we want 100 | # to teach a computer to grab information from a web page for us, we need 101 | # to assume the perspective of a computer. 
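# (A minimal sketch of what that "computer's perspective" looks like: base
# R's readLines() will pull down the raw source of a page as plain text.
# Expect a wall of HTML rather than a readable article.)

raw_source <- readLines("https://en.wikipedia.org/wiki/World_Health_Organization_ranking_of_health_systems_in_2000")
head(raw_source)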
102 | 103 | # To a computer, a webpage is a long list of formatting rules, scripts, text, 104 | # and audio-visual data that is all put together in one of two common formats: 105 | # HTML or XML. These long lines of code tell the website how to assemble text, # images and video on the vast range of devices that might try to load the 106 | # page. It also generally shapes the "look" or "theme" of the website, and 107 | # how data is stored. But none of this is very important to understand in 108 | # detail unless you are interested in building websites. 109 | 110 | # Let's look at an example. Consider, the following Wikipedia page about 111 | # the World Health Organization's Ranking of Different Countries' Health 112 | # systems: 113 | 114 | #https://en.wikipedia.org/wiki/World_Health_Organization_ranking_of_health_systems_in_2000 115 | 116 | # To do screen scraping, we need to find the "Source Code," or the messy 117 | # list of instructions that a computer needs to display this page in the 118 | # format we see before this. 119 | 120 | # There are a variety of ways to see the source code of a website, but 121 | # the easiest way is typically to use your web browser. In Chrome, for 122 | # example, we can go to the dropdown "View" menu, and then select 123 | # "Developer" and then "View Source." 124 | 125 | # Messy, huh? At the top of the source code we can see that this document 126 | # is an HTML file. We will need to know whether a site is in HTML or XML 127 | # because it will determine the type of tools we use in R to scrape it. 128 | 129 | # In order to get a better feel for how the source code relates to the website 130 | #, let's navigate back to the wikipedia site: 131 | 132 | #https://en.wikipedia.org/wiki/World_Health_Organization_ranking_of_health_systems_in_2000 133 | 134 | # Let's say we want to scrape the data from the table on this page. 135 | # To do this, we are going to need to find out where this information is 136 | # within that messy HTML code. 137 | 138 | # Fortunately, there are a number of useful tools we can do to find this 139 | # type of information. In Chrome, for example, we can right click on the 140 | # part of the webpage we want to scrape, and click "inspect element." 141 | 142 | # Now, when we mouse over the messy code in the text, Chrome highlights 143 | # the part of the page that this code creates. So if we move our mouse 144 | # around until it highlights the table, we can start to identify the part 145 | # of the code we need to scrape it. The thing we need is called the "xpath" 146 | # To get the xpath, we can again right click and Chrome gives us the option 147 | # to copy it to our clipboard. 148 | 149 | # In my view, the best R package for screenscraping at present is the "rvest" 150 | # package, which was written by Hadley Wickham. R used to lag behind other 151 | # languages such as Python for web scraping, but rvest basically takes all the 152 | # best parts of these other languages and ports them into R. 153 | 154 | # The first thing I'm going to do is set our class dropbox as my 155 | # working directory: 156 | 157 | setwd("/Users/christopherandrewbail/Desktop/Dropbox/Teaching/Computational Soc Fall 2015/Course Dropbox") 158 | 159 | # note: the file path will be different on your machine! 
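# (An optional convenience sketch: if you re-run this script often, you can
# install rvest only when it is missing, rather than re-installing it each
# time. This does the same job as the install.packages() call just below.)

if (!requireNamespace("rvest", quietly = TRUE)) {
  install.packages("rvest")
}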
160 | 161 | # The first thing we need to do is install rvest: 162 | 163 | install.packages("rvest") 164 | 165 | # Next, we need to remember to load rvest into our r code/r session 166 | 167 | library(rvest) 168 | 169 | # The first thing we need to do is to pass all of that messy source code 170 | # from the web and into r. To do this, we use the html() command: 171 | 172 | wikipedia_page<-html("https://en.wikipedia.org/wiki/World_Health_Organization_ranking_of_health_systems_in_2000") 173 | 174 | # Here I've created an object called "wikipedia page" that we are going 175 | # to reference in the rest of our code. If we type "wikipedia_page" we will 176 | # see all of that nonsense: 177 | 178 | wikipedia_page 179 | 180 | # HTML is broken up into sections that are called "nodes." The xpath tells 181 | # R which section we want. To get that section, we use the html_nodes() 182 | # command as follows: 183 | 184 | section_of_wikipedia_html<-html_nodes(wikipedia_page, xpath='//*[@id="mw-content-text"]/table[1]') 185 | 186 | #Once again, this object is going to be messy: 187 | 188 | section_of_wikipedia_html 189 | 190 | #But fortunately rvest has a command that let's us grab tables within 191 | #HTML sections, it's called "html_table() 192 | 193 | health_rankings<-html_table(section_of_wikipedia_html) 194 | 195 | # ..And voila. We have now scraped the health rankings data from Wikipedia 196 | 197 | health_rankings 198 | 199 | # It's still in a somewhat messy format though. In fact, let's check to 200 | # see what type of format it is in: 201 | 202 | class(health_rankings) 203 | 204 | # It's a list. To convert this to a data frame that we could easily 205 | # work with, we can simply write: 206 | 207 | test<-as.data.frame(health_rankings) 208 | 209 | # Unfortunately, many sites are not as "friendly" to automated text 210 | # collection as Wikipedia, which is not only decidely "open" to anyone 211 | # but also very consistent in the way it formats information. 212 | 213 | # On messier sites, the "inspect element" trick in Chrome might not work. 214 | # But there is another way around this. Instead of getting the "xpath" we 215 | # can get something called the "css selector." 216 | 217 | # The easiest way to do this it to download a plugin for chrome called 218 | # Selector Gadget. This is a tool that you can load when you look at a 219 | # webpage in order to find the css selector in the html code. 220 | 221 | # This website explains how to use it: 222 | 223 | #http://cran.r-project.org/web/packages/rvest/vignettes/selectorgadget.html 224 | 225 | # If you drag the link on this page onto the bookmarks bar, you can load the 226 | # selectorgadget anytime you are on a website you want to scrape. 227 | 228 | # The next step is to click on the stuff you want to scrape, and then click 229 | # on something you DO NOT want to scrape. This helps the tool figure out 230 | # exactly how to describe what you want on the page. IT IS NOT PERFECT THOUGH. 231 | # Once again, different pages use different formats, but some combination 232 | # of this method with the Chrome/INspect Element method should work for 233 | # most webpages. 234 | 235 | # Why don't we scrape a list of the 100 Twitter Users with the largest 236 | # numbers of followers so that we can use it when we work with Twitter in 237 | # just a bit. 
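# (A version note, offered as an aside: in newer releases of the rvest
# package, the html() function used above has been deprecated in favor of
# read_html(). If html() produces a deprecation warning or cannot be found,
# read_html() should work as a drop-in replacement, e.g.:)

# wikipedia_page <- read_html("https://en.wikipedia.org/wiki/World_Health_Organization_ranking_of_health_systems_in_2000")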
238 | 
239 | # Here is the link to the page:
240 | 
241 | # http://twittercounter.com/pages/100
242 | 
243 | # After using the SelectorGadget tool, I determined that the
244 | # css selector for the data we want is called ".name-bio"
245 | 
246 | # To get the data, the process is almost identical to our
247 | # last example, except that we replace the xpath= with css=
248 | 
249 | toptwitter<-html("http://twittercounter.com/pages/100")
250 | toptwitternodes<-html_nodes(toptwitter, css=".name-bio")
251 | 
252 | 
253 | # Note that html_table() will not work here, because the node
254 | # we selected is not a table but just plain text, so we need to
255 | # extract it using the html_text() command instead:
256 | 
257 | names<-html_text(toptwitternodes)
258 | 
259 | 
260 | 
261 | #Let's take a look
262 | 
263 | names
264 | 
265 | # The data we want is in there, but it's surrounded by a bunch of odd
266 | # characters. These characters are telling the webpage how many spaces to
267 | # put in between the text.
268 | 
269 | # Cleaning up messy text like this in R is a very common challenge.
270 | # One can approach the problem in a variety of different ways, but
271 | # I am fond of using the gsub() command.
272 | 
273 | # The gsub command finds one character string and replaces it with another.
274 | # This line tells R to find the "\n"s and replace them with "", which means
275 | # nothing:
276 | 
277 | names<-gsub("\n","", names)
278 | 
279 | # The last argument simply tells R the name of the object we want to apply
280 | # this text transformation to.
281 | 
282 | names
283 | 
284 | # This got rid of the "\n"s but not the "\t"s. To get rid of those,
285 | # we can just add another line of code:
286 | 
287 | names<-gsub("\t","", names)
288 | 
289 | #Let's check it out:
290 | 
291 | names
292 | 
293 | # Getting closer, but we are going to want to split up the names and the
294 | # Twitter addresses, which begin with "@"
295 | 
296 | # To do this, we can use the strsplit() command:
297 | names_data<-(strsplit(names,"@"))
298 | 
299 | # This command simply tells R to split each string into two pieces when it
300 | # encounters the "@" character
301 | 
302 | names_data
303 | 
304 | # It's split, but it's still in a funky format. Let's find out which
305 | # one:
306 | 
307 | class(names_data)
308 | 
309 | # It's a list, and we are going to want a data frame once again. Let's
310 | # use an apply function to extract the names and Twitter handles in
311 | # separate steps:
312 | 
313 | twitter_names<-sapply(names_data,"[",1)
314 | twitter_handles<-sapply(names_data, "[", 2)
315 | 
316 | # Let's just make sure that they are now character vectors:
317 | class(twitter_names)
318 | 
319 | # now we can simply bind them together using cbind() and
320 | # as.data.frame():
321 | 
322 | twitter_data<-as.data.frame(cbind(twitter_names, twitter_handles))
323 | 
324 | # Now you know the basics of screen scraping. But there are two
325 | # more things you need to know about. First of all, if you are scraping
326 | # an XML website you can use other functions such as XML2R. There is
327 | # a nice tutorial on this here:
328 | 
329 | #http://cpsievert.github.io/slides/web-scraping/#1
330 | 
331 | # One more thing. Often, you don't want to scrape just one website,
332 | # but many websites. This means you need to generate a list of websites
333 | # that you can then pass through a "for" loop and extract whatever
334 | # type of data that you are searching for.
335 | 
336 | # If you are trying to repeatedly scrape one website for lots of sub-
337 | # pages, you may be able to recognize patterns in the way that the
338 | # URLs are formed, and then use "gsub" or "paste" commands to change
339 | # your url calls to collect HTML or XML.
340 | 
341 | # By now you can probably tell that some screen scraping exercises
342 | # are much easier than others. It simply depends upon the structure
343 | # of the website and how consistently it is organized.
344 | 
345 | # As I mentioned earlier, many sites now have functions that stop
346 | # you from scraping them. If you try to request too many different
347 | # sub-sections of the same site, for example, you will eventually
348 | # get an error that says something about "authentication," or an
349 | # "SSL" error, or an "OAuth" error.
350 | 
351 | #WORKING WITH AN API
352 | 
353 | # Sites that block you- which are unfortunately most of the sites you
354 | # might want to scrape- usually offer a powerful alternative: an
355 | # Application Programming Interface (API)
356 | 
357 | # An API is a type of web infrastructure that enables a developer (you)
358 | # to request large amounts of specific information from a website. The
359 | # website then creates a new URL that contains the data you request,
360 | # and you scrape it. This has become such an important part of the web
361 | # that most large websites now have APIs (e.g. Google, Twitter, Facebook,
362 | # even the New York Times)
363 | 
364 | # APIs are called Application Programming Interfaces because many of the people
365 | # who use them are building apps. For example, a music sharing website
366 | # might want to build an app that helps people expose their friends to
367 | # new types of music. But to do this it needs to request permission to
368 | # extract certain types of information about the person from a site
369 | # such as Facebook.
370 | 
371 | # But Facebook obviously can't give them all the info. Facebook needs to
372 | # make sure that the person wants them to access their data. They also
373 | # need to make sure the app developer can only access certain types of data
374 | # and not all the data that Facebook has.
375 | 
376 | # To do this, Facebook- and other sites that have APIs- have "authentication tokens," or "access keys." These are simply codes that you need to give when you request data from an API.
377 | 
378 | # Let's take a look at how the Facebook API works using the "Facebook
379 | # Graph API Explorer." This is a website that lets you see how an
380 | # API works, also known as a "sandbox":
381 | 
382 | # https://developers.facebook.com/tools/explorer
383 | 
384 | # try typing "me/friends" into the search bar below the text "FQL Query." This is a tool that shows you what the results would look like if you made this API request.
385 | 
386 | # what it is actually doing is forming the URL request and then showing you the JSON-format data that would load if you pasted the URL in your browser.
387 | 
388 | # Most sites that have APIs do not have this type of "sandbox," but
389 | # learning how to master working with them is a really nice skill to have because
390 | # there are so many APIs out there.
391 | 
392 | #At present, there are more than 13,000 APIs.
You can see a list of them here: http://www.programmableweb.com/category/all/apis?order=field_popularity 393 | # Academics may be interested to know that many data archiving sites now offer 394 | # APIs (such as ProQuest). Many are free to use, but others cost significant 395 | # amounts of money. 396 | 397 | # Most APIs have "rate limits" which determine how many 398 | # requests for information a developer (you) can make within a certain time frame 399 | 400 | 401 | # In R, you can either interact with an API by forming requests for data within 402 | # a loop and "scraping" the resultant data from URLS "by hand," or you can 403 | # use a variety of user-generated packages to collect data. 404 | 405 | # Because we already covered screen scraping, let's look at one of these packages. Let's start with the twitteR package. 406 | 407 | install.packages("twitteR") 408 | 409 | #The instruction manual for this package is here: 410 | 411 | # http://cran.r-project.org/web/packages/twitteR/twitteR.pdf 412 | 413 | # The first thing you need to do is register as a developer with Twitter. 414 | # in order to do this, you need to visit this page: 415 | 416 | # apps.twitter.com 417 | 418 | #Unfortunately, if you don't have a Twitter account, you'll have to make one, 419 | # or follow along on your neighbor's laptop if they don't mind. 420 | 421 | #THe next step is to click on "Create New App." You need to name your app, and 422 | # provide some other credentials. It really does not matter much what you 423 | # put in here, because we are not building an app that other people are going 424 | # to use. I just put in my own website. You can leave the "Callback URL field blank." 425 | 426 | # Our goal in doing this is to obtain a Twitter API Key which we need to extract 427 | # Any data from Twitter. TO do this we need to scroll down to the "Application 428 | #Settings section, and then click the blue "manage keys and access token" link 429 | # That is to the right of our Consumer Key 430 | 431 | # The next thing we need to do is tell the twitteR package what our secret 432 | #login details are. I can't write mine in here because if this information 433 | # got out a hacker could use it to pose as me, or get data collected by my 434 | # app which I might not want her or him to have. 435 | 436 | setup_twitter_oauth(consumer_key="TEXTOFYOURKEYHERE", 437 | consumer_secret="TEXTOFYOURSECRETHERE", 438 | access_token="TEXTOFACCESSTOKENHERE", 439 | access_secret="TEXTOFACCESSSECRETHERE") 440 | 441 | 442 | # When we run this last line, it will ask us if we want to use a 443 | # local file to store these "credentials." I am going to say "no" 444 | # and load these into R each time I need them. 445 | 446 | # What this twitteR package is doing for us is simplifying some of 447 | # the complex URL requests we would need to put in each URL call 448 | # we make to the TWitter API. Once all of our authentication 449 | # information is in the system, we have a range of useful commands 450 | # available to us. 
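# (A side note, not from the original script: one way to keep your keys out
# of your code is to store them as environment variables -- for example in an
# .Renviron file -- and read them with Sys.getenv(). The variable names below
# are made up for illustration.)

setup_twitter_oauth(consumer_key    = Sys.getenv("TWITTER_CONSUMER_KEY"),
                    consumer_secret = Sys.getenv("TWITTER_CONSUMER_SECRET"),
                    access_token    = Sys.getenv("TWITTER_ACCESS_TOKEN"),
                    access_secret   = Sys.getenv("TWITTER_ACCESS_SECRET"))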
451 | 452 | #First, we can define a Twitter user whose information we want to scrape 453 | # you can use my name, or feel free to put in your own name 454 | # instead of mine 455 | 456 | user <- getUser("chris_bail") 457 | 458 | # Let's get a list of my "friends"- by friends, the author of this package is referring 459 | # to the name of the people that I follow on Twitter: 460 | 461 | friends<-user$getFriends() 462 | 463 | # Now let's get a list of people who follow me on Twitter: 464 | followers<-user$getFollowers() 465 | 466 | #We can also get a list of all my favorite Tweets: 467 | favorites<-favorites(user) 468 | 469 | #This package also has some nice commands for formatting these 470 | # data as data.frames: 471 | 472 | friendsdata <- twListToDF(friends) 473 | followersdata <- twListToDF(followers) 474 | 475 | #This the command I would use to get a user's tweets: 476 | tweets<-userTimeline(user) 477 | 478 | # I mentioned earlier that Twitter will set shut us 479 | # down if we ask for two much data. this command 480 | # let's me see the limits on what I can do 481 | # within a given time frame: 482 | 483 | getCurRateLimitInfo() 484 | 485 | # Remember that list of top twitter accounts we got? 486 | # Let's see if we can scrape network data from these folks. 487 | # First, let's remind ourselves what these data look like: 488 | 489 | head(twitter_data) 490 | 491 | # So what I am going to want to do is create a for loop 492 | # where I make each person the "user" in each iteration 493 | # and scrape the names of the people they follow: 494 | 495 | # Create a blank data frame to store data we scrape 496 | twitter_network_data<-as.data.frame(NULL) 497 | # figure out how many rows we have to scrape 498 | z<-nrow(twitter_data) 499 | 500 | # start for loop that gets names of people the user 501 | # follows and append them to the dataset we 502 | # just created. Finally take a break between 503 | # pulling each user's Twitter data in order 504 | # to prevent Twitter rate limiting kicking in: 505 | 506 | for(i in 1:z){ 507 | user <- getUser(twitter_data$twitter_handles[i]) 508 | people_user_follows <- user$getFriends() 509 | people_user_follows<-twListToDF(people_user_follows) 510 | people_user_follows$name_of_user<-twitter_data$twitter_handles[i] 511 | twitter_network_data<-rbind(twitter_network_data, people_user_follows) 512 | Sys.sleep(60) 513 | print(i) 514 | } 515 | 516 | # We don't have time to run this loop together, it will take quite a bit 517 | # of time to run. 518 | 519 | 520 | ## There are many many more R packages for working with APIS: 521 | ## Here are a few: `RgoogleMaps`, `Rfacebook`, `rOpenSci` 522 | ##(this one combines many different APIs e.g. the Internet Archive), 523 | ##`WDI`,`rOpenGov`,`rtimes` 524 | ##Many more are available but not yet on CRAN (install from 525 | ##github or using devtools) 526 | 527 | ## There are also APIS that you can use to do analyses, like plotly 528 | # for visualization. 529 | 530 | # But there are still APIs that don't have R packages (many of them) 531 | 532 | # Let's pretend there was no R package for Google Maps, what would we do? 533 | # first: look for patterns 534 | # https://maps.googleapis.com/maps/api/geocode/json?address=Durham,NorthCarolina&sensor=false 535 | # In this case, address goes between the first** `=` **and the** `&` 536 | 537 | findGPS <- function(address,sensor = "false") { 538 | beginning <- "http://maps.google.com/maps/api/geocode/json?" 
539 | paster <- paste0(beginning,"address=", address, "&sensor=false")
540 | return(URLencode(paster))
541 | }
542 | 
543 | findGPS("Durham, North Carolina")
544 | 
545 | # let's put it all together. Note that fromJSON() below is not part of base R:
546 | # you need a JSON package (e.g. rjson, RJSONIO, or jsonlite) loaded first, and the exact indexing of the result can differ slightly between these packages.
547 | page<-findGPS("Durham, North Carolina")
548 | gpscoordinates <- fromJSON(page)
549 | latitude <- gpscoordinates$results[[1]]$geometry$location["lat"]
550 | longitude <- gpscoordinates$results[[1]]$geometry$location["lng"]
551 | gps<-c(latitude, longitude)
552 | gps
553 | 
554 | # we could then wrap them in a loop.
555 | 
556 | 
557 | 
558 | 
559 | 
--------------------------------------------------------------------------------
/Class R Code/Class #5 R Code.R:
--------------------------------------------------------------------------------
1 | # SCRIPT FOR CLASS #5, COMPUTATIONAL SOCIOLOGY
2 | # Instructor: Chris Bail
3 | # Copyright, Chris Bail
4 | 
5 | 
6 | # INTRODUCTION
7 | 
8 | # What can we do with all of the data we collected last week given that
9 | # we can't read it all by ourselves?
10 | 
11 | # Fortunately,
12 | # the fields of computer science and computational linguistics have
13 | # jointly produced a suite of new tools that make our job easier.
14 | 
15 | # Though we once had to hire teams of research assistants to collect,
16 | # standardize, and analyze large corpora, a single computer or group of
17 | # computers can now do this to text-based datasets that are so large that
18 | # human coders could never analyze them all.
19 | 
20 | # But these new techniques will be unfamiliar to you if you come from a
21 | # conventional statistics background. Text-based datasets do not come
22 | # prepackaged. Instead, they are unstructured and usually very messy.
23 | 
24 | # This is often because automated collection of texts produces
25 | # texts that are formatted or structured differently. The first task
26 | # we will discuss in this class is simply how to automate collection
27 | # of texts via the internet.
28 | 
29 | # But in order to analyze the large amounts of texts that can be analyzed
30 | # using these methods, we need to do something equally challenging: we
31 | # need to transform these texts into numbers, so that we can
32 | # classify them using automated tools for text analysis such as topic
33 | # modeling.
34 | 
35 | # First, we need a large corpus of documents. You may already
36 | # have these, but we are going to grab ours from the internet. To
37 | # create our corpus, we are going to use the "tm" package
38 | # in R:
39 | 
40 | #install.packages("tm")
41 | library(tm)
42 | 
43 | # Next, I'm going to read in that political blogs data from our dropbox:
44 | 
45 | blog_data<-read.csv("poliblogs2008.csv", stringsAsFactors = FALSE)
46 | 
47 | # One of the key commands in the tm package is the "corpus" command. This
48 | # creates- you guessed it- a corpus! We need to tell it the name of
49 | # the variable from the data set we want to import, and we also need to tell it that this object is a dataset, since the command can also be used to import a directory of text files or other types of data.
50 | 
51 | #first let's figure out the names of the blog_data dataset:
52 | 
53 | colnames(blog_data)
54 | 
55 | # The one we want is called "documents," let's check one out:
56 | 
57 | blog_data$documents[1]
58 | 
59 | 
60 | #did you notice the funny text in there (e.g. \xe5\xca ?).
This happens if you don't have the correct character encoding 61 | # I'm going to clean up the character encoding before we work with 62 | 63 | blog_data$documents <- iconv(blog_data$documents, "latin1", "ASCII", sub="") 64 | 65 | 66 | blog_corpus <- Corpus(VectorSource(as.vector(blog_data$documents))) 67 | 68 | # That's it! Now our data are in "corpus" format, which is going 69 | # to let us begin to do run basic text processing commands on our 70 | # blog posts, and eventually automated forms of content analysis 71 | # known as topic modeling. 72 | 73 | # We could of course code all of these blog posts by hand, or hire 74 | # a team of undergrads to do this for us, but this would create a 75 | # number of probems ranging from coder-burnout to inter-coder reliability 76 | 77 | # The only alternative to hand coding used to be word count analysis, where 78 | # one simply counts the number of times a word appears in a document. Over 79 | # the past 10 years or so, however, we have taken leaps and bounds in the 80 | # field of automated text analysis. 81 | 82 | # I could introduce you to many different ways of classifying text, but 83 | # for now, we are going to focus on the most popular method at the moment: 84 | # topic modeling. 85 | 86 | # Topic modeling is an automated technique that looks at patterns of how 87 | # words co-appear within documents in order to classify them into latent 88 | # groups of topics. 89 | 90 | # This technique is not perfect, as we will see, but it is much much better 91 | # then keyword analysis. This is because it is better at recognizing the 92 | # polysemy of words-- that is, how words can take on different meanings 93 | # if they occur next to other words. 94 | 95 | # We don't have time to get into the math of topic modeling, but I will 96 | # just briefly tell you that the methods we are going to use are based upon 97 | # a probabilistic Bayesian method known as Latent Dirichlet Allocation, 98 | # which is often abbreviated as LDA. 99 | 100 | # Unfortunately, we cannot just run a simple "lda" command on the corpus 101 | # we created in the previous section of this class. This is because lda 102 | # must analyze numbers, and not words. More specifically, lda requires 103 | # us to create a document term matrix, or a set of numbers that describe 104 | # where different words occur across documents. These are the data that the 105 | # lda algorithms actually analyze. 106 | 107 | # But even before we create a document term matrix, we need to make some 108 | # important decisions. It is common practice in the field of "Natural 109 | # Language Processing" to pre-process text. This is because most text is 110 | # messy- it contains punctuation, variations in spelling, and other 111 | # problems that make the lda algorithms less effective. 112 | 113 | # for example, right now our corpus includes dashes (-) do we really want 114 | # our algorithm to treat this-- or any other punctuation mark-- as a "word" 115 | # that should carry a meaning? Probably not. Fortunately, the tm package 116 | # can remove all punctuation as follows: 117 | 118 | blog_corpus <- tm_map(blog_corpus, content_transformer(removePunctuation)) 119 | 120 | # Also, if we replaced all words in our document with unique identifiers 121 | # right now, the words "dog" and "Dog" would be treated differently 122 | # because one includes a capital letter and the other does not. 
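# (A tiny illustration of the problem just described, not from the original
# script:)

"dog" == "Dog"             # FALSE: R treats these as different "words"
tolower("Dog") == "dog"    # TRUE once everything is lowercased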
123 | 
124 | # This command will make all the words lowercase to get around this
125 | # problem:
126 | 
127 | blog_corpus <- tm_map(blog_corpus, content_transformer(tolower))
128 | 
129 | # to a computer, even a space in between words is treated as something
130 | # that is meaningful, so believe it or not we need to ask R to
131 | # remove spaces before or after a word from our dataset:
132 | 
133 | blog_corpus <- tm_map(blog_corpus, content_transformer(stripWhitespace))
134 | 
135 | # Next, we need to decide what we want to do with extremely common words
136 | # such as "and" or "the." As soon as we move from words to numbers, all
137 | # words are treated equally, but we know that these very common words
138 | # are not going to add much meaning to our analysis.
139 | 
140 | # very common words are often called "stop words," or words we don't want
141 | # to include in our analysis. I put a csv file that contains a popular
142 | # list of stop terms in the dropbox; let's read that in, and use it to
143 | # remove those words from our corpus:
144 | 
145 | stoplist <- read.csv("english_stopwords.csv", header=TRUE, stringsAsFactors = FALSE)
146 | stoplist<-stoplist$stopword
147 | blog_corpus <- tm_map(blog_corpus, content_transformer(removeWords), stoplist)
148 | 
149 | # I want to pause and note that there are some within the field
150 | # who believe stop words should not be removed. Some people believe we
151 | # lose important context. For example, the phrase "I hate the president" is
152 | # much, much different than "I'd hate to be president," but if we remove stop
153 | # words, both phrases would be reduced to "hate" and "president."
154 | 
155 | # Another somewhat controversial issue is whether you should "stem" words.
156 | # Stemming means taking a word like "gladly" and transforming it into
157 | # its root word, which is "glad."
158 | 
159 | # This is actually a fairly complex task that requires some sophisticated
160 | # databases; fortunately, the tm package handles all of that for us
161 | # (under the hood it relies on the SnowballC stemming package)
162 | 
163 | blog_corpus <- tm_map(blog_corpus, content_transformer(stemDocument), language = "english")
164 | 
165 | # Ok, we are now finally ready to create our document-term matrix.
166 | # The command for this in the tm package is:
167 | 
168 | Blog_DTM <- DocumentTermMatrix(blog_corpus, control = list(wordLengths = c(2, Inf)))
169 | 
170 | # I've asked R to only create the matrix for words that are at least 2
171 | # characters long. This is to get rid of some messy stuff that was created
172 | # throughout the text pre-processing stages described above.
173 | 
174 | # Betcha can't wait to look at your first Document-Term matrix, huh?
175 | 
176 | inspect(Blog_DTM[300:310,1000:1002])
177 | 
178 | # some words hardly ever appear in any documents. In order to handle such
179 | # words, we can drop them from our document term matrix because it
180 | # makes our topic models perform a bit better (they don't struggle
181 | # to figure out what to do with these rare terms):
182 | 
183 | DTM <- removeSparseTerms(Blog_DTM, 0.990)
184 | 
185 | # I've now removed terms that appear in fewer than 1% of all documents.
186 | 
187 | # Now that all of our words are properly cleaned, let's take a look
188 | # at some of the most popular terms. The following line finds all
189 | # the words that occur more than 3,000 times in the dataset:
190 | 
191 | findFreqTerms(DTM, 3000)
192 | 
193 | # This is a good step to get a sense of whether or not there
194 | # are still words in your document term matrix that you do
195 | # not want to exert undue influence upon your topic models.
196 | 
197 | # Ok, now we are ready to run a topic model.... finally!
198 | 
199 | # One downside of topic models is they do not automatically
200 | # figure out how many topics exist in a corpus. Ideally,
201 | # we would have a sense of how many there might be. Let's
202 | # take a wild guess and say there are seven topics-
203 | # just for the purpose of illustration.
204 | 
205 | # the number of topics in the LDA model is controlled
206 | # by a parameter called k:
207 | 
208 | k<-7
209 | 
210 | # Now we need to set a bunch of additional parameters. We
211 | # don't have time to walk through what each of these means
212 | # right now, unfortunately. Some of them help us ensure
213 | # that we can get reproducible results; others help us
214 | # assess the fit of our model.
215 | 
216 | control_LDA_Gibbs <- list(alpha = 50/k, estimate.beta = T,
217 |                           verbose = 0, prefix = tempfile(),
218 |                           save = 0,
219 |                           keep = 50,
220 |                           seed = 980, # for reproducibility
221 |                           nstart = 1, best = T,
222 |                           delta = 0.1,
223 |                           iter = 2000,
224 |                           burnin = 100,
225 |                           thin = 2000)
226 | 
227 | # Ok, now let's create a topic model using the "Gibbs" sampling method, and the
228 | # "control" parameters we just declared. Also, we need to install the "topicmodels"
229 | # package
230 | 
231 | #install.packages("topicmodels")
232 | library(topicmodels)
233 | 
234 | my_first_topic_model <- LDA(DTM, k, method = "Gibbs", control = control_LDA_Gibbs)
235 | 
236 | # And then we can look at which words are associated with which topic. Here we look at the top 30 words by topic.
237 | 
238 | terms(my_first_topic_model, 30)
239 | 
240 | 
241 | # I mentioned earlier that there is no way to figure out the appropriate number
242 | # of topics, but we can look at shifts in the log likelihoods produced by
243 | # the LDA and try to identify the point where the curve flattens out.
244 | 
245 | # To do this, however, we have to repeat our LDA again and again. For example,
246 | # this code will run models that have everywhere from 2 to 35 topics
247 | library(parallel)  # mclapply() below comes from the "parallel" package
248 | many_models <- mclapply(seq(2, 35, by = 1), function(x) {LDA(Blog_DTM, x, method = "Gibbs", control = control_LDA_Gibbs)} )
249 | 
250 | # Hat tip to Achim Edelman for this nice function!
251 | 
252 | many_models.logLik <- as.data.frame(as.matrix(lapply(many_models, logLik)))
253 | 
254 | # We can then plot the results to see where we get decreasing returns for
255 | # increasing the number of topics:
256 | 
257 | plot(2:35, unlist(many_models.logLik), xlab="Number of Topics", ylab="Log-Likelihood")
258 | 
259 | # Once we choose the best number of topics, we can change k and
260 | # run our model again
261 | 
262 | k<-10
263 | 
264 | my_first_topic_model <- LDA(Blog_DTM, k, method = "Gibbs", control = control_LDA_Gibbs)
265 | 
266 | # And if we want to see how each document gets assigned to each
267 | # topic, we can simply write:
268 | 
269 | topic_assignments_by_docs <- topics(my_first_topic_model)
270 | 
271 | 
272 | 
273 | #STRUCTURAL TOPIC MODELS
274 | 
275 | # Topic models perform much better with meta-data.
Here is how to use Brandon Stewart's stm package 276 | # I borrow his example verbatim here: 277 | 278 | #install.packages("stm") 279 | library(stm) 280 | processed <- textProcessor(blog_data$documents, metadata = blog_data) 281 | #structure and index for usage in the stm model. Verify no-missingness. 282 | out <- prepDocuments(processed$documents, processed$vocab, processed$meta) 283 | #output will have object meta, documents, and vocab 284 | docs <- out$documents 285 | vocab <- out$vocab 286 | meta <-out$meta 287 | 288 | # use the utility function prepDocuments to process the loaded data to make sure it is in the right format 289 | plotRemoved(processed$documents, lower.thresh=seq(1,200, by=100)) 290 | 291 | # and run the model 292 | poliblogPrevFit <- stm(out$documents,out$vocab,K=20, 293 | prevalence =~ rating+ s(day), max.em.its=75, 294 | data=out$meta,seed=5926696) 295 | 296 | # see the vignette at http://structuraltopicmodel.com/ for many more helpful tools 297 | # for model interpretation and validation (including visualization and identifying illustrative quotes) 298 | -------------------------------------------------------------------------------- /Class R Code/Class #6 R Code.R: -------------------------------------------------------------------------------- 1 | # SCRIPT FOR CLASS #6, COMPUTATIONAL SOCIOLOGY 2 | # Instructor: Chris Bail 3 | # Copyright, Chris Bail 4 | 5 | # Visualizing and Analyzing Data 6 | 7 | # Base R has a variety of routines for visualizing data. Many 8 | # of these are fairly good, but there is widespread consensus 9 | # that the "ggplot2" package provides the most sophisticated 10 | # visualization capacities. Let's take a quick peak at 11 | # some of ggplot's capabilities: 12 | # http://shinyapps.stat.ubc.ca/r-graph-catalog/ 13 | 14 | 15 | # Let's install ggplot2 16 | 17 | install.packages("ggplot2") 18 | 19 | # The data we are going to use today come "built in" with 20 | # the ggplot package. These data describe various 21 | # characteristics of a large sample of diamonds (e.g. their 22 | # size, cut, clarity) 23 | 24 | # to load a built in dataset we need to call ggplot2 and then 25 | # run the data() command 26 | 27 | library(ggplot2) 28 | data("diamonds") 29 | 30 | #2.1 Scatterplots 31 | 32 | #Let's try a basic scatterplot in ggplot2: 33 | 34 | ggplot(diamonds, aes(x=carat, y=price)) + geom_point() 35 | 36 | #`diamonds` is the data set we want to plot 37 | 38 | #"aes" refers to the x,y coordinates we want to plot. 39 | #Note that we did not need to use the 40 | #`$` operator to specify the variable names geom_point() 41 | #describes the type of plot. The `+` indicates this 42 | #is a "layer." We can add many different types of layers 43 | #to a ggplot2 graph, as we will soon see. 44 | 45 | #Not bad, but the graph could be much more informative if 46 | # we added some color. Lets color the points of the graph 47 | # according to the clarity of the diamonds. This variable 48 | # is a factor variable 49 | 50 | ggplot(diamonds, aes(x=carat, y=price, color=clarity)) + geom_point() 51 | 52 | # Conventiently, ggplot automatically creates a legend 53 | # on the right hand side of the graph. 54 | 55 | # But we can go even further by manipulating the size of the 56 | # points. 57 | 58 | ggplot(diamonds, aes(x=carat, y=price, color=clarity, size=cut)) + geom_point() 59 | 60 | # Note that we can also use different kinds of shapes (instead of 61 | # circles) by specifying `shape=` within our "aes" command. 62 | 63 | # Earlier I mentioned that ggplot uses layers. 
Let's go even 64 | #further and add a some smoothing to further illustrate 65 | # the relationship between price and carat 66 | 67 | ggplot(diamonds, aes(x=carat, y=price)) + geom_point() + geom_smooth() 68 | 69 | # The blue line is the result of the smoothing, and the 70 | # grey bars are the standard errors Here we can see 71 | # there is not a 1 to 1 relationship between carat and price. 72 | 73 | # we can put a variety of "options" within geom_smooth if we 74 | # want. For example, if we want to use linear regression to 75 | # draw the trendline, we can write: 76 | 77 | ggplot(diamonds, aes(x=carat, y=price)) + geom_point() + geom_smooth(method="lm") 78 | 79 | # Or, if we want to look at different trend lines for different 80 | # variables, we could run: 81 | 82 | ggplot(diamonds, aes(x=carat, y=price, color=clarity)) + geom_smooth() 83 | 84 | #Note that the points disappeared from our chart because we 85 | #removed the geom_point() layer. 86 | 87 | # Many R users not only like to add layers to individual plots, 88 | # but create many plots beside each other in order to 89 | # communicate even more information. 90 | 91 | #3.2 Facet Wraps 92 | 93 | # To do this, we add something called a "facet_wrap:" 94 | 95 | ggplot(diamonds, aes(x=carat, y=price)) + geom_point() + facet_wrap(~ cut) 96 | 97 | # The tilde here tells R which variable it should use to make 98 | # the separate plots 99 | 100 | # We could of course bring back in color, or change the size 101 | # of the points again. 102 | 103 | # This is really just the very beginning of ggplot's capability 104 | # If this were a course on visualization alone, we could go 105 | # into much greater depth about how to customize titles (ggtitle, 106 | # xlab, ylab), Or how to change the range of an axis (e.g. xlim(0,2)) 107 | 108 | # But for now, let's look at some other types of graphs ggplot 109 | # can produce. Here is a line graph: 110 | 111 | ggplot(diamonds, aes(x=carat, y=price, color=clarity)) + geom_line() 112 | 113 | # 3.3 Histograms, Boxplots, Violin Plots 114 | 115 | # Here is a histogram 116 | 117 | ggplot(diamonds, aes(x=price)) + geom_histogram() 118 | 119 | ## once again, each layer has a range of different options, let's 120 | # say we want the bars to represent more unique values of price. 121 | # to do this, we'd use the bindwidth option 122 | 123 | ggplot(diamonds, aes(x=price)) + geom_histogram(binwidth=100) 124 | 125 | # once again, we could add facet_wraps or other types of 126 | # functionality to this graph, but in the interest of time 127 | # let's keep going and look at some other types of plots 128 | 129 | # For example, ggplot's boxplots are fairly popular 130 | 131 | ggplot(diamonds, aes(x=color, y=price)) + geom_boxplot() 132 | 133 | # We can get a bit more information about the standard 134 | # errors via a "violin plot": 135 | 136 | ggplot(diamonds, aes(x=color, y=price)) + geom_violin() 137 | 138 | 139 | ## Now You Try It: 140 | # 1) Load the `mtcars` **data 141 | # 2) plot the relationship between the `mpg` and `hp` variables in 142 | # the form of a scatterplot with facets for the `gear` variable 143 | # 3) Bonus points: add a title to the graph 144 | 145 | 146 | # Solution: 147 | 148 | ggplot(mtcars, aes(mpg, hp)) + geom_point() +facet_wrap(~gear) +ggtitle("Relationship Between MPG and Horsepower by Number of Gears") 149 | 150 | 151 | ##OTHER VISUALIZATION PACKAGES 152 | 153 | ## Though we have focused on ggplot, There are so many 154 | ## other great packages for visualization in R. 
Check 155 | ## out this "tabplot" package, for example: 156 | 157 | install.packages("tabplot") 158 | library(tabplot) 159 | tableplot(diamonds) 160 | 161 | # Or check out this beautiful map of geo-tagged tweets 162 | # created using "ggmap" a spin-off of ggplot 163 | 164 | 165 | # Finally, I'll show you how to create the heatmap 166 | # I showed you earlier 167 | 168 | nba <- read.csv("http://datasets.flowingdata.com/ppg2008.csv", sep=",") 169 | nba <- nba[order(nba$PTS),] 170 | row.names(nba) <- nba$Name 171 | nba <- nba[,2:20] 172 | nba_matrix <- data.matrix(nba) 173 | nba_heatmap <- heatmap(nba_matrix, Rowv=NA, Colv=NA, col = cm.colors(256), scale="column", margins=c(5,10)) 174 | 175 | # I briefly showed you this example to note how many 176 | # visualization tasks involve some data cleaning 177 | # or re-structuring before they can be performed. 178 | # In this case, we sorted the data, changed row names, 179 | # and transformed the data into a matrix. 180 | 181 | # But many regular applications require even more 182 | # Challenging tasks such as reshaping your data from 183 | # wide to long, or re-formatting variables from 184 | # character to factor, or factor to numeric. 185 | 186 | 187 | ## Exporting plots from R is fairly straitforward. If you are working with a ggplot object, 188 | ## the ggsave command is quite useful: 189 | 190 | myplot<-ggplot(diamonds, aes(x=color, y=price)) + geom_violin() 191 | ggsave(file="organ donation.png", plot=myplot, width=5, height=5, dpi=300) 192 | 193 | 194 | #If you are using another package, wrap your plot in between the png() function 195 | # and the dev.off() function as follows 196 | 197 | # We do this by sandwhich our code in between two new lines: 198 | png(file="nba_heatmap", width=480, height=480) 199 | nba <- read.csv("http://datasets.flowingdata.com/ppg2008.csv", sep=",") 200 | nba <- nba[order(nba$PTS),] 201 | row.names(nba) <- nba$Name 202 | nba <- nba[,2:20] 203 | nba_matrix <- data.matrix(nba) 204 | nba_heatmap <- heatmap(nba_matrix, Rowv=NA, Colv=NA, col = cm.colors(256), scale="column", margins=c(5,10)) 205 | dev.off() 206 | 207 | # A png file entitled nba_heatmap is now saved in our working directory 208 | # we can change the size using the "width" and "height" options 209 | # within the png command above (and you can do the same in the ggsave command) 210 | 211 | 212 | # Because the Twitter API Script takes a long time to run, I ran earlier and saved the data for you in a 213 | # .Rdata file that is in our class dropbox. It is called "Twitter 214 | # Network Data.Rdata". If you have set your working directory 215 | # to be the same folder that you downloaded from Dropbox, 216 | # you can write: 217 | 218 | load("Twitter Network Data.Rdata") 219 | 220 | # Once we have the complete dataset, we could do any number 221 | # of things. Because I'm guessing that many of you are interested 222 | # in network analysis, let's just make a quick network plot using 223 | # the igraph package: 224 | 225 | # First let's install and load the package: 226 | install.packages("igraph") 227 | library(igraph) 228 | 229 | #Next, let's convert our data frame to an "igraph object" 230 | # which is necessary to do any network analysis in this 231 | # package. 
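# (A minimal sketch, not from the original script, of what graph.data.frame()
# expects: the first two columns of the data frame are treated as the edge
# list -- i.e. "from" and "to" -- and any remaining columns become edge
# attributes. The toy column names below are made up.)

toy_edges <- data.frame(from = c("a", "b", "c"),
                        to   = c("b", "c", "a"))
toy_graph <- graph.data.frame(toy_edges, directed = FALSE)
toy_graph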
232 | 233 | twitter_igraph <- graph.data.frame(twitter_network_data, directed=FALSE) 234 | 235 | #Calculating network stats is extremely easy using igraph: 236 | 237 | twitter_betweennes<-betweenness(twitter_igraph) 238 | twitter_closeness<-closeness(twitter_igraph) 239 | twitter_clustering_coefficient<-transitivity(twitter_igraph) 240 | 241 | #...and there are many, many more. Working with two-mode, weighted, and dynamic 242 | # network data is R is also very easy because of its sophisticated database manipulation tools 243 | # as well as a number of different packages such as `sna tnet SoNIA` 244 | 245 | 246 | # Now, let's plot it: 247 | 248 | plot(twitter_igraph) 249 | 250 | # If you got an error message that says "figure margins too large" 251 | # run this code which resets the allowable limits of visualizations: 252 | 253 | par(mar = rep(2, 4)) 254 | 255 | # what a mess! First, we are simply trying to plot too much 256 | # info, we need to drop all the people who have very few 257 | # network ties from the dataset in order to get a cleaner 258 | # picture: 259 | 260 | only_cool_kids<-delete.vertices(twitter_igraph,which(degree(twitter_igraph)<20)) 261 | plot(only_cool_kids) 262 | 263 | # We are still getting an error message about the labels not working. This 264 | # is most likely because some of the characters in the data we read in 265 | # were in a foreign language (remember, our data describe the most popular 266 | # Twitter users around the entire world) 267 | 268 | #To fix this I'm going to change character encoding as follows: 269 | 270 | twitter_network_data$Source<-iconv(twitter_network_data$Source, "latin1", "ASCII", sub="") 271 | twitter_network_data$Target<-iconv(twitter_network_data$Target, "latin1", "ASCII", sub="") 272 | 273 | # now we need to repeat all the steps: 274 | twitter_igraph <- graph.data.frame(twitter_network_data, directed=FALSE) 275 | 276 | # We are still getting a warning message that says some strings were read 277 | # in as NA. This is probably because the character conversion did not 278 | # work for every single language that Twitter users can use: 279 | 280 | # once again, let's prune the network 281 | only_cool_kids<-delete.vertices(twitter_igraph,which(degree(twitter_igraph)<20)) 282 | plot(only_cool_kids) 283 | 284 | # What are those funny loops? Those are people who are following themselves? 285 | # must be a data building error. A quick fix would be to use igraph's 286 | # simplify command which removes these self references 287 | 288 | only_cool_kids<-simplify(only_cool_kids) 289 | plot(only_cool_kids) 290 | 291 | #looking better, but hardly beautiful. Let's try a different layout: 292 | 293 | plot(only_cool_kids, layout=layout.reingold.tilford) 294 | 295 | #Katy Perry rules all! 296 | 297 | plot(only_cool_kids, layout=layout.circle) 298 | 299 | #Everyone is cool! 300 | 301 | plot(only_cool_kids) 302 | 303 | #USING GEPHI FOR NETWORK VISUALIZTION 304 | 305 | # You can spend a long time making network plots look pretty in igraph, but I prefer to use Gephi, because it is 306 | # much more interactive and has a better graphics engine. It also works well with large network datasets, and it can handle 307 | # both node and edge attributes, as well as dynamic/longitudinal network data. 308 | 309 | # See the slides associated with this lecture for instructions about how to install Gephi, import data, and 310 | # visualize/analyze your data. 
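# (A hedged sketch, not from the original script, of one way to move a network
# from R into Gephi: igraph can write interchange formats such as GraphML,
# which Gephi opens directly. The file name here is just an example.)

write.graph(only_cool_kids, file = "twitter_network.graphml", format = "graphml")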
311 | 312 | -------------------------------------------------------------------------------- /Class R Code/Class #7 R Code.R: -------------------------------------------------------------------------------- 1 | # SCRIPT FOR CLASS #7, COMPUTATIONAL SOCIOLOGY 2 | # Instructor: Chris Bail 3 | # Copyright, Chris Bail 4 | 5 | # LINEAR MODELING 6 | 7 | # Most of you are not only interested in visualizing 8 | # your data, but understanding it at a much deeper level. 9 | # We can learn quite a lot from visualization. Everything 10 | # from identifying patterns of missing data to three and 11 | # four way interactions between variables. 12 | 13 | # But most of us also care about "statistical significance," 14 | # to some degree. While there are new debates about the 15 | # meaning of significance in the age of big data, I do not 16 | # think we are going to abandon P-values over night. 17 | 18 | # In this class, we will 19 | # focus on some very basic statistics one can do in R. 20 | # Once again, I will try to show you enough that you 21 | # could learn more on your own, but because R is used 22 | # by everyone from economists to ecologists, the range 23 | # of statistical routines available is truly astounding 24 | # and therefore too broad to cover in one morning- let 25 | # alone one week! 26 | 27 | # As always, I'm going to focus on messy, "real-world" 28 | # examples. These are particularly important this week 29 | # because linear models can create very big problems 30 | # with big data. This includes not only issues related to 31 | # p-values, but also outlier detection, equifinality, 32 | # and a range of other issues. So today we will look 33 | # at some of the data I've collected and make some 34 | # mistakes that I hope will help you avoid similar 35 | # issues in your own work. 36 | 37 | # The other reason I'm teaching you how to run models 38 | # in R is that often computational social science requires 39 | # an iterative approach to data cleaning, data classification, 40 | # and data analysis. If you keep moving data back and forth 41 | # between R and STATA Or SAS, you will waste a lot of time 42 | # converting file formats, etc. Plus, I will try to convince 43 | # you that R is a much better platform for statistical analyses 44 | # because it allows you to use state of the art techniques. 45 | # This is also the reason why I advocate R over Python (stats 46 | # packages in the latter pale in comparison to the former) 47 | 48 | # Let's start by some of the most basic statistical 49 | # analysis available to us in R. We are going to return 50 | # to base R for the time being, but I'll show you 51 | # a few packages later on. 52 | 53 | # First, let's load our Pew data from yesterday using the 54 | # load command: 55 | 56 | load("Pew Data.Rdata") 57 | 58 | # Remember that you will need to specify a file path if the file 59 | # is not in your working directory. To check, write "getwd()" 60 | 61 | # let's try to predict whether people support or oppose 62 | # the ground zero mosque (this variable is labelled pew10) 63 | # 1=oppose construction, 2= support construction 64 | 65 | table(pewdata$pew10) 66 | 67 | # let's create a new factor variable that describes 68 | # whether someone is Republican 69 | 70 | pewdata$Republican<-0 71 | pewdata$Republican[pewdata$partlyn=="Republican"]<-1 72 | 73 | 74 | #Now, let's run a ttest to see if being a republican 75 | # shapes your opinion of the ground zero mosque. To 76 | # do this we use the t.test command. 
Note that we can 77 | # specify our data as an option in this command, or 78 | # write them out using $. 79 | # The ~ in r generally stands for "explained with." 80 | 81 | t.test(pew10 ~ Republican, data=pewdata) 82 | 83 | # here we see a clear cut difference. 84 | 85 | # but was is the direction? To figure this out we 86 | # need to run a correlation. We can use the cor.test 87 | # command 88 | 89 | # CORRELATIONS 90 | 91 | cor.test(pewdata$pew10, pewdata$Republican) 92 | 93 | # we see a negative, significant relationship, so 94 | # this suggests that being republican is negatively 95 | # associated with supporting the Ground Zero Mosque 96 | 97 | 98 | # One of the most useful things about R is that it 99 | # can very easily and efficiently combine statistics 100 | # and visualization. 101 | 102 | # Let's say we wanted to quickly identify variables 103 | # with significant correlations with each other. The 104 | # corrgram package produces a "heatmap-style" correlation 105 | # matrix: 106 | 107 | install.packages("corrgram") 108 | library(corrgram) 109 | corrgram(pewdata) 110 | 111 | 112 | 113 | # MULTIPLE LINEAR REGRESSION 114 | 115 | # but what if we are concerned that age is a confounding 116 | # factor in this relationship. That is older people might 117 | # be less accepting of Muslims. 118 | 119 | # To figure this out, we'd need to run a multiple linear 120 | # regression model. In R, the syntax for this is: 121 | 122 | lm(pew10~Republican+age, pewdata) 123 | 124 | # But this doesn't give us the output we want. This 125 | # is because R stores the output as an object. 126 | # so we need to write something like: 127 | 128 | results<-lm(pew10~Republican+age, pewdata) 129 | 130 | # and then to obtain the results, we need to write 131 | 132 | summary(results) 133 | 134 | # This is a bit annoying, but it can come in handy when 135 | # you want to extract different parts of the results 136 | # and put them into tables or plot them. 137 | 138 | # For example, if we want to get the coefficiences, we 139 | # can use the $ operator to get the coefficients 140 | 141 | x<-results$coefficients 142 | y<-results$residuals 143 | 144 | z<-cbind(x, y) 145 | write.csv(z, file="myresults.csv") 146 | 147 | 148 | # we can also plot the residuals values easily 149 | 150 | plot(results) 151 | 152 | #You need to hit return to see the different types 153 | # of plots available 154 | 155 | #The scatterplot matrix is also pretty cool. Let's 156 | # try it out with the built-in "mtcars" data, because 157 | # it has lots of continuous variables unlike these 158 | # pew data: 159 | 160 | pairs(~ mpg + hp + cyl, data=mtcars) 161 | 162 | #R also includes pretty much every single diagnostic test 163 | #available. For exmaple, here is the command for evaluating 164 | #MultiCollinearity/Variance Inflation Factors: 165 | 166 | install.packages("car") 167 | library(car) 168 | vif(results) 169 | 170 | # Here is the Bonferonni p-value for the most extreme observations 171 | outlierTest(results) 172 | 173 | #comes up negative. Some of them are even interactive: 174 | 175 | influencePlot(results, id.method="identify", main="Influence Plot", sub="Circle size is proportial to Cook's Distance" ) 176 | 177 | 178 | #There are also popular tools for assessing non-normality: 179 | 180 | qqPlot(results) 181 | 182 | # It looks a little funky because it is bimodal. 
Normally we use a qqplot to 183 | #asses wether the data fit a normal distribution 184 | 185 | #See also 186 | leveragePlots(results) 187 | avPlots(results) 188 | 189 | #We can also evaluate homoscedasticity using a non-constant error 190 | #variance test: 191 | 192 | ncvTest(results) 193 | 194 | # look for non-linearity 195 | crPlots(results) 196 | 197 | # looks ok. 198 | 199 | # What about missing data? 200 | install.packages("VIM") 201 | library(VIM) 202 | aggr(pewdata) 203 | 204 | #extremely efficient, huh? 205 | 206 | # we can also combine the matrix scatterplot with a missing data 207 | #analysis as follows: 208 | 209 | #first let's take a subset 210 | subsample<-pewdata[,c("age","sex","pew10")] 211 | marginmatrix(subsample) 212 | 213 | # If you get the "figure margins too large" warning, try making 214 | # your plot window large in RStudio. 215 | 216 | # There are many more ways of doing this, and many more great 217 | # visualization techniques in the VIM package to help you 218 | # identify even more subtle missing data patterns 219 | 220 | 221 | # But from this brief glance we know we have 222 | # Lots of missing data. I'll show you how 223 | # to use multiple imputation very quickly. 224 | # A word of warning, however: multiple imputation 225 | # can make things worse if your model is not 226 | # properly specified. Also, make sure you are 227 | # only passing numeric variables or factor variables 228 | # to the multiple imputation commands: 229 | 230 | # Let's just try imputing for the small 231 | # data set we created above called "subsample": 232 | 233 | library(mice) 234 | 235 | 236 | # now impute! 237 | mice.dat <- mice(subsample,m=10,seed=3) 238 | ## combine datasets 239 | mice.dat <- complete(mice.dat,action=10) 240 | # now we could re-run our analysis if we so choose. 241 | 242 | ## **Now You Try It:** 243 | 244 | #1) Determine whether the relationship between mpg and 245 | #hp of a car is significant when controlling for the number 246 | # of cylinders, quarter second time (qsec), and whether or not 247 | # a car is automatic (using the "am" variable)" 248 | 249 | # solution: summary(lm(mpg~hp+cyl+qsec+am, data=mtcars)) 250 | 251 | 252 | # FIXED-EFFECTS 253 | 254 | # To run a fixed effects model you can simply run: 255 | 256 | fixed<-(lm(pew10~Republican+age+factor(state), data=pewdata)) 257 | 258 | # RANDOM-EFFECTS 259 | 260 | install.packages("lme4") 261 | library(lme4) 262 | random<-lmer(pew10~Republican+age+ (1|state), data=pewdata) 263 | 264 | 265 | # To specify a different distribution of the outcome, use the "family" argument: 266 | 267 | random2<-lmer(pew10~Republican+age+ (1|state), family=poisson, data=pewdata) 268 | 269 | # We get some funky error messages here because we chose a family that 270 | #do not fit the data: 271 | 272 | # To determine whether or not to use random effects we can use 273 | # the Breusch-Pagan test- if it is significant this suggests the 274 | #random model is better way to describe the data. 275 | 276 | install.packages("lmtest") 277 | library(lmtest) 278 | bptest(fixed, random) 279 | 280 | # this suggests we should use the random effects model. 281 | 282 | 283 | #TIME-SERIES 284 | 285 | # R also shines when it comes to time series analysis. 
We are 286 | # going to use the plm package, and some sample data on 287 | # employment in the UK: 288 | 289 | install.packages("plm") 290 | library(plm) 291 | data("EmplUK", package="plm") 292 | head(EmplUK) 293 | 294 | #These "conditioning plots help us see if the relationships 295 | # between sectors really vary across year: 296 | 297 | coplot(wage ~ year|firm, type="l", data=EmplUK) 298 | coplot(wage ~ year|firm, type="l", data=EmplUK) 299 | 300 | #it looks like they do. 301 | 302 | 303 | #I also like to plot the means across organizations 304 | install.packages("gplots") 305 | library(gplots) 306 | plotmeans(wage ~ year, main="Heterogeineity across time", data=EmplUK) 307 | plotmeans(wage ~ firm, main="Heterogeineity across Employment sectors", data=EmplUK) 308 | 309 | # so we are seeing differences across both time and sectors 310 | 311 | # now lets run some models. First, let's run fixed effects: 312 | 313 | fixed1 <- plm(wage ~ capital+output+emp, index=c("firm", "year"), data=EmplUK, model="within") 314 | summary(fixed1) 315 | 316 | #and here's how we would run a random effects model 317 | 318 | random1 <- plm(wage ~ capital+output+emp, index=c("firm", "year"), data=EmplUK, model="random") 319 | summary(random1) 320 | 321 | #test for serial correlation 322 | library(lmtest) 323 | pbgtest(random1) 324 | 325 | #We should also test for cross-sectional dependence/contemporaneous causation 326 | pcdtest(random1, test = c("lm")) 327 | 328 | #And of course, we could test for much, much more... 329 | 330 | 331 | 332 | -------------------------------------------------------------------------------- /Class R Code/Class #8 R Code.R: -------------------------------------------------------------------------------- 1 | # SCRIPT FOR CLASS #8, COMPUTATIONAL SOCIOLOGY 2 | # Instructor: Chris Bail 3 | # Copyright, Chris Bail 4 | 5 | # MACHINE LEARNING 6 | 7 | # Last class we noted a number of problems that plague analysis of large, 8 | # complex datasets. These include issues about the meaning of p-values 9 | # an statistical significance, non-linear relationships between variables 10 | # the high likelihood of interaction effects given the large number 11 | # of variables that can be collected but not analyzed in every possible 12 | # iteration, and- in general- causal complexity, or the likelihood that 13 | # many of the outcomes we are interested in as social scientists do not 14 | # involve a single causal recipe. 15 | 16 | # Today we are going to talk about some exciting new (or new-ish) tools 17 | # that help address some of these problems. Some of them are replacements 18 | # for linear models, but others are probably unlikely to displace linear 19 | # models. Given how deeply engrained linear models are within sociology, 20 | # I think the best way to think about these new techniques are alternative 21 | # ways of analyzing/classifying/modeling your data that will help you 22 | # identify interactions that you might have missed and/or important 23 | # subsets of the data that you might want to code and include in the 24 | # linear models you ultimately use. 25 | 26 | # Once again, you could spend an entire semester on so-called "machine-learning" 27 | # sometimes it is also called "statistical learning." Most of these techniques 28 | # come out of statistics, but they are applied broadly by computer scientists, 29 | # and people in industry in order to both classify data, and- more often- to make 30 | # predictions about individual behavior. 
If you are really taken by the ideas 31 | # we talk about today, I recommend checking out this 15 hour class, which will 32 | # give you a comprehensive overview of the techniques, the math and notation behind 33 | # them, and hands on info about how to implement them in R: 34 | 35 | #http://www.r-bloggers.com/in-depth-introduction-to-machine-learning-in-15-hours-of-expert-videos/ 36 | 37 | # In fact, one of the teachers in this video invented one of the techniques we 38 | # will be talking about today (GAMs). For a less technical overview, check out 39 | #this extremely cool visual tutorial: 40 | 41 | #http://www.r2d3.us/visual-intro-to-machine-learning-part-1/ 42 | 43 | # Instead of going in depth into each technique, we are going to do what we 44 | # always do in this class: try to give you enough of an overview that you 45 | # could pursue this in depth on your own. We will also work through messy 46 | # real-world examples that I hope will help you see the strengths and 47 | # weaknesses of these new models. 48 | 49 | # We are going to focus on three methods in particular: Generalized 50 | # Additive Models, Regression Trees, and Random Forests. 51 | 52 | # GENERALIZED ADDITIVE MODELS 53 | 54 | # One huge problem with many linear models is that they are parametric, 55 | # or defined by functions that describe the data using a very small set of 56 | # parameters. By contrast, GAMs are non-parametric, meaning that the shape 57 | # of the predictor functions is fully determined by the data 58 | 59 | # We can of course build in non-linear transformations of variables 60 | # to account for this problem, but this assumes that we know about the 61 | # problem in the first place. Also, interpretation of such transformations 62 | # can be difficult. For example, do you know what it means if a polynomial 63 | # term is positive or negative? What if there are multiple polynomial 64 | # predictors in your model? To make matters worse, transforming multiple 65 | # predictors within the same model can create colinearity issues. Even 66 | # if we can get around all of these problems, it would take us a ton 67 | # of time- we'd have to try every single type of transformation for 68 | # every single type of variable until we found the one that is the 69 | # best fit for the data. 70 | 71 | # Generalized Additive Models are a unique approach developed in 1986 72 | # by two statisticians. GAMs look a lot like traditional regression models, 73 | # but they are much "smarter." GAMs automatically select the best 74 | # transformation of each variable FOR YOU- these can be both non-linear 75 | # or linear. After "Smoothing" or transforming each variable (where necessary) 76 | # GAMS simultaneously estimate the relationship between each predictor 77 | # and the outcome, outputting coefficients that are very similar to 78 | # a regular linear model. 79 | 80 | # Note that this not only gets us around the issue of having to try 81 | # out different linear combinations of variables we have hunches about 82 | # but also helps us account for potential non-linearities that we did 83 | # not even know to look for. 84 | 85 | # GAMS can also support any type of link function, or in other words, 86 | # any type of dependent variable that might be put into a generalized 87 | # linear model (e.g. binary, continuous, etc.) 
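# (A small runnable sketch of that point, not from the original script: the
# family argument in mgcv's gam() works just like in glm(), so a binary
# outcome can be modeled with smooth terms. The simulated data below are made
# up purely for illustration; mgcv itself is installed a bit further down.)

library(mgcv)
set.seed(1)
sim <- data.frame(x = runif(200))
sim$y <- rbinom(200, 1, plogis(sin(2 * pi * sim$x)))  # non-linear effect of x
logit_gam <- gam(y ~ s(x), family = binomial, data = sim)
summary(logit_gam)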
88 | 89 | # GAMs are kind of a compromise between conventional linear models, 90 | # which are almost always very biased but easy to interpret; and 91 | # new machine learning techniques such as random forests, which are 92 | # very good at representing/classifying relationships between 93 | # variables but very difficult to interpret. This is one of the 94 | # main reasons I don't think we will see random forests in ASR/AJS 95 | # any time soon, but we may well see GAMs. 96 | 97 | # Let's look at an example that was put together by Michael Clark at Notre Dame; 98 | # these are science test scores from the PISA cross-national education data. 99 | 100 | data = read.csv("http://www.nd.edu/~mclark19/learn/data/pisasci2006.csv") 101 | 102 | # and let's take a look at the data: 103 | library(car) 104 | scatterplotMatrix(data) 105 | 106 | 107 | # the red smoothed lines show us there is a lot of potential non-linearity 108 | # in the data. 109 | 110 | # First, let's run a simple linear model so that we can get a baseline. 111 | # We use the gam command, but unless we enclose each predictor in an 112 | # s() it assumes it has a linear relationship with the outcome. 113 | 114 | install.packages("mgcv") 115 | library(mgcv) 116 | first_model <- gam(Overall ~ Income + Edu + Health, data = data) 117 | summary(first_model) 118 | 119 | # Now let's try a model that applies a smoother function to each predictor 120 | second_model <- gam(Overall ~ s(Income) + s(Edu) + s(Health), data = data) 121 | summary(second_model) 122 | 123 | # and now we can plot the relationship between each predictor and the outcome 124 | plot(second_model, pages=1, residuals=T, pch=19, cex=0.25, 125 | scheme=1, col='#FF8000', shade=T, shade.col='gray90') 126 | 127 | # and this neat contour plot lets us look at multiple variables at once 128 | vis.gam(second_model, type = "response", plot.type = "contour") 129 | 130 | # The contour lines (the "lassos") describe the predicted values of the outcome. That is, 131 | # the sweet spot is having both high income and high education. 132 | 133 | # we can also do something called "tensor product smoothing" here, which 134 | # you can think of as a smooth of the smoothers of multiple variables 135 | # at once. 136 | 137 | third_model <- gam(Overall ~ te(Income, Edu), data = data) 138 | summary(third_model) 139 | 140 | # and we can plot it again 141 | vis.gam(third_model, type='response', plot.type='persp', 142 | phi=30, theta=30, n.grid=500, border=NA) 143 | 144 | # Once again, I'm not sure why GAMs don't have more 145 | # influence in sociology. At the very least, I encourage 146 | # you to use them to triangulate your work with linear 147 | # models. And I suppose that with enough work a linear 148 | # model can "resemble" a GAM, so perhaps people are using 149 | # them and then making the relevant transformations within 150 | # the linear models they present in papers? Probably not :) 151 | 152 | # REGRESSION TREES 153 | 154 | # The idea behind regression and classification trees is to group 155 | # datasets into different subsets and use these different subsets 156 | # to predict different pathways to an outcome. Basically, the 157 | # algorithm breaks down the dataset into different subsets or 158 | # regions using a stopping rule (for example, the region must 159 | # include at least five observations). Then, the model simply takes 160 | # the mean outcome for each subset or region, and predicts this 161 | # will be the value of the outcome for this subset of the data.
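# To see what "taking the mean outcome for each region" looks like, here is a
# toy illustration (an addition to the original script) using mtcars and one
# hypothetical split on weight. A real tree searches for the best split itself;
# this just shows what the prediction for each resulting region would be.

light_cars <- mtcars$mpg[mtcars$wt < 3]
heavy_cars <- mtcars$mpg[mtcars$wt >= 3]
mean(light_cars) # the predicted mpg for every car in the "light" region
mean(heavy_cars) # the predicted mpg for every car in the "heavy" region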
162 | # The results take the form of a "tree" which describes how different 163 | # subsets of the data lead one to expect different values for 164 | # the outcome variable. This visualization is arguably much 165 | # easier to read than a regression model and can often 166 | # be a better fit for the data (because it can help identify 167 | # non-linearity or causal complexity in the data). 168 | 169 | # There are two common types of trees used in machine 170 | # learning: regression trees and classification 171 | # trees. Regression trees are for quantitative outcomes, classification 172 | # trees are for categorical outcomes, and both can be fit 173 | # with the "tree" package. 174 | 175 | # Let's take a look at an example which will help make this more 176 | # clear. 177 | 178 | install.packages("tree") 179 | library(tree) 180 | 181 | # the key function we are going to use is called "tree". 182 | # The "mpg~." indicates we want to treat mpg as the outcome 183 | # but look at every other variable in the dataset as a 184 | # possible predictor: 185 | 186 | tree.cars <- tree(mpg~., mtcars) 187 | 188 | # One of the great things about regression trees is that 189 | # you can plot them- they kind of resemble a "flowchart" 190 | # within an organization, or a tree used to classify 191 | # biological organisms. The main difference, in terms 192 | # of interpretation, is that we are examining causal 193 | # pathways to the outcome (in this case mpg), and the 194 | # ways in which different predictors combine to shape 195 | # the outcome (both high and low levels of the outcome). 196 | 197 | plot(tree.cars) 198 | # This adds labels 199 | text(tree.cars, pretty=0) 200 | 201 | # The way to read these trees is to focus on the 202 | # '<' sign. The branches to the left are less than 203 | # and the branches on the right are greater than. 204 | # The "wt" variable here describes weight (in 1,000s of pounds), 205 | # so the tree shows us that this is a big factor in mpg, 206 | # which is not surprising. In fact, all cars with wt below 2.26 207 | # (about 2,260 pounds) get about 30mpg. Those above that cutoff fall into two 208 | # categories: those with fewer than six cylinders and those 209 | # with eight cylinders. The V8 cars with the highest horsepower 210 | # (hp) also have the lowest mpg. Again, this makes sense. 211 | 212 | # To further interpret how well this regression 213 | # tree fits the data, we can summarize the object: 214 | 215 | summary(tree.cars) 216 | 217 | # The number we want to focus on is the "residual mean deviance" 218 | # which tells us how far off the regression tree estimates may 219 | # be. If we run a classification tree- which we are about to do 220 | # -we will get another metric: the misclassification rate, which 221 | # tells us the share of cases that the tree sorts into the wrong category. 222 | 223 | # Finding the best fit for the data usually requires "pruning" 224 | # the tree, or removing branches of the tree that add more 225 | # complexity but less explanatory power. To do this we use the 226 | # prune.tree command (a short cross-validation sketch follows this example). 227 | 228 | prune.cars <- prune.tree(tree.cars, best=5) 229 | plot(prune.cars) 230 | text(prune.cars, pretty=0) 231 | 232 | # In this case, our tree was already so simple that pruning it 233 | # did not add parsimony. Note that the same pruning process 234 | # can be used for classification trees, as I will soon discuss.
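# Here is the cross-validation sketch mentioned above (an addition, not part of
# the original script): cv.tree() refits the tree on folds of the data and
# reports the deviance for each candidate tree size, which gives a more
# principled way to decide how far to prune.

set.seed(123)
cv.cars <- cv.tree(tree.cars)
cv.cars$size # candidate numbers of terminal nodes
cv.cars$dev  # cross-validated deviance for each size (lower is better)
plot(cv.cars$size, cv.cars$dev, type="b",
     xlab="Number of terminal nodes", ylab="Cross-validated deviance")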
235 | 236 | # Let's run through a quick classification tree example: we'll 237 | # use the Pew data from a previous class in order to classify 238 | # those who did and did not support the construction of the 239 | # ground zero mosque. 240 | 241 | load("Pew Data.Rdata") 242 | install.packages("tree") 243 | library(tree) 244 | tree.groundzero <- tree(pew10~educ+sex+age+inc+partyln, pewdata) 245 | plot(tree.groundzero) 246 | text(tree.groundzero, pretty=0) 247 | 248 | # The only difference in this process is that the algorithm selects 249 | # the most common value of the outcome given the subset or "branch" 250 | # of the tree instead of the mean value, which is what regression 251 | # trees use. 252 | 253 | # Another neat option for creating trees is the "party" package. 254 | # In addition to the same tree structure generated by the "tree" 255 | # package above, it adds plots of the cases within each branch 256 | # of the tree. Let's take a look: 257 | 258 | install.packages("party") 259 | library(party) 260 | prettycartree <- ctree(mpg~., data=mtcars) 261 | plot(prettycartree) 262 | 263 | 264 | # These aren't the best datasets to use. I find that regression 265 | # and classification trees are most useful when you have 266 | # very large datasets with many different variables. I encourage 267 | # you to try this out on different datasets- unfortunately 268 | # I did not have a better dataset on hand. 269 | 270 | 271 | 272 | # RANDOM FORESTS 273 | 274 | # The main downside of regression trees is that they are not the most 275 | # accurate tool available. In fact, if all relationships between 276 | # predictors and outcomes are linear, then a linear model will 277 | # be more efficient. 278 | 279 | # The principal reason that regression trees are not the most efficient way 280 | # of making predictions about data is that they only use one "round" 281 | # of subsetting the data. Therefore, regions with high variance 282 | # create problems. But what if instead of partitioning the data 283 | # only once, we did it hundreds or even thousands of times in slightly 284 | # different ways and then averaged the results? This is what is known 285 | # as bootstrapping, and in the language of regression trees, it is 286 | # often described as "bagging." Random forests are an extension of 287 | # bagging that includes a small tweak that "decorrelates" the bagged 288 | # trees. It does this by taking a random sample of predictor variables 289 | # at each split, and then choosing the best one among them as the branch. The 290 | # random selection of predictors helps keep one very strong predictor 291 | # from crowding out other predictors that have a more 292 | # moderate yet still meaningful association with the outcome. 293 | 294 | # Unfortunately, when we extract a large number of subsets 295 | # from the data using bagging or random forests, we sacrifice 296 | # the interpretability of a single regression tree. That is, we 297 | # cannot construct a tree that visualizes all the bagged/randomly 298 | # partitioned datasets. 299 | 300 | 301 | # On the other hand, we can still measure how important each predictor 302 | # is, or how important that "branch" is across the trees, by 303 | # examining how much it reduces the residual sum of squares. 304 | 305 | # Let's try it out.
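# (A quick aside, added here and not part of the original script: "bagging," as
# described above, is just a random forest that is allowed to consider every
# predictor at every split. With the randomForest package, which the class code
# installs and loads immediately below, you get plain bagging by setting mtry
# to the number of predictors.)

library(randomForest) # loaded here so this aside runs on its own
set.seed(123)
bag.cars <- randomForest(mpg ~ ., data=mtcars,
                         mtry=ncol(mtcars) - 1, # consider all 10 predictors at each split
                         importance=TRUE)
bag.cars # compare its error to the random forest fit below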
306 | install.packages("randomForest") 307 | library(randomForest) 308 | 309 | set.seed(123) 310 | rf.cars <- randomForest(mpg~., data=mtcars, importance=TRUE) 311 | 312 | # To create the variable importance plot, we use this line: 313 | varImpPlot(rf.cars) 314 | 315 | 316 | # The plot on the left shows us how much error we would add to our predictions if we 317 | # scrambled (permuted) that variable, measured by the increase in mean squared error. The 318 | # plot on the right describes 319 | # the total decrease in "node impurity" that comes from splitting on the variable- 320 | # this is another measure of how important it is to have 321 | # the variable in the model that explains the outcome (and more specifically 322 | # how consistently the outcome is correct for each observation 323 | # within that branch; a perfectly "pure" node or branch would 324 | # describe all cases exactly). 325 | 326 | # But the best way to figure out how solid your predictions are is to 327 | # use a training dataset or subset of the data to create the model, and 328 | # then see how well it predicts the outcomes in the rest of the data. 329 | 330 | 331 | # The most recent advance in regression trees is the concept of 332 | # boosting, which is similar to bagging but allows trees to grow 333 | # sequentially. If you want to learn more about these models, 334 | # check out the book "An Introduction to Statistical Learning" 335 | # and the gbm package. 336 | 337 | 338 | 339 | 340 | 341 | -------------------------------------------------------------------------------- /Class R Code/Class #9 R Code.R: -------------------------------------------------------------------------------- 1 | # SCRIPT FOR CLASS #9, COMPUTATIONAL SOCIOLOGY 2 | # Instructor: Chris Bail 3 | # Copyright, Chris Bail 4 | 5 | # The vast majority of the code we are using this week is for the program "NetLogo." While it is possible to 6 | # run NetLogo from R using the RNetLogo package, I opted not to do this because this package is currently 7 | # unstable. Instead, I provided more detailed annotation in the class slides for this week than I usually do, 8 | # so most of the activities for this class will not involve this code. 9 | 10 | # At the very end of the class, I explain how to write a simulation from scratch in R. The code below is adapted 11 | # from James Kitts and it implements the Threshold Model of Collective Behavior developed by Granovetter. In 12 | # this model, riots emerge because of a critical threshold wherein the risks of participating in the riot 13 | # decrease given a sufficient number of people who participate. (A bare-bones sketch of this threshold logic appears just below, before the full example.) 14 | 15 | # As in NetLogo, we must first initialize our simulation. But in R, we need to be a little more explicit (NetLogo 16 | # knows we may define parameters using sliders, but R does not know this, because there are no 17 | # sliders or buttons!). Here's what the Schelling Model looks like (note the large number of functions 18 | # within functions- just like NetLogo).
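# Before the Kitts code, here is the bare-bones sketch of the threshold model
# described above (an addition, not part of the original script): each person
# has a personal threshold and joins the riot once the share of current rioters
# reaches it. With thresholds spread evenly from 0 to 99/100, a single
# instigator tips the next person, who tips the next, and the whole population
# cascades.

n_people   <- 100
thresholds <- (0:(n_people - 1)) / n_people   # person i tips once i% of the population riots
rioting    <- thresholds == 0                 # the lone instigator starts things off

repeat {
  share       <- mean(rioting)                # current share of the population rioting
  new_rioting <- thresholds <= share          # everyone whose threshold is now met
  if (identical(new_rioting, rioting)) break  # nobody new tipped: we are at equilibrium
  rioting <- new_rioting
}

mean(rioting) # final share rioting (here 1; raise the second-lowest threshold and the cascade dies)

# Now, on to the full example adapted from Kitts: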
19 | 20 | 21 | 22 | # Number of people 23 | n <- 1000 24 | 25 | # The people will live in a square with area side^2 26 | side <- ceiling(sqrt(n)) 27 | 28 | df <- data.frame(x=((0:(n-1)) %% side), 29 | y=floor((0:(n-1)) / side), row.names=0:(n-1)) 30 | 31 | # Most lots will have a race; some will be empty, ie uninhabited 32 | races <- c("forestgreen", "dodgerblue", "darkred") 33 | 34 | # Assign races iid uniformly; leave roughly 10% of lots empty 35 | df$race <- sample(c(races, "empty"), n, replace=TRUE, 36 | prob=c(rep(0.90 / length(races), length(races)), 37 | 0.10)) 38 | 39 | PlotNeighborhood <- function() { 40 | 41 | with(subset(df, race != "empty"), 42 | plot(x, y, col=race, pch=20, axes=FALSE, 43 | xlab="", ylab="", xlim=c(0, side), ylim=c(0, side), 44 | main="A Schelling-esque Neighborhood")) 45 | 46 | } 47 | 48 | dev.new(height=8, width=8) 49 | par(mar=rep(1, 4), oma=rep(1, 4)) 50 | PlotNeighborhood() 51 | savePlot("neighborhood_before_movement.png") 52 | 53 | # Neighbors are counted within a Chebyshev distance <= depth 54 | depth <- 3 55 | 56 | CountNeighbors <- function(i) { 57 | 58 | # Count people of each race in person i's immediate neighborhood 59 | curr.x <- i %% side 60 | curr.y <- floor(i / side) 61 | neighbors <- subset(df, x %in% (curr.x - depth):(curr.x + depth) & 62 | y %in% (curr.y - depth):(curr.y + depth) & 63 | !(curr.x == x & curr.y == y)) 64 | 65 | return(sapply(races, function(x) { sum(neighbors$race == x) })) 66 | 67 | } 68 | 69 | # Apply CountNeighbors to the entire data frame; append results to df 70 | df <- cbind(df, t(sapply(0:(n - 1), CountNeighbors))) 71 | 72 | df$num.neighbors <- rowSums(df[ , races]) 73 | 74 | # Minimum fraction of own-race neighbors 75 | fraction <- 0.30 76 | 77 | GetPeopleWhoWantToMove <- function(curr.race) { 78 | 79 | subset.who.want.to.move <- 80 | subset(df, race == curr.race & 81 | get(curr.race) / num.neighbors < fraction) 82 | 83 | return(rownames(subset.who.want.to.move)) 84 | 85 | } 86 | 87 | AdjustNeighborCounts <- function(row, race.of.mover, delta) { 88 | 89 | curr.x <- as.integer(row) %% side 90 | curr.y <- floor(as.integer(row) / side) 91 | 92 | df[df$x %in% (curr.x - depth):(curr.x + depth) & 93 | df$y %in% (curr.y - depth):(curr.y + depth) & 94 | !(curr.x == df$x & curr.y == df$y), 95 | c(race.of.mover, "num.neighbors")] <<- 96 | df[df$x %in% (curr.x - depth):(curr.x + depth) & 97 | df$y %in% (curr.y - depth):(curr.y + depth) & 98 | !(curr.x == df$x & curr.y == df$y), 99 | c(race.of.mover, "num.neighbors")] + delta 100 | 101 | } 102 | 103 | MoveOnePerson <- function() { 104 | 105 | # Returns 1 if a person was successfully moved, and 0 otherwise 106 | 107 | people.who.want.to.move <- c(lapply(races, GetPeopleWhoWantToMove), 108 | recursive=TRUE) 109 | 110 | if (!length(people.who.want.to.move) >= 1) return(0) 111 | 112 | # Of people who want to move, pick one uniformly at random 113 | person.who.will.move <- sample(people.who.want.to.move, size=1) 114 | 115 | race.of.mover <- df$race[rownames(df) == person.who.will.move] 116 | 117 | possible.new.homes <- rownames( 118 | subset(df, race == "empty" & 119 | get(race.of.mover) / num.neighbors >= fraction)) 120 | 121 | if (!length(possible.new.homes) >= 1) return(0) 122 | 123 | # Of acceptable new homes, choose one uniformly at random 124 | new.home <- sample(possible.new.homes, size=1) 125 | 126 | df[rownames(df) == new.home, ]$race <<- race.of.mover 127 | df[rownames(df) == person.who.will.move, ]$race <<- "empty" 128 | 129 | AdjustNeighborCounts(person.who.will.move, 
race.of.mover, -1) 130 | AdjustNeighborCounts(new.home, race.of.mover, +1) 131 | 132 | return(1) 133 | 134 | } 135 | 136 | RunSimulation <- function(max.movements = 5000, plots.in.loop=TRUE) { 137 | 138 | par(mar=rep(1, 4), oma=rep(1, 4)) 139 | 140 | for(i in 1:max.movements) { 141 | 142 | if (!MoveOnePerson()) break 143 | if (plots.in.loop & (i %% 50) == 0) PlotNeighborhood() 144 | 145 | } 146 | 147 | } 148 | 149 | library(animation) 150 | 151 | saveVideo(RunSimulation(), 152 | video.name="schelling_neighborhood_model.mp4", 153 | interval=0.20, outdir=getwd()) 154 | 155 | dev.new(height=8, width=8) 156 | par(mar=rep(1, 4), oma=rep(1, 4)) 157 | PlotNeighborhood() 158 | savePlot("neighborhood_after_movement.png") -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Computational-Sociology 2 | 3 | This repository is for materials and group projects associated with my Computational Sociology course at Duke University. 4 | 5 | 6 | 7 | 8 | --------------------------------------------------------------------------------