├── Class Projects
│   ├── Code for Class.R
│   ├── Collecting Data for Github Project.R
│   └── Sankey.html
├── Class R Code
│   ├── Class #1 R Code.R
│   ├── Class #2 R Code.R
│   ├── Class #3 R Code.R
│   ├── Class #4 R Code.R
│   ├── Class #5 R Code.R
│   ├── Class #6 R Code.R
│   ├── Class #7 R Code.R
│   ├── Class #8 R Code.R
│   └── Class #9 R Code.R
├── Class Slides
│   ├── Class #1 Slides.html
│   ├── Class #2 Slides.html
│   ├── Class #3 Slides.html
│   ├── Class #4 Slides.html
│   ├── Class #5 Slides.html
│   ├── Class #6 Slides.html
│   ├── Class #7 Slides.html
│   ├── Class #8 Slides.html
│   └── Class #9.Slides.html
└── README.md

/Class Projects/Code for Class.R:
--------------------------------------------------------------------------------
 1 | 
 2 | # read in document from Google Docs
 3 | 
 4 | our_interests<-read.csv("https://docs.google.com/spreadsheets/d/1zIeZ-9fbnCM1AQ3vt_OT8Kqwdqup2VA90b2m0jnkzkM/pub?gid=0&single=true&output=csv", row.names=1)
 5 | 
 6 | # cluster our interests
 7 | 
 8 | 
 9 | # Ward Hierarchical Clustering
10 | # create distance matrix
11 | distance_matrix <- dist(our_interests, method = "euclidean")
12 | fit <- hclust(distance_matrix, method="ward.D") # note: "ward" was renamed "ward.D" in newer versions of R (see also "ward.D2")
13 | # display dendrogram
14 | plot(fit)
15 | groups <- cutree(fit, k=5) # cut tree into 5 clusters
16 | # draw dendrogram with red borders around the 5 clusters
17 | rect.hclust(fit, k=5, border="red")
18 | 
19 | 
20 | # Non-hierarchical cluster analysis
21 | 
22 | kmeans_clusters <- kmeans(our_interests, 5)
23 | library(cluster)
24 | clusplot(our_interests, kmeans_clusters$cluster, color=TRUE, shade=TRUE,
25 |          labels=2, lines=0)
26 | 
27 | 
28 | # Now where are the good puzzles?
29 | 
30 | # http://www.unc.edu/~ncaren/cite_network_full/cites.html
31 | 
--------------------------------------------------------------------------------
/Class Projects/Collecting Data for Github Project.R:
--------------------------------------------------------------------------------
1 | # Collecting data for group project
2 | 
3 | print("hello world")
4 | 
--------------------------------------------------------------------------------
/Class R Code/Class #1 R Code.R:
--------------------------------------------------------------------------------
 1 | # SCRIPT FOR CLASS #1, COMPUTATIONAL SOCIOLOGY
 2 | # Instructor: Chris Bail
 3 | # Copyright, Chris Bail
 4 | 
 5 | 
 6 | # GETTING STARTED WITH YOUR WORKING DIRECTORY
 7 | 
 8 | # Setting Your Working Directory
 9 | 
10 | # First let's identify your "working directory," or the
11 | # place where the files you want to work with are located.
12 | # At first they were online in our class's Dropbox folder,
13 | # but you have since downloaded them onto your computer.
14 | # Identifying the working directory is important because
15 | # you will need to know it in order to load files, import
16 | # data, and export graphs or other types of analysis.
17 | 
18 | # In order to identify your working directory, highlight
19 | # the line below and then press "Return" while holding down
20 | # "Control." This tells RStudio that you want to "run" or
21 | # execute whatever line you are working on. You can also
22 | # use the "Run" button in the upper right-hand side of this
23 | # pane of RStudio.
24 | 
25 | getwd()
26 | 
27 | # You should now see the output of this command below. By
28 | # default, R sets the working directory to the "home" folder
29 | # on your computer, or the folder that contains the file you
30 | # double clicked on.
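# (An optional aside, not part of the original class script: it can be handy
# to save the default working directory in a variable before you change it,
# so you can return to it later. The object name below is just illustrative.)

starting_directory <- getwd()
# ... change the working directory and do some work ...
# setwd(starting_directory)   # run this line to go back to where you started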
31 | 32 | # Often you will want to change the working directory, either 33 | # because you want to work with data in a new folder, or 34 | # because you want to tell R to save your work to a folder 35 | # that is more convenient for your work flow 36 | 37 | # the command below will set your working directory to be 38 | # your desktop 39 | 40 | setwd("~/Desktop") 41 | 42 | # The ~ sign here replaces the more detailed name of your 43 | # computer for example, if I were to use the complete name 44 | # of my desktop folder is: setwd("/Users/christopherandrewbail/Desktop) 45 | # I am going to set my home folder as follows: 46 | 47 | setwd("/Users/christopherandrewbail/Desktop/Dropbox/ODUM R COURSES/Intro to R Class Dropbox/") 48 | 49 | 50 | # Next, let's take a look at what documents are in your 51 | # home folder. 52 | 53 | list.files() 54 | 55 | # Basic Operations in R 56 | 57 | # Perhaps the most basic thing one can do is use 58 | # R as a calculator 59 | 60 | 1+1 61 | 62 | # Now let's create our first object or variable in R 63 | # To do this, you need to use the "<-" operator 64 | 65 | my_number<-2 66 | 67 | # we have now created a numeric variable whose value 68 | # is 2. Note that you can also use the "=" sign if you 69 | # prefer (my_number=2). 70 | 71 | # Notice in the top right hand pane of Rstudio there is 72 | # now a value for my_number. 73 | 74 | # now lets try some basic operations 75 | 2*my_number 76 | 2+my_number 77 | 2-my_number 78 | my_number/3 79 | my_number^3 80 | 81 | # if we want to store the results of these basic 82 | # operations, we could use the "<-" operator again 83 | 84 | my_new_number<-2*my_number 85 | 86 | # when naming variables or objects in r, try to 87 | # avoid terms that may confuse r because they are 88 | # similar to commands. For example, don't name a 89 | # variable "mean" or "median." Also, keep in mind 90 | # that R is case sensitive. If one letter is 91 | # accidentally capitalized, your command won't 92 | # work. 93 | 94 | # We can also create character or "string" variables 95 | # by using either double or single quotation marks. 96 | 97 | my_name<-"Georg Simmel" 98 | 99 | # If we want to see the variable, we can use this 100 | # command 101 | 102 | print(my_name) 103 | 104 | # -------------------------------------------------------------------------------- /Class R Code/Class #2 R Code.R: -------------------------------------------------------------------------------- 1 | # SCRIPT FOR CLASS #2, COMPUTATIONAL SOCIOLOGY 2 | # Instructor: Chris Bail 3 | # Copyright, Chris Bail 4 | 5 | 6 | 7 | Vectors 8 | 9 | # Many objects in R are vectors. These are sequences 10 | # of multiple variables. We define a vector as follows 11 | 12 | my_vector<-c(1, 3, 4, 9) 13 | 14 | 15 | # Next, Let's try out some basic operations on 16 | # numeric vectors: 17 | 18 | mean(my_vector) 19 | median(my_vector) 20 | max(my_vector) 21 | min(my_vector) 22 | summary(my_vector) 23 | 24 | # Note that vectors can also be sequences of strings 25 | 26 | my_word_vector<-c("Roy Williams","Is","The Best") 27 | 28 | # often, you will want to grab one variable within a 29 | # vector. This command, for example, selects the third 30 | # number in my_word_vector 31 | 32 | my_word_vector[3] 33 | 34 | # Let's pause to try this out. 
Here's an excercise: 35 | # 1) create your own vector of numbers; 36 | # 2) create a new variable that is the mean of 37 | # your vector 38 | 39 | # Example Solution: 40 | 41 | my_new_vector<-c(100,200, 549) 42 | average_vector<-mean(my_new_vector) 43 | average_vector 44 | 45 | 46 | #Matrices 47 | 48 | # Vectors are a basic building block of matrices, 49 | # another critical type of object in R. To create 50 | # a matrix, we use the "matrix()" function. 51 | 52 | my_matrix <- matrix(c(1,2,1,2, 64000,38000,100000,200000, 53 | 1,5,17,21 ), nrow = 4, ncol = 3) 54 | 55 | # the first value required by this function is a 56 | # vector of numbers or characters. We use nrow and ncol 57 | # to specify the number of rows and columns. 58 | 59 | # to look at our matrix, you can run this line: 60 | my_matrix 61 | 62 | # or, you can click on "my_matrix" in the upper-right 63 | # pane of RStudio. 64 | 65 | # often, we will need to grab one row of a matrix, or 66 | # one column. To do this, we use the "," operator: 67 | 68 | my_matrix[1,] 69 | 70 | # The "," operator specifies whether you are requesting 71 | # the rows or the columns of the matrix. To request 72 | # the first column, we would run 73 | 74 | my_matrix[,1] 75 | 76 | # To get the value of a cell within a matrix, we need 77 | # to tell R about both the row and the column: 78 | 79 | my_matrix[1,2] 80 | 81 | # 64,000 is the number that is in the second column of the 82 | # first row 83 | 84 | Lists 85 | 86 | # A third important type of R object 87 | # is a list. Lists are like vectors, but unique 88 | # in that they may contain multiple types of 89 | # data (e.g. strings, numbers, or even matrices) 90 | 91 | # Let's create a list 92 | 93 | my_list<-list(9, "Roy Williams", my_matrix) 94 | 95 | # Let's take a look 96 | my_list 97 | 98 | # Let's say we wanted to grab "Roy Williams" from 99 | # our list. We can just write: 100 | 101 | my_list[2] 102 | 103 | 104 | # "Why are we spending so much time with Matrices and Lists?" 105 | # you may ask. It is because many forms of programming 106 | # require a basic familarity with matrices and lists, and 107 | # if you get into working with big data you will almost 108 | # surely need o know how to work with them. 109 | 110 | #Data Frames 111 | 112 | # Matrices and lists are also important because they are 113 | # the building blocks of what may be the most important 114 | # type of object in R: data frames. 115 | 116 | # Data frames are very similar to datasets you might load 117 | # into Stata/SPSS/SAS in that they have rows, columns, and 118 | # column names, etc. 119 | 120 | # In order to create a data frame, we can use the 121 | # following command on our matrix: 122 | 123 | my_data_frame<-as.data.frame(my_matrix) 124 | 125 | # Note that there is now a new object in the upper 126 | # right "Environment" pane of RStudio. If we click 127 | # up there, we see that R has already chosen some 128 | # arbitrary names for our columns (V1, V2, V3). 129 | 130 | # R uses some clunky syntax to change column names. 131 | # This is worth our time, however, because column 132 | # names often change when you are manipulating 133 | # data 134 | 135 | # lets change "V1" to "Sex" 136 | 137 | colnames(my_data_frame)[colnames(my_data_frame)=="V1"]<-"Sex" 138 | 139 | # But let's say we want to use words instead of numbers to 140 | # describe sex. 
In this case, we need to change the 141 | # contents of the data frame as follows: 142 | 143 | my_data_frame$Sex[my_data_frame$Sex==1]<-"Female" 144 | my_data_frame$Sex[my_data_frame$Sex==2]<-"Male" 145 | 146 | # That was a mouthful, huh? The "$" operator is 147 | # how you tell R that you are looking for a specific 148 | # variable within the data frame. 149 | 150 | # now lets look at our data frame 151 | 152 | my_data_frame 153 | 154 | # Now let's figure out the sex breakdown of our 155 | # data using the "table" command. 156 | 157 | table(my_data_frame$Sex) 158 | 159 | # Ok, let's step back again so that you can try 160 | # this out on your own: 161 | # 1) Change the name of the Second column in 162 | # my_data_frame to "Income"; 163 | # 2) Calculate the median of the Income variable 164 | 165 | # MANIPULATING DATA 166 | 167 | # Until now, we have been working at a very 168 | # abstract level. This is because I needed 169 | # to teach you some basic concepts before we 170 | # can start to work with real data. 171 | 172 | # R Data Files have the extension .Rdata 173 | # We will work with these soon, but let's 174 | # begin by pulling in other types of data 175 | # files, because it's unlikely that you 176 | # will be working with an .Rdata file if 177 | # you are coming from another program 178 | # such as STATA. 179 | 180 | # Importing Spreadsheets 181 | 182 | # R has a variety of ways of importing data. 183 | # For example, data often comes in .csv 184 | # format. To read this, we use the read.csv 185 | # command 186 | 187 | sample_csv_data<-read.csv("Sample_CSV_Data.csv") 188 | 189 | # As the upper right hand pane of RSTudio 190 | # now shows, these data have 9909 observations 191 | # and 406 variables. 192 | 193 | # By default, R has assumed that the first 194 | # line of these data are the variable names. 195 | # to list all of the variable names, we can 196 | # write 197 | 198 | colnames(sample_csv_data) 199 | 200 | # We do not have the dictionary for these data, 201 | # so we can only guess what these codes mean. 202 | 203 | # R also treats any strings as factors. This can 204 | # become problematic later if you try to perform 205 | # operations on string variables that are actually 206 | # factor variables 207 | 208 | # In order to see the "class" of a variable- 209 | # or whether it is a numeric, character, or 210 | # factor variable, we can use the class() command 211 | 212 | class(sample_csv_data$Institution_Name) 213 | 214 | # Yep, it's a factor. If we want to prevent R 215 | # from defaulting to this behavior, we can add 216 | # an option to our read.csv command. Options 217 | # for most commands are specified by a comma 218 | # after the name of the object you want to apply 219 | # the command to. 220 | 221 | # to illustrate this point a bit better, let's 222 | # look at the "help" file for read.csv. Earlier 223 | # I said there is no manual for R. The "help" 224 | # file is the closest thing we've got, and it's 225 | # not always great. 226 | 227 | ?read.csv 228 | 229 | # now we can see that there are many different 230 | # types of options that can be specified. Let's 231 | # try: 232 | 233 | sample_csv_data<-read.csv("Sample_CSV_Data.csv", 234 | stringsAsFactors=FALSE) 235 | 236 | # this tells R not to import strings as factors. In many 237 | # cases, you will want to add lots of different options 238 | # to an R command. We will get to these cases soon. 239 | 240 | # But before we do, let's try to import some other types 241 | # of data. 
For example, what if you are a STATA user 242 | # trying to make the transition to R so that you can 243 | # analyze some Stata Data using a technique that is 244 | # only available in R? 245 | 246 | #Installing Packages and Importing Data 247 | 248 | # To do this, we need to install a new package in R. 249 | # Until now, we have been using "Base R" which refers 250 | # to all of the standard commands that come when you 251 | # download R. But most users will want to take advantage 252 | # of the rapidly expanding number of packages available. 253 | # Indeed, some of these have become so instrumental for 254 | # computational sociology that I cannot imagine life without them. 255 | 256 | # To open Stata data we are going to use the "Haven" 257 | # package written by a fellow named Hadley Wickham. He 258 | # is one of the most prolific authors of R packages for 259 | # computational social science and is very well respected 260 | # within the R community. 261 | 262 | # To add a package onto R, we use the install.packages 263 | # command 264 | 265 | install.packages("haven") 266 | 267 | # Though you only need to install a package once you 268 | # must "call" it within individual R scripts as follows: 269 | 270 | library(haven) 271 | 272 | # You can also do this by writing require(haven) 273 | 274 | # Here is where things can get messy. There is a group 275 | # called the R Core Development Team which oversees and 276 | # approves R packages in order to make them more useable. 277 | # in order to get your package approved you have to write 278 | # a help file, so we can write. 279 | 280 | # to find these help packages, you can either navigate 281 | # to the "packages" pane of RStudio on the lower right 282 | # pane, or you can google the name of the package to 283 | # find the CRAN site (This stands for the Comprehensive 284 | # R Archive Network). 285 | 286 | # Often you can also find a "vignette" or a pdf document 287 | # that not only explains some of the commands in the 288 | # package but applies them to real data. These are often 289 | # easier to follow then the help files themselves. 290 | 291 | # In this case, I know we want the "read_stata" command: 292 | 293 | sample_stata_data<-read_stata("Sample Stata Data.dta") 294 | 295 | # note that this "Haven" package also allows you to read 296 | # SPSS and SAS files, and write R files into these formats 297 | # as well. 298 | 299 | # If you plan to work with text data or other types of web- 300 | # based data you will probably encounter different types of 301 | # data structures that we do not have time to cover in this 302 | # class, but will be covered in my course on Thursday. For 303 | # example, JSON data, or html data. 304 | 305 | # Subsetting Data Frames 306 | 307 | # Manipulating data is a core task of computational social science. 308 | #A recent New York Times Article suggests 80% of data scientists' 309 | # time is spent cleaning data, while only 20% of their time 310 | # is spent analyzing it. See: 311 | #http://www.nytimes.com/2014/08/18/technology/for-big-data-scientists-hurdle-to-insights-is-janitor-work.html?_r=0 312 | 313 | # This is because many data sets are either unstructured, or semi- 314 | # structured, or because they have large amounts of missing 315 | # data, or because they have to be manipulated in order to 316 | # be analyzed for one reason or another 317 | 318 | # First, let's work on subsetting data. This simply means breaking 319 | # up a data frame into chunks. 
The syntax is similar to that we 320 | # used when we worked with matrices. For example, let's say 321 | # we want to take all respondents from our stata dataset who are 322 | # less than 50. The variable we want is called "age." 323 | 324 | respondents_under_50<-sample_stata_data[sample_stata_data$age<50,] 325 | 326 | # Once again, this is some tricky syntax. We first need to tell R 327 | # which dataset we want to manipulate. Everything inside the 328 | # parentheses is our instructions to R about what subset we want. 329 | # Remember that the "," before the last "]" here is critical. 330 | # We are are telling R that we want all rows that meet the criteria. 331 | # This is also the first time we have used a "logical operator" 332 | # in this case "<" you can also use ">" and "<=" 333 | 334 | # There are also a variety of useful commands to identify missing 335 | # data. This is important because often when one is working with 336 | # big data one cannot simply eyeball the data to identify patterns 337 | # of missing-ness. 338 | 339 | # First, let's drop all rows of the dataset that have any missing 340 | #data. To do this, we use the complete.cases() command: 341 | 342 | no_missing_data<-sample_csv_data[complete.cases(sample_csv_data),] 343 | 344 | # this dropped every single row. This is because of the structure 345 | # of this dataset (certain questions were asked of some respondents 346 | # but not others). 347 | 348 | # more often, you might want to identify all rows that are missing 349 | # data on one single variable in order to identify patterns of 350 | # missingness. Let's load some new data from the Dropbox in 351 | # order to illustrate this 352 | 353 | pewdata<-read.csv("Sample_Pew_Data.csv") 354 | 355 | # working with different datasets is useful because it gives you 356 | # a sense of the range of different problems you might encounter 357 | # with data cleaning. In this data set, for example, missing data 358 | # was coded as 9 instead of "NA" (or empty cells, which R would 359 | # have read in as NA). 360 | 361 | # lets look at missing data on the "pew10" variable, which is about 362 | # whether people supported the construction of the "Ground Zero" 363 | # mosque in New York in 2011. First, let's change the 9's to NAs 364 | 365 | pewdata$pew10[pewdata$pew10==9]<-NA 366 | 367 | missing<-pewdata[is.na(pewdata$pew10),] 368 | 369 | #If we want to take all the values where "pew10" is NOT 370 | #missing, we would do this: 371 | 372 | no_missing<-pewdata[!is.na(pewdata$pew10),] 373 | 374 | # Note that is.na() is a logical operator. If we write 375 | 376 | is.na(pewdata$pew10) 377 | 378 | # we see TRUE/FALSE values for each row of the data frame on this 379 | # variable. 380 | 381 | #Recoding Variables 382 | 383 | # now lets say we wanted to find all of the men with missing data. 384 | # First let's find the variable 385 | 386 | colnames(pewdata) 387 | 388 | # now let's see how the variable is coded 389 | table(pewdata$sex) 390 | 391 | # Looks like 1s and 2s. I happen to know that 1=Male in these 392 | # data, so: 393 | 394 | missing<-pewdata[is.na(pewdata$pew10)& pewdata$sex==1,] 395 | 396 | # Note again that we need the "," because we are telling R 397 | # we want the rows. If we wanted to trim columns from the data 398 | # we would need to put the content we want after the ","- 399 | # we can either use the numbers of the columns or their names. 
Let's 400 | # say we just want the two variables we've been working with so 401 | # far: 402 | 403 | gender_and_mosque<-pewdata[,c("sex","pew10")] 404 | 405 | # remember that the "c()" operator is necessary here because we 406 | # are asking for multiple variables. 407 | 408 | # let's say we wanted everything but the first column in the dataset. 409 | # First we would need to know the number of columns. We can use 410 | # ncol() for this purpose 411 | 412 | ncol(pewdata) 413 | 414 | # Then we simply tell R we want rows 2 to 52 using the ":" operator, 415 | # which indicates a sequence. 416 | 417 | no_first_column<-pewdata[,2:52] 418 | 419 | # I also want to note that we could combine the two steps as follows: 420 | 421 | no_first_column<-pewdata[,2:ncol(pewdata)] 422 | 423 | # I'm noting this because it will be helpful to know that this is 424 | # possible when we discuss programming later in this class. 425 | 426 | # You now know the basics of manipulating a data frame in R. Let's 427 | # pause for another exercise: 428 | # 1) Figure out the age of the oldest man in the dataset 429 | 430 | #Reshaping Data Frames 431 | 432 | # Another very common task in computational sociology is reshaping data. For 433 | # example, suppose we wanted to examine partisanship by race. The 434 | # Patyln variable describes the following question within the Pew Data: 435 | # "As of today do you lean more to the Republican Party" or more to The 436 | # Democratic party" The possible answers are 1: Republican, 2: Democrat; 437 | # 9: Missing. 438 | 439 | # It's annoying that these are not already correctly coded, but this is 440 | # a common task in computational sociology, so first, let's recode the numeric 441 | # data into strings or characters: 442 | 443 | pewdata$partyln[pewdata$partyln==1]<-"Republican" 444 | pewdata$partyln[pewdata$partyln==2]<-"Democrat" 445 | pewdata$partyln[pewdata$partyln==9]<-NA 446 | 447 | # let's check to make sure it worked: 448 | 449 | table(pewdata$partyln) 450 | 451 | # Now we also need to recode the race variables. 452 | 453 | pewdata$race[pewdata$race==1]<-"White" 454 | pewdata$race[pewdata$race==2]<-"African American" 455 | pewdata$race[pewdata$race==3]<-"Asian or Pacific Islander" 456 | pewdata$race[pewdata$race==4]<-"Mixed Race" 457 | pewdata$race[pewdata$race==5]<-"Native American" 458 | pewdata$race[pewdata$race==6]<-"Other" 459 | pewdata$race[pewdata$race==9]<-NA 460 | 461 | table(pewdata$race) 462 | 463 | # we can get a cross tab by doing this: 464 | table(pewdata$partyln, pewdata$race) 465 | 466 | # Just for fun, let's save our cleaned up dataframe 467 | # in R format- we'll use it for some analysis tomorrow 468 | save(pewdata, file="Pew Data.Rdata") 469 | 470 | # but let's say we want the average age by race. As is 471 | # common with R, there are many different ways to do this. 472 | # let's continue using base R. For the record, one could use 473 | # the "plyr" package, the "reshape" package, and the 474 | # "data.frame" package, just to name a few. 475 | 476 | aggregate(pewdata$age, by=list(pewdata$race), FUN=mean) 477 | 478 | # What if we want the average age by both race and party? 479 | aggregate(pewdata$age, by=list(pewdata$race, pewdata$partyln), FUN=mean) 480 | 481 | # And once again we could store these data as follows: 482 | 483 | age_by_race<-aggregate(pewdata$age, by=list(pewdata$race), FUN=mean) 484 | 485 | # Merging Data Frames 486 | 487 | # Another very common task you might face in R is merging multiple 488 | # datasets. 
This is one of the most common tasks you might encounter 489 | # in data cleaning and manipulation precisely because R can have 490 | # so many objects loaded in memory at once. 491 | 492 | # Imagine, for example, that we want to add average income by race 493 | # to our dataset that describes average age. I put a very 494 | # small spreadsheet in the Dropbox that describes average 495 | # income by race. 496 | 497 | race_income_data<-read.csv("Income By Race.xlsx") 498 | 499 | # this gives us an error, because this is an .xlsx 500 | # file, and not a .csv file. This is a total pain. 501 | # Because the file is so small, we might be tempted to 502 | # either a) open Excel and save it as .csv or b) 503 | # just input the data manually into R. 504 | 505 | # But what if this dataset were huge, or had some funky 506 | # character encoding that would be lost if you saved it 507 | # as .csv? This is a common problem when working with 508 | # big data. 509 | 510 | # In this type of situation, you would need to do some 511 | # research. I said earlier that R does not have a manual 512 | # One can, however, search for R help via RSTudio by typing 513 | # ?? before the search term 514 | ??xlsx 515 | 516 | # Bit this is a bit confusing, it does not give us good 517 | # advice on which of these packages might be best, or why 518 | # A better solution is simply to google your problem 519 | # let's try "Import .xlsx into R" 520 | 521 | # The very first result comes from a site called "Stack 522 | # Overflow. In my opinion this is by far the most reliable 523 | # site for information about R, and other programming 524 | # language as well. This is because it boasts an extermely 525 | # large and diverse user base, a sort of "hive mind" 526 | 527 | # Here is the site: 528 | # http://stackoverflow.com/questions/7049272/importing-xlsx-file-into-r 529 | 530 | # At the top of the page is a user who is asking a question 531 | # The question gets "votes" that indicate how important it 532 | # is for the question to be answered. 533 | 534 | # First, there are a variety of comments on the question, asking 535 | # for clarification or recommending other resources. 536 | 537 | # Below, there are answers. First, note that there are eleven 538 | # answers to the question! This shows both the potential and the 539 | # disorganization of R. Which answer is best? Each answer gets 540 | # votes by other users, so we can see here that the best answer 541 | # is the first one. (You should also note that individual users 542 | # have different reputation scores, and you might use those as 543 | # a guide as well). 544 | 545 | # The consensus on this page is to use the XLConnect package. 546 | # first, we need to install it: 547 | 548 | install.packages("XLConnect") 549 | 550 | # let's try it out 551 | race_income_data <- readWorksheet(loadWorkbook("Income By Race.xlsx"),sheet=1) 552 | 553 | # It didn't work. Why? Because we did not call the packages. 554 | # Let's try again: 555 | 556 | library(XLConnect) 557 | race_income_data <- readWorksheet(loadWorkbook("Income By Race.xlsx"),sheet=1) 558 | 559 | # That was a lot of work for a little reward, but again 560 | # the point was to help you learn how to resolve a real- 561 | # world type of situation. 562 | 563 | # Ok, now that we have finally read the data into R 564 | # we can now merge it together with our data frame. 565 | # We could do this within base r using the "merge" 566 | # command, but it is a bit clunky. 
Most folks now 567 | # prefer to use the "plyr" package because it is 568 | # faster and more intuitive. 569 | 570 | install.packages("plyr") 571 | library(plyr) 572 | 573 | #The command for merging datasets is called "join" 574 | merged_data<-join(age_by_race, race_income_data) 575 | 576 | # This looked like it worked, but if we view the 577 | # merged dataset, we see that it added NAs instead 578 | # of the values 579 | 580 | View(merged_data) 581 | 582 | # Why? This particular command from the plyr package 583 | # automatically searches for column names that are 584 | # shared by both files. Let's check things out with 585 | # colnames() 586 | 587 | colnames(age_by_race) 588 | colnames(race_income_data) 589 | 590 | # When we ran the "aggregate" command above, it gave 591 | # the columns new, arbitrary names. We need to fix 592 | # the column names so that they are the same across 593 | # The datasets 594 | 595 | colnames(age_by_race)[colnames(age_by_race)=="Group.1"]<-"race" 596 | colnames(age_by_race)[colnames(age_by_race)=="x"]<-"age" 597 | 598 | # Let's try to merge again 599 | 600 | merged_data<-join(age_by_race, race_income_data) 601 | 602 | # Once again, it looks like it worked. But if we view 603 | # the data again, we see that only the income of 604 | # Whites was added. 605 | 606 | # Note that R did not give us an error message. This 607 | # is because it did exactly what we asked it to do: 608 | # merge all the rows that could be merged. But this 609 | # is the type of easy mistake that can create major 610 | # headaches further down the line. This is why it's 611 | # important to always view or table() your dataframes 612 | # after you manipulate them. 613 | 614 | # In order to diagnose the problem, lets table race 615 | # in both dataframes 616 | 617 | table(age_by_race$race) 618 | table(race_income_data$race) 619 | 620 | # Aha, we can now see that most of the races were not 621 | # merged because they were only in one of the two 622 | # data frames. We ALSO see that the African American 623 | # row in the age_by_race data frame was not merged 624 | # because the race_income_data uses the term "Black." 625 | # The terms for Asians also need to be recoded 626 | # Let's change this so that our merge will work: 627 | 628 | race_income_data[race_income_data=="Black"]<-"African American" 629 | race_income_data[race_income_data=="Asian"]<-"Asian or Pacific Islander" 630 | 631 | # And now let's try the merge again 632 | 633 | merged_data<-join(age_by_race, race_income_data) 634 | 635 | View(merged_data) 636 | 637 | # finally, it worked. 638 | 639 | # This is the conclusion of the first Day of this Class. 640 | # My goal was to help you get R up and Running and master 641 | # some of the basic object types and data manipulation 642 | # commands. These are by far the most frustrating parts 643 | # of learning R. Tomorrow, we will begin to get to analysis, 644 | # visualization, and programming, which is really where 645 | # R begins to shine. 646 | 647 | 648 | 649 | 650 | 651 | 652 | -------------------------------------------------------------------------------- /Class R Code/Class #3 R Code.R: -------------------------------------------------------------------------------- 1 | # SCRIPT FOR CLASS #3, COMPUTATIONAL SOCIOLOGY 2 | # Instructor: Chris Bail 3 | # Copyright, Chris Bail 4 | 5 | 6 | 7 | #1: Introduction to Programming 8 | 9 | # Though STATA, SPSS, and SAS provide basic forms of programming, 10 | # R blows them out of the water. 
This is because R is built upon 11 | # extremely powerful languages such as C++, but also because it has 12 | # the object-oriented and open-source characteristics necessary 13 | # to interface with other programming languages such as Python. 14 | 15 | # At the same time, the syntax of programming in R can be rather 16 | # funky. People who come to R from other languages such as C++ 17 | # or Python often report being frustrated by its clunky syntax. 18 | # There are ways to make R simulate the syntax of other programming 19 | # languages, but I will proceed to teach you the "R way" because 20 | # I assume that most of you are hoping to use R as your main 21 | # programming tool. 22 | 23 | #I should note that many people believe that the first programming 24 | # language you learn is the hardest. Once you get a sense of the 25 | # basic concepts of programming, it is much easier to translate 26 | # what you know from one language to another. 27 | 28 | # 2: Functions 29 | 30 | # The most basic form of programming is a function. We've actually 31 | # already been using them extensively throughout the course. But 32 | # we've been using them without seeing the "source code" or the 33 | # complicated list of instructions that R processes each time 34 | # we run a command such as corrgram, or tableplot 35 | 36 | # A function is simply a set of instructions or tasks that one 37 | # may apply to any type of object in R. Let's take a very basic 38 | #function 39 | 40 | my_function <- function(x) x+2 41 | 42 | # This function takes a number (x) and adds two to the number. 43 | # let's try it: 44 | 45 | my_function(2) 46 | 47 | # functions can get much, much more complicated. 48 | another_function <- function(x, y, z) { 49 | x <- mean(x)^2 50 | y <- cos(y)-5 51 | z <- log(z)*200 52 | c(x,y,z) 53 | } 54 | 55 | # this function requires three inputs (x, y, and z). The part 56 | # between the brackets tells R what we want to do to each of these 57 | # three inputs. The "c()" tells R that we want to display the results 58 | # if we did not include the c(), the function would still run, but 59 | # we would need to type "x,""y," or "z," to see the results for each 60 | # variable. 61 | 62 | # If you are just getting started out in R, you will probably not 63 | # write too many of your own functions, but you will probabbly soon 64 | # begin borrowing functions from others that you find online. It 65 | # is also important that you understand how a function works in 66 | # case you begin borrowing segments of other people's codes. If 67 | # you do not understand why their code works, you probably will 68 | # not be able to modify it to suit your own purposes. 69 | 70 | # 3: Loops 71 | 72 | # Another central type of programming in R is the "for" loop. This 73 | # is one of the oldest types of programming in computer science. We 74 | # might use a for loop when we want to repeat some type of function 75 | # or transformation across a large number of rows in a data frame, or 76 | # a large number of files in a folder. 77 | 78 | #let's begin with a very simple example: 79 | 80 | for (i in 1:6){ 81 | print("Jim Moody is bad-a$$") 82 | } 83 | 84 | # Let's start working with an example to illustrate. Let's say we 85 | # have a folder full of .csv files that describe different health 86 | # indicators from OECD, but we are really only interested in data 87 | # about Korea. 88 | 89 | # Let's build a for loop that opens each file, grabs the data from 90 | # korea, and then makes another data frame. 
We need to begin with 91 | # a few steps that may seem strange or unnecessary, but it will 92 | # soon become clear why we need to do them. 93 | 94 | # first, we need to tell R where the data are. I've placed it in 95 | # the dropbox in a folder entitled "OECD Health Data. Let's use 96 | # list.files to count the number of files. 97 | 98 | list.files("OECD Health Data") 99 | 100 | #The first thing we need to do with a for loop is initialize it, or tell it how 101 | # many times we want it to repeat the action. We therefore need 102 | # to count the number of files " 103 | 104 | filenames<-list.files("OECD Health Data") 105 | number_of_files<-length(filenames) 106 | 107 | # now, let's create an empty data frame to store our data: 108 | koreadata<-as.data.frame(NULL) 109 | 110 | # now, let's loop into each file 111 | 112 | for(i in 1:number_of_files){ 113 | 114 | filepath<-paste("OECD Health Data/", filenames[i], sep="") 115 | data<-read.csv(filepath, stringsAsFactors = FALSE) 116 | newdata<-data[data$Location=="Korea",] 117 | newdata$indicator<-filenames[i] 118 | koreadata<-rbind(koreadata,newdata) 119 | } 120 | 121 | # There is quite a lot to explain here. Let's begin by the first 122 | # line. The "i" here is the variable we are going to loop through. 123 | # so if i=1, then we are looking at the first file in the folder. If 124 | # i=2 we are looking at the second file in the folder, etc. 125 | 126 | # the 1:number_of_files, tells R that we want to repeat the steps 127 | # within the loop for values between 1 and number_of_files. In this 128 | # case our number_of_files variable equals 5, so we are telling R 129 | # to repeat these steps for all five files in our folder. 130 | 131 | # Everything within the brackets is what we want r to do for each 132 | # file. 133 | 134 | # The first thing we want it to do is open the csv file, but to 135 | # do this we need to tell it the full file path of the file. We 136 | # could type filenames[i] but this would just get us the name of 137 | # the file, and not the whole file path. 138 | 139 | # to create the file path, we are using the paste function. 140 | # this function takes two strings and joins them together. The 141 | # sep here refers to what we want R to put in between the two 142 | # strings, in this case, we want nothing, so we put no text or 143 | # spaces in between the quotation marks. 144 | 145 | # the second line in the loop simply reads in the .csv file 146 | #using the file path we just created. 147 | 148 | # The third line selects only the data for Korea, which we can 149 | #find because all of the .csv files we are reading in have used 150 | # the same column names, and the same capitalization for the term 151 | # "Korea." 152 | 153 | # In the fourth line, we are creating a new variable in the data 154 | # frame we created in the preceding line that describes the name 155 | # of the metric. In this case, the names are sloppy because they 156 | # include all of the .csv formatting. We could clean this up 157 | # using a command such as gsub() but let's keep it simple for now 158 | 159 | # The final line in the loop is critical. We are telling R to 160 | # take this new data frame we created and append it to the blank 161 | # data frame we created before we started the loop. With each 162 | # iteration of the loop, the data frame gets one more row. 
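# (An optional follow-up sketch: the comments above note that the messy
# indicator names could be cleaned with gsub(). Assuming each file name
# simply ends in ".csv", one way to strip the extension after the loop
# finishes would be:)

koreadata$indicator <- gsub(".csv", "", koreadata$indicator, fixed = TRUE)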
163 | 164 | # That was a detailed explanation, but my goal was to try and 165 | # work in a few useful commands into a practical example which 166 | # you might encounter in your own work. 167 | 168 | # There are other types of loops in R that we do not have time 169 | # to cover (e.g. "while" loops, and if/else statements). My hope 170 | # is that if you want to learn more about these types of loops 171 | # you now have a base level of knowledge to learn about them. 172 | 173 | # Loops are slow in most languages, but particularly in R. You 174 | # may never care about speed if you are only working with datasets 175 | #<10,000 observations in R, but if you want to get into big data 176 | # you will probably want to look into activities at loops. 177 | 178 | # On the other hand, you can also chose to be a "hack" or a sloppy 179 | # programmer, and simply run your code on a really powerful 180 | # machine. I'll describe how to do this later during this class. 181 | 182 | ======================================================== 183 | # **Now you try it: 184 | 185 | # Write a forloop that goes through each variable in our Pew Dataset and replaces values of 9 with NA. 186 | 187 | #Hint: you may find the `ncol` function useful. 188 | 189 | #SOLUTION 190 | 191 | number_of_columns<-ncol(pewdata) 192 | for (j in 1: number_of_columns){ 193 | pewdata[,j][pewdata[,j]==9]<-NA 194 | } 195 | 196 | 197 | # 4: Vectorized functions 198 | 199 | # One of the reasons that R is slow is that it is not a compiled 200 | # language. In other words, you don't have to run a "set up" type 201 | # of program before you do your analysis. 202 | 203 | # R can access compiled commands through a process called "vectorization" 204 | # It is not really important for you to understand what the difference 205 | # is. The important thing is that you will probably encounter 206 | # other people using vectorized commands because they are faster 207 | # and it is therefore important for you to understand how they 208 | # work. 209 | 210 | # Vectorized functions within R are known as "apply" functions. 211 | # There are different types of apply commands for different 212 | # types of r objects. We are just going to look briefly at the one 213 | # for data frames, though there are also apply commands for lists 214 | # and arrays. 215 | 216 | # let's try to read our OECD Health files into R using apply. Once 217 | # again, we need a list of the names of the files: 218 | 219 | filenames<-list.files("OECD Health Data") 220 | 221 | # And now let's paste the file path into them 222 | filenames<-paste("OECD Health Data/", filenames, sep="") 223 | 224 | # and now let's apply the read.csv command to each file: 225 | data<-lapply(filenames,read.csv) 226 | 227 | # just one line! Note that the data is now in list format 228 | # and we'd have to clean it up to make it comparable to 229 | # the data we created within the for loop. 230 | 231 | # The important thing isn't the usefulness of this command 232 | # in this context, but in other, larger datasets. The apply 233 | # command is particularly powerful because we can apply 234 | # whatever function we want to our filenames- either other 235 | # people's r functions or our own. 236 | 237 | # the syntax for apply commands can become somewhat opaque 238 | # because they do not spell out the functions. Also, one has 239 | # to choose the appropriate apply command for the object in 240 | # question. 
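# (A quick illustrative sketch of that difference: lapply() always returns
# a list, while sapply() tries to simplify its result to a vector or matrix.)

squares_as_list   <- lapply(1:5, function(x) x^2)   # a list with 5 elements
squares_as_vector <- sapply(1:5, function(x) x^2)   # a numeric vector of length 5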
A useful resource on the apply command is this 241 | # blog post: 242 | # http://www.r-bloggers.com/using-apply-sapply-lapply-in-r/ 243 | 244 | # One final note: you can speed up plyr and dplyr commands 245 | # by specifying the "parallel processing" options that allow 246 | # r to take advantage of multiple CPUs that you may have on 247 | # your machine. This can be particularly helpful if you use 248 | # the very powerful Amazon machines- or other cluster computing 249 | # technologies- described in section 4.6 below 250 | 251 | # 5: Piping 252 | 253 | # At the risk of giving you too many different options for 254 | # programming, I'm going to introduce you to one of the newer, 255 | # more cutting edge ways of programming in R. This is called 256 | # piping. 257 | 258 | # Piping is a way of passing data and functions in code without 259 | # initializing or iterating. Many people find it more intuitive 260 | # because it is a) less complex, and b) can be coded in a less 261 | # cluttered manner. 262 | 263 | # let's take a quick peak at the maggritr package 264 | 265 | install.packages("magrittr") 266 | library(magrittr) 267 | 268 | # The key contribution of this package is the `%>% operator. 269 | # Whatever is on the left side of this operator gets passed 270 | # to the right side. 271 | 272 | # Let's look at some data on baby naming from the Social 273 | # Security administration. 274 | 275 | install.packages("babynames") 276 | library(babynames) 277 | 278 | # The real power of %>% comes when you combine it with other 279 | # packages. Let's combine it with the dplyr package for data 280 | # reshaping/manipulation: 281 | 282 | # first, lets take the babynames data and pass it through the 283 | # "filter" command in dplyr which lets us request only names 284 | #where the first three letters start with "Ste." Then we will 285 | # use the group_by function of the same package to reshape 286 | # the data by year and sex. Finally, we will count the totals, 287 | # and plot it using ggplot 288 | 289 | library(dplyr) 290 | library(ggplot2) 291 | 292 | babynames %>% 293 | filter(name %>% substr(1, 3) %>% equals("Ste")) %>% 294 | group_by(year, sex) %>% 295 | summarize(total = sum(n)) %>% 296 | qplot(year, total, color = sex, data = ., geom = "line")%>% 297 | add(ggtitle('Names starting with "Ste"')) %>% 298 | print 299 | 300 | # Notice that we never created a variable, a blank data frame 301 | # or any other object. Once again, for some, this is much easier 302 | # to follow. Regardless of whether you find it more intuitive, 303 | # you would probably agree that it is quicker to write. 304 | 305 | # 6: Debugging your code 306 | 307 | # Whether you are brand-new to coding or whether you've been 308 | # doing it for years, it is extremely easy to make small mistakes 309 | # that can make your code fail. 310 | 311 | # Consider, for example, a for loop that never closes its brackets, 312 | # or a loop that uses the same letter to represent two different 313 | # variables in a model. 314 | 315 | # In order to catch these annoying problems, we need to "de-bug" 316 | # our code. Thankfully, R has a number of built in tools as well 317 | # as user contributed packages that can help us do this. 318 | 319 | # Perhaps the easiest way to debug your code, however, is right 320 | # here in RStudio. 321 | 322 | # You've probably noticed by now that RStudio will try to complete 323 | # the code you write. Once you define a data frame, for example 324 | # it can help you write variable names, etc. 
It can also help 325 | # you find options within a function. 326 | 327 | # You may have also noticed a red dot to the left of your code 328 | # or "Script" window. This describes some type of error. Usually 329 | # it is a syntax error, or some type of code that would result in 330 | # an error message in R. 331 | 332 | # This is particularly useful if you are looking at a very large 333 | # amount of code. It may be something as simple as realizing that 334 | # you did not load a package before calling a function. 335 | 336 | # RStudio also helps you find where brackets and parentheses 337 | #begin and end in your code. 338 | 339 | # RSTudio also has more sophisticated debugging tools that are 340 | # described in detail here: 341 | # https://support.rstudio.com/hc/en-us/articles/205612627-Debugging-with-RStudio 342 | 343 | # One final note on programming. If you want to get into more 344 | # advanced programming in R, I highly suggest the following 345 | # site: http://adv-r.had.co.nz authored by Hadley Wickham 346 | 347 | -------------------------------------------------------------------------------- /Class R Code/Class #4 R Code.R: -------------------------------------------------------------------------------- 1 | # SCRIPT FOR CLASS #4, COMPUTATIONAL SOCIOLOGY 2 | # Instructor: Chris Bail 3 | # Copyright, Chris Bail 4 | 5 | 6 | # INTRODUCTION 7 | 8 | # This class is designed to introduce you to the basic techniques 9 | # for collecting large corpora-- or text-based data-- using R. 10 | 11 | # For example, the techniques you will learn in this class can be used 12 | # to scrape text data from websites, extract social media messages or 13 | # other types of texts from sites such as Twitter, Facebook, or Google, 14 | # or automate the collection of text using other internet-based tools. 15 | 16 | # When people think about large text-based data sets, they tend to think 17 | # immediately of social media sites or blogs. Yet one of the most 18 | # exciting things about recent years is that we are witnessing vast 19 | # archive of historical archives as well. 20 | 21 | # Consider, for example, Google's nGram dataset, which is based upon 22 | # digital copies of nearly every book in the English language, and many 23 | # other languages as well. It is also increasingly easy to get historical 24 | # newspaper data or television transcripts. Librarians across the world 25 | # are rapidly digitizing hand-written texts from across the ages. 26 | 27 | # These new wellsprings of data present unprecedented possibilities for 28 | # academics, yet they also raise a number of new challenges. Fortunately, 29 | # the fields of computer science and computational linguistics have 30 | # jointly produced a suite of new tools that make our job easier. 31 | 32 | # Though we once had to hire teams of research assistants to collect, 33 | # standardize, and analyze large corpora, a single computer or group of 34 | # computers can now do this to text-based datasets that are so large that 35 | # human coders could never analyze them all. 36 | 37 | # But these new techniques will be unfamiliar to you if you come from a 38 | # conventional statistics background. Text-based datasets do not come 39 | # prepackaged. Instead, they are unstructured and usually very messy. 40 | 41 | # This is often because automated collection of texts often produces 42 | # texts that are formateed or structured differently. 
The first task 43 | # we will discuss in this class is simply how to automate collection 44 | # of texts via the internet. 45 | 46 | # COLLECTING TEXT-BASED DATA 47 | 48 | # Before I describe automated techniques for collecting text-based data 49 | # I'd like to point out that there is already a vast amount of data 50 | # out there that has already been compiled. For example, the New York 51 | # Times offers a large dataset of its articles, as does Reuters. Google 52 | # also makes is nGrams data available to the public. There are also a 53 | # variety of archives of Wikipedia data. 54 | 55 | # It is important to ask yourself whether you might be able to take 56 | # advantage of text-based datasets that someone else has collected 57 | # because you may be underestimated the amount of time it takes to collect 58 | # vast amounts of data. On the one hand, new technologies make this 59 | # easier than ever, but on the other hand the inherent messiness of 60 | # automated text collection-- from inconsistent file formats to spelling 61 | # differences to character encoding problems-- can make collecting 62 | # your own text-based datasets quite a hassle. 63 | 64 | # But if you are here, it is probably because you want to learn how 65 | # to build your own datasets. And this is probably where the greatest 66 | # value added is given that this is really a new frontier. 67 | 68 | # 1.1 Screen-Scraping. 69 | 70 | # Unfortunately, however, we are no longer in the "Wild Wild West" of big 71 | # data. Only several years ago one could easily mine or "scrape" vast amounts 72 | # of data from giant archives of information such as Google or Amazon. 73 | 74 | # Yet major corporations have become wise to the value of their data, and the 75 | # vast majority of sites now prevent you from scraping large amounts of data. 76 | # There are some important exceptions to this, but by in large, sites such as 77 | # Facebook, Twitter, or JSTOR will shut you down if you try to grab too 78 | # much text in an automated fashion. 79 | 80 | # I should also warn you that automated collection of text-based is also 81 | # often not only discouraged, but illegal. Years ago, several academics 82 | # got into considerable trouble with Facebook and Google for trying to 83 | # scrape data from these sites. To determine whether you can safely 84 | # automate data collection from a site, you need to visit its "Terms 85 | # of Service," which is a legally binding document that describes how 86 | # developers (in this case, you!) may interface with a site. 87 | 88 | # Despite all of these issues, the first technique I want to teach you today is something called "screen-scraping." 89 | 90 | # Screen scraping refers to a type of computer program 91 | # that reads in a web page, finds some information on it, grabs the 92 | # information, and puts it into a spreadsheet or other type of data 93 | # storage format. 94 | 95 | # When we look at a web page, we typically see something that is very easy 96 | # to digest. There is some combination of text and images in a relatively 97 | # small number of formats that we have been taught to digest easily. 98 | 99 | # But this is not what a webpage looks like to a computer. And if we want 100 | # to teach a computer to grab information from a web page for us, we need 101 | # to assume the perspective of a computer. 
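# (A minimal sketch of what that "computer's perspective" looks like: base
# R's readLines() will pull down the raw source of a page as plain text.
# Expect a wall of HTML rather than a readable article.)

raw_source <- readLines("https://en.wikipedia.org/wiki/World_Health_Organization_ranking_of_health_systems_in_2000")
head(raw_source)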
102 | 103 | # To a computer, a webpage is a long list of formatting rules, scripts, text, 104 | # and audio-visual data that is all put together in one of two common formats: 105 | # HTML or XML. These long lines of code tell the website how to assemble text, # images and video on the vast range of devices that might try to load the 106 | # page. It also generally shapes the "look" or "theme" of the website, and 107 | # how data is stored. But none of this is very important to understand in 108 | # detail unless you are interested in building websites. 109 | 110 | # Let's look at an example. Consider, the following Wikipedia page about 111 | # the World Health Organization's Ranking of Different Countries' Health 112 | # systems: 113 | 114 | #https://en.wikipedia.org/wiki/World_Health_Organization_ranking_of_health_systems_in_2000 115 | 116 | # To do screen scraping, we need to find the "Source Code," or the messy 117 | # list of instructions that a computer needs to display this page in the 118 | # format we see before this. 119 | 120 | # There are a variety of ways to see the source code of a website, but 121 | # the easiest way is typically to use your web browser. In Chrome, for 122 | # example, we can go to the dropdown "View" menu, and then select 123 | # "Developer" and then "View Source." 124 | 125 | # Messy, huh? At the top of the source code we can see that this document 126 | # is an HTML file. We will need to know whether a site is in HTML or XML 127 | # because it will determine the type of tools we use in R to scrape it. 128 | 129 | # In order to get a better feel for how the source code relates to the website 130 | #, let's navigate back to the wikipedia site: 131 | 132 | #https://en.wikipedia.org/wiki/World_Health_Organization_ranking_of_health_systems_in_2000 133 | 134 | # Let's say we want to scrape the data from the table on this page. 135 | # To do this, we are going to need to find out where this information is 136 | # within that messy HTML code. 137 | 138 | # Fortunately, there are a number of useful tools we can do to find this 139 | # type of information. In Chrome, for example, we can right click on the 140 | # part of the webpage we want to scrape, and click "inspect element." 141 | 142 | # Now, when we mouse over the messy code in the text, Chrome highlights 143 | # the part of the page that this code creates. So if we move our mouse 144 | # around until it highlights the table, we can start to identify the part 145 | # of the code we need to scrape it. The thing we need is called the "xpath" 146 | # To get the xpath, we can again right click and Chrome gives us the option 147 | # to copy it to our clipboard. 148 | 149 | # In my view, the best R package for screenscraping at present is the "rvest" 150 | # package, which was written by Hadley Wickham. R used to lag behind other 151 | # languages such as Python for web scraping, but rvest basically takes all the 152 | # best parts of these other languages and ports them into R. 153 | 154 | # The first thing I'm going to do is set our class dropbox as my 155 | # working directory: 156 | 157 | setwd("/Users/christopherandrewbail/Desktop/Dropbox/Teaching/Computational Soc Fall 2015/Course Dropbox") 158 | 159 | # note: the file path will be different on your machine! 
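# (An optional convenience sketch: if you re-run this script often, you can
# install rvest only when it is missing, rather than re-installing it each
# time. This does the same job as the install.packages() call just below.)

if (!requireNamespace("rvest", quietly = TRUE)) {
  install.packages("rvest")
}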
160 | 161 | # The first thing we need to do is install rvest: 162 | 163 | install.packages("rvest") 164 | 165 | # Next, we need to remember to load rvest into our r code/r session 166 | 167 | library(rvest) 168 | 169 | # The first thing we need to do is to pass all of that messy source code 170 | # from the web and into r. To do this, we use the html() command: 171 | 172 | wikipedia_page<-html("https://en.wikipedia.org/wiki/World_Health_Organization_ranking_of_health_systems_in_2000") 173 | 174 | # Here I've created an object called "wikipedia page" that we are going 175 | # to reference in the rest of our code. If we type "wikipedia_page" we will 176 | # see all of that nonsense: 177 | 178 | wikipedia_page 179 | 180 | # HTML is broken up into sections that are called "nodes." The xpath tells 181 | # R which section we want. To get that section, we use the html_nodes() 182 | # command as follows: 183 | 184 | section_of_wikipedia_html<-html_nodes(wikipedia_page, xpath='//*[@id="mw-content-text"]/table[1]') 185 | 186 | #Once again, this object is going to be messy: 187 | 188 | section_of_wikipedia_html 189 | 190 | #But fortunately rvest has a command that let's us grab tables within 191 | #HTML sections, it's called "html_table() 192 | 193 | health_rankings<-html_table(section_of_wikipedia_html) 194 | 195 | # ..And voila. We have now scraped the health rankings data from Wikipedia 196 | 197 | health_rankings 198 | 199 | # It's still in a somewhat messy format though. In fact, let's check to 200 | # see what type of format it is in: 201 | 202 | class(health_rankings) 203 | 204 | # It's a list. To convert this to a data frame that we could easily 205 | # work with, we can simply write: 206 | 207 | test<-as.data.frame(health_rankings) 208 | 209 | # Unfortunately, many sites are not as "friendly" to automated text 210 | # collection as Wikipedia, which is not only decidely "open" to anyone 211 | # but also very consistent in the way it formats information. 212 | 213 | # On messier sites, the "inspect element" trick in Chrome might not work. 214 | # But there is another way around this. Instead of getting the "xpath" we 215 | # can get something called the "css selector." 216 | 217 | # The easiest way to do this it to download a plugin for chrome called 218 | # Selector Gadget. This is a tool that you can load when you look at a 219 | # webpage in order to find the css selector in the html code. 220 | 221 | # This website explains how to use it: 222 | 223 | #http://cran.r-project.org/web/packages/rvest/vignettes/selectorgadget.html 224 | 225 | # If you drag the link on this page onto the bookmarks bar, you can load the 226 | # selectorgadget anytime you are on a website you want to scrape. 227 | 228 | # The next step is to click on the stuff you want to scrape, and then click 229 | # on something you DO NOT want to scrape. This helps the tool figure out 230 | # exactly how to describe what you want on the page. IT IS NOT PERFECT THOUGH. 231 | # Once again, different pages use different formats, but some combination 232 | # of this method with the Chrome/INspect Element method should work for 233 | # most webpages. 234 | 235 | # Why don't we scrape a list of the 100 Twitter Users with the largest 236 | # numbers of followers so that we can use it when we work with Twitter in 237 | # just a bit. 
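# (A version note, offered as an aside: in newer releases of the rvest
# package, the html() function used above has been deprecated in favor of
# read_html(). If html() produces a deprecation warning or cannot be found,
# read_html() should work as a drop-in replacement, e.g.:)

# wikipedia_page <- read_html("https://en.wikipedia.org/wiki/World_Health_Organization_ranking_of_health_systems_in_2000")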
238 | 
239 | # Here is the link to the page:
240 | 
241 | # http://twittercounter.com/pages/100
242 | 
243 | # After using the SelectorGadget tool, I determined that the
244 | # css selector for the data we want is called ".name-bio"
245 | 
246 | # To get the data, the process is almost identical to our
247 | # last example, except that we replace the xpath= with css=
248 | 
249 | toptwitter<-html("http://twittercounter.com/pages/100")
250 | toptwitternodes<-html_nodes(toptwitter, css=".name-bio")
251 | 
252 | 
253 | # Note that html_table() will not work here, because the node
254 | # we selected is not a table but just plain text, so we need to
255 | # extract it using the html_text() command instead:
256 | 
257 | names<-html_text(toptwitternodes)
258 | 
259 | 
260 | 
261 | #Let's take a look
262 | 
263 | names
264 | 
265 | # The data we want is in there, but it's surrounded by a bunch of odd
266 | # characters. These characters are telling the webpage how many spaces to
267 | # put in between the text.
268 | 
269 | # Cleaning up messy text like this in R is a very common challenge.
270 | # One can approach the problem in a variety of different ways, but
271 | # I am fond of using the gsub() command.
272 | 
273 | # The gsub command finds one character string and replaces it with another.
274 | # This line tells R to find the "\n"s and replace them with "", which means
275 | # nothing:
276 | 
277 | names<-gsub("\n","", names)
278 | 
279 | # The last argument simply tells R the name of the object we want to apply
280 | # this text transformation to.
281 | 
282 | names
283 | 
284 | # This got rid of the "\n"s but not the "\t"s. To get rid of those,
285 | # we can just add another line of code:
286 | 
287 | names<-gsub("\t","", names)
288 | 
289 | #Let's check it out:
290 | 
291 | names
292 | 
293 | # Getting closer, but we are going to want to split up the names and the
294 | # Twitter addresses, which begin with "@"
295 | 
296 | # To do this, we can use the strsplit() command:
297 | names_data<-(strsplit(names,"@"))
298 | 
299 | # This command simply tells R to split each string into two pieces when it
300 | # encounters the "@" character
301 | 
302 | names_data
303 | 
304 | # It's split, but it's still in a funky format. Let's find out which
305 | # one:
306 | 
307 | class(names_data)
308 | 
309 | # It's a list, and we are going to want a data frame once again. Let's
310 | # use an apply function to extract the names and Twitter handles in
311 | # separate steps:
312 | 
313 | twitter_names<-sapply(names_data,"[",1)
314 | twitter_handles<-sapply(names_data, "[", 2)
315 | 
316 | # Let's just make sure that they are now character vectors:
317 | class(twitter_names)
318 | 
319 | # now we can simply bind them together using cbind() and
320 | # as.data.frame():
321 | 
322 | twitter_data<-as.data.frame(cbind(twitter_names, twitter_handles))
323 | 
324 | # Now you know the basics of screen scraping. But there are two
325 | # more things you need to know about. First of all, if you are scraping
326 | # an XML website you can use other functions such as XML2R. There is
327 | # a nice tutorial on this here:
328 | 
329 | #http://cpsievert.github.io/slides/web-scraping/#1
330 | 
331 | # One more thing. Often, you don't want to scrape just one website,
332 | # but many websites. This means you need to generate a list of websites
333 | # that you can then pass through a "for" loop and extract whatever
334 | # type of data that you are searching for.
335 | 
336 | # If you are trying to repeatedly scrape one website for lots of sub-
337 | # pages, you may be able to recognize patterns in the way that the
338 | # URLs are formed, and then use "gsub" or "paste" commands to change
339 | # your url calls to collect HTML or XML.
340 | 
341 | # By now you can probably tell that some screen scraping exercises
342 | # are much easier than others. It simply depends upon the structure
343 | # of the website and how consistently it is organized.
344 | 
345 | # As I mentioned earlier, many sites now have functions that stop
346 | # you from scraping them. If you try to request too many different
347 | # sub-sections of the same site, for example, you will eventually
348 | # get an error that says something about "authentication," or an
349 | # "SSL" error, or an "OAuth" error.
350 | 
351 | #WORKING WITH AN API
352 | 
353 | # Sites that block you- which are unfortunately most of the sites you
354 | # might want to scrape- usually offer a powerful alternative: an
355 | # Application Programming Interface (API)
356 | 
357 | # An API is a type of web infrastructure that enables a developer (you)
358 | # to request large amounts of specific information from a website. The
359 | # website then creates a new URL that contains the data you request,
360 | # and you scrape it. This has become such an important part of the web
361 | # that most large websites now have APIs (e.g. Google, Twitter, Facebook,
362 | # even the New York Times)
363 | 
364 | # APIs are called Application Programming Interfaces because many of the people
365 | # who use them are building apps. For example, a music sharing website
366 | # might want to build an app that helps people expose their friends to
367 | # new types of music. But to do this it needs to request permission to
368 | # extract certain types of information about the person from a site
369 | # such as Facebook.
370 | 
371 | # But Facebook obviously can't give them all the info. Facebook needs to
372 | # make sure that the person wants them to access their data. They also
373 | # need to make sure the app developer can only access certain types of data
374 | # and not all the data that Facebook has.
375 | 
376 | # To do this, Facebook- and other sites that have APIs- have "authentication tokens," or "access keys." These are simply codes that you need to give when you request data from an API.
377 | 
378 | # Let's take a look at how the Facebook API works using the "Facebook
379 | # Graph API Explorer." This is a website that lets you see how an
380 | # API works, also known as a "sandbox":
381 | 
382 | # https://developers.facebook.com/tools/explorer
383 | 
384 | # try typing "me/friends" into the search bar below the text "FQL Query." This is a tool that shows you what the results would look like if you made this API request.
385 | 
386 | # what it is actually doing is forming the URL request and then showing you the JSON-format data that would load if you pasted the URL in your browser.
387 | 
388 | # Most sites that have APIs do not have this type of "sandbox," but
389 | # learning how to master working with them is a really nice skill to have because
390 | # there are so many APIs out there.
391 | 
392 | #At present, there are more than 13,000 APIs.
You can see a list of them here: http://www.programmableweb.com/category/all/apis?order=field_popularity 393 | # Academics may be interested to know that many data archiving sites now offer 394 | # APIs (such as ProQuest). Many are free to use, but others cost significant 395 | # amounts of money. 396 | 397 | # Most APIs have "rate limits" which determine how many 398 | # requests for information a developer (you) can make within a certain time frame 399 | 400 | 401 | # In R, you can either interact with an API by forming requests for data within 402 | # a loop and "scraping" the resultant data from URLS "by hand," or you can 403 | # use a variety of user-generated packages to collect data. 404 | 405 | # Because we already covered screen scraping, let's look at one of these packages. Let's start with the twitteR package. 406 | 407 | install.packages("twitteR") 408 | 409 | #The instruction manual for this package is here: 410 | 411 | # http://cran.r-project.org/web/packages/twitteR/twitteR.pdf 412 | 413 | # The first thing you need to do is register as a developer with Twitter. 414 | # in order to do this, you need to visit this page: 415 | 416 | # apps.twitter.com 417 | 418 | #Unfortunately, if you don't have a Twitter account, you'll have to make one, 419 | # or follow along on your neighbor's laptop if they don't mind. 420 | 421 | #THe next step is to click on "Create New App." You need to name your app, and 422 | # provide some other credentials. It really does not matter much what you 423 | # put in here, because we are not building an app that other people are going 424 | # to use. I just put in my own website. You can leave the "Callback URL field blank." 425 | 426 | # Our goal in doing this is to obtain a Twitter API Key which we need to extract 427 | # Any data from Twitter. TO do this we need to scroll down to the "Application 428 | #Settings section, and then click the blue "manage keys and access token" link 429 | # That is to the right of our Consumer Key 430 | 431 | # The next thing we need to do is tell the twitteR package what our secret 432 | #login details are. I can't write mine in here because if this information 433 | # got out a hacker could use it to pose as me, or get data collected by my 434 | # app which I might not want her or him to have. 435 | 436 | setup_twitter_oauth(consumer_key="TEXTOFYOURKEYHERE", 437 | consumer_secret="TEXTOFYOURSECRETHERE", 438 | access_token="TEXTOFACCESSTOKENHERE", 439 | access_secret="TEXTOFACCESSSECRETHERE") 440 | 441 | 442 | # When we run this last line, it will ask us if we want to use a 443 | # local file to store these "credentials." I am going to say "no" 444 | # and load these into R each time I need them. 445 | 446 | # What this twitteR package is doing for us is simplifying some of 447 | # the complex URL requests we would need to put in each URL call 448 | # we make to the TWitter API. Once all of our authentication 449 | # information is in the system, we have a range of useful commands 450 | # available to us. 
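# (A side note, not from the original script: one way to keep your keys out
# of your code is to store them as environment variables -- for example in an
# .Renviron file -- and read them with Sys.getenv(). The variable names below
# are made up for illustration.)

setup_twitter_oauth(consumer_key    = Sys.getenv("TWITTER_CONSUMER_KEY"),
                    consumer_secret = Sys.getenv("TWITTER_CONSUMER_SECRET"),
                    access_token    = Sys.getenv("TWITTER_ACCESS_TOKEN"),
                    access_secret   = Sys.getenv("TWITTER_ACCESS_SECRET"))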
451 | 452 | #First, we can define a Twitter user whose information we want to scrape 453 | # you can use my name, or feel free to put in your own name 454 | # instead of mine 455 | 456 | user <- getUser("chris_bail") 457 | 458 | # Let's get a list of my "friends"- by friends, the author of this package is referring 459 | # to the name of the people that I follow on Twitter: 460 | 461 | friends<-user$getFriends() 462 | 463 | # Now let's get a list of people who follow me on Twitter: 464 | followers<-user$getFollowers() 465 | 466 | #We can also get a list of all my favorite Tweets: 467 | favorites<-favorites(user) 468 | 469 | #This package also has some nice commands for formatting these 470 | # data as data.frames: 471 | 472 | friendsdata <- twListToDF(friends) 473 | followersdata <- twListToDF(followers) 474 | 475 | #This the command I would use to get a user's tweets: 476 | tweets<-userTimeline(user) 477 | 478 | # I mentioned earlier that Twitter will set shut us 479 | # down if we ask for two much data. this command 480 | # let's me see the limits on what I can do 481 | # within a given time frame: 482 | 483 | getCurRateLimitInfo() 484 | 485 | # Remember that list of top twitter accounts we got? 486 | # Let's see if we can scrape network data from these folks. 487 | # First, let's remind ourselves what these data look like: 488 | 489 | head(twitter_data) 490 | 491 | # So what I am going to want to do is create a for loop 492 | # where I make each person the "user" in each iteration 493 | # and scrape the names of the people they follow: 494 | 495 | # Create a blank data frame to store data we scrape 496 | twitter_network_data<-as.data.frame(NULL) 497 | # figure out how many rows we have to scrape 498 | z<-nrow(twitter_data) 499 | 500 | # start for loop that gets names of people the user 501 | # follows and append them to the dataset we 502 | # just created. Finally take a break between 503 | # pulling each user's Twitter data in order 504 | # to prevent Twitter rate limiting kicking in: 505 | 506 | for(i in 1:z){ 507 | user <- getUser(twitter_data$twitter_handles[i]) 508 | people_user_follows <- user$getFriends() 509 | people_user_follows<-twListToDF(people_user_follows) 510 | people_user_follows$name_of_user<-twitter_data$twitter_handles[i] 511 | twitter_network_data<-rbind(twitter_network_data, people_user_follows) 512 | Sys.sleep(60) 513 | print(i) 514 | } 515 | 516 | # We don't have time to run this loop together, it will take quite a bit 517 | # of time to run. 518 | 519 | 520 | ## There are many many more R packages for working with APIS: 521 | ## Here are a few: `RgoogleMaps`, `Rfacebook`, `rOpenSci` 522 | ##(this one combines many different APIs e.g. the Internet Archive), 523 | ##`WDI`,`rOpenGov`,`rtimes` 524 | ##Many more are available but not yet on CRAN (install from 525 | ##github or using devtools) 526 | 527 | ## There are also APIS that you can use to do analyses, like plotly 528 | # for visualization. 529 | 530 | # But there are still APIs that don't have R packages (many of them) 531 | 532 | # Let's pretend there was no R package for Google Maps, what would we do? 533 | # first: look for patterns 534 | # https://maps.googleapis.com/maps/api/geocode/json?address=Durham,NorthCarolina&sensor=false 535 | # In this case, address goes between the first** `=` **and the** `&` 536 | 537 | findGPS <- function(address,sensor = "false") { 538 | beginning <- "http://maps.google.com/maps/api/geocode/json?" 
539 | paster <- paste0(beginning,"address=", address, "&sensor=false")
540 | return(URLencode(paster))
541 | }
542 | 
543 | findGPS("Durham, North Carolina")
544 | 
545 | # let's put it all together. Note that fromJSON() below is not part of base R:
546 | # you need a JSON package (e.g. rjson, RJSONIO, or jsonlite) loaded first, and the exact indexing of the result can differ slightly between these packages.
547 | page<-findGPS("Durham, North Carolina")
548 | gpscoordinates <- fromJSON(page)
549 | latitude <- gpscoordinates$results[[1]]$geometry$location["lat"]
550 | longitude <- gpscoordinates$results[[1]]$geometry$location["lng"]
551 | gps<-c(latitude, longitude)
552 | gps
553 | 
554 | # we could then wrap them in a loop.
555 | 
556 | 
557 | 
558 | 
559 | 
--------------------------------------------------------------------------------
/Class R Code/Class #5 R Code.R:
--------------------------------------------------------------------------------
1 | # SCRIPT FOR CLASS #5, COMPUTATIONAL SOCIOLOGY
2 | # Instructor: Chris Bail
3 | # Copyright, Chris Bail
4 | 
5 | 
6 | # INTRODUCTION
7 | 
8 | # What can we do with all of the data we collected last week given that
9 | # we can't read it all by ourselves?
10 | 
11 | # Fortunately,
12 | # the fields of computer science and computational linguistics have
13 | # jointly produced a suite of new tools that make our job easier.
14 | 
15 | # Though we once had to hire teams of research assistants to collect,
16 | # standardize, and analyze large corpora, a single computer or group of
17 | # computers can now do this to text-based datasets that are so large that
18 | # human coders could never analyze them all.
19 | 
20 | # But these new techniques will be unfamiliar to you if you come from a
21 | # conventional statistics background. Text-based datasets do not come
22 | # prepackaged. Instead, they are unstructured and usually very messy.
23 | 
24 | # This is often because automated collection of texts produces
25 | # texts that are formatted or structured differently. The first task
26 | # we will discuss in this class is simply how to automate collection
27 | # of texts via the internet.
28 | 
29 | # But in order to analyze the large amounts of texts that can be analyzed
30 | # using these methods, we need to do something equally challenging: we
31 | # need to transform these texts into numbers, so that we can
32 | # classify them using automated tools for text analysis such as topic
33 | # modeling.
34 | 
35 | # First, we need a large corpus of documents. You may already
36 | # have these, but we are going to grab ours from the internet. To
37 | # create our corpus, we are going to use the "tm" package
38 | # in R:
39 | 
40 | #install.packages("tm")
41 | library(tm)
42 | 
43 | # Next, I'm going to read in that political blogs data from our dropbox:
44 | 
45 | blog_data<-read.csv("poliblogs2008.csv", stringsAsFactors = FALSE)
46 | 
47 | # One of the key commands in the tm package is the "corpus" command. This
48 | # creates- you guessed it- a corpus! We need to tell it the name of
49 | # the variable from the data set we want to import, and we also need to tell it that this object is a dataset, since the command can also be used to import a directory of text files or other types of data.
50 | 
51 | #first let's figure out the names of the blog_data dataset:
52 | 
53 | colnames(blog_data)
54 | 
55 | # The one we want is called "documents," let's check one out:
56 | 
57 | blog_data$documents[1]
58 | 
59 | 
60 | #did you notice the funny text in there (e.g. \xe5\xca ?).
This happens if you don't have the correct character encoding 61 | # I'm going to clean up the character encoding before we work with 62 | 63 | blog_data$documents <- iconv(blog_data$documents, "latin1", "ASCII", sub="") 64 | 65 | 66 | blog_corpus <- Corpus(VectorSource(as.vector(blog_data$documents))) 67 | 68 | # That's it! Now our data are in "corpus" format, which is going 69 | # to let us begin to do run basic text processing commands on our 70 | # blog posts, and eventually automated forms of content analysis 71 | # known as topic modeling. 72 | 73 | # We could of course code all of these blog posts by hand, or hire 74 | # a team of undergrads to do this for us, but this would create a 75 | # number of probems ranging from coder-burnout to inter-coder reliability 76 | 77 | # The only alternative to hand coding used to be word count analysis, where 78 | # one simply counts the number of times a word appears in a document. Over 79 | # the past 10 years or so, however, we have taken leaps and bounds in the 80 | # field of automated text analysis. 81 | 82 | # I could introduce you to many different ways of classifying text, but 83 | # for now, we are going to focus on the most popular method at the moment: 84 | # topic modeling. 85 | 86 | # Topic modeling is an automated technique that looks at patterns of how 87 | # words co-appear within documents in order to classify them into latent 88 | # groups of topics. 89 | 90 | # This technique is not perfect, as we will see, but it is much much better 91 | # then keyword analysis. This is because it is better at recognizing the 92 | # polysemy of words-- that is, how words can take on different meanings 93 | # if they occur next to other words. 94 | 95 | # We don't have time to get into the math of topic modeling, but I will 96 | # just briefly tell you that the methods we are going to use are based upon 97 | # a probabilistic Bayesian method known as Latent Dirichlet Allocation, 98 | # which is often abbreviated as LDA. 99 | 100 | # Unfortunately, we cannot just run a simple "lda" command on the corpus 101 | # we created in the previous section of this class. This is because lda 102 | # must analyze numbers, and not words. More specifically, lda requires 103 | # us to create a document term matrix, or a set of numbers that describe 104 | # where different words occur across documents. These are the data that the 105 | # lda algorithms actually analyze. 106 | 107 | # But even before we create a document term matrix, we need to make some 108 | # important decisions. It is common practice in the field of "Natural 109 | # Language Processing" to pre-process text. This is because most text is 110 | # messy- it contains punctuation, variations in spelling, and other 111 | # problems that make the lda algorithms less effective. 112 | 113 | # for example, right now our corpus includes dashes (-) do we really want 114 | # our algorithm to treat this-- or any other punctuation mark-- as a "word" 115 | # that should carry a meaning? Probably not. Fortunately, the tm package 116 | # can remove all punctuation as follows: 117 | 118 | blog_corpus <- tm_map(blog_corpus, content_transformer(removePunctuation)) 119 | 120 | # Also, if we replaced all words in our document with unique identifiers 121 | # right now, the words "dog" and "Dog" would be treated differently 122 | # because one includes a capital letter and the other does not. 
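# (A tiny illustration of the problem just described, not from the original
# script:)

"dog" == "Dog"             # FALSE: R treats these as different "words"
tolower("Dog") == "dog"    # TRUE once everything is lowercased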
123 | 
124 | # This command will make all the words lowercase to get around this
125 | # problem:
126 | 
127 | blog_corpus <- tm_map(blog_corpus, content_transformer(tolower))
128 | 
129 | # to a computer, even a space in between words is treated as something
130 | # that is meaningful, so believe it or not we need to ask R to
131 | # remove spaces before or after a word from our dataset:
132 | 
133 | blog_corpus <- tm_map(blog_corpus, content_transformer(stripWhitespace))
134 | 
135 | # Next, we need to decide what we want to do with extremely common words
136 | # such as "and" or "the." As soon as we move from words to numbers, all
137 | # words are treated equally, but we know that these very common words
138 | # are not going to add much meaning to our analysis.
139 | 
140 | # very common words are often called "stop words," or words we don't want
141 | # to include in our analysis. I put a csv file that contains a popular
142 | # list of stop terms in the dropbox; let's read that in, and use it to
143 | # remove those words from our corpus:
144 | 
145 | stoplist <- read.csv("english_stopwords.csv", header=TRUE, stringsAsFactors = FALSE)
146 | stoplist<-stoplist$stopword
147 | blog_corpus <- tm_map(blog_corpus, content_transformer(removeWords), stoplist)
148 | 
149 | # I want to pause and note that there are some within the field
150 | # who believe stop words should not be removed. Some people believe we
151 | # lose important context. For example, the phrase "I hate the president" is
152 | # much, much different than "I'd hate to be president," but if we remove stop
153 | # words, both phrases would be reduced to "hate" and "president."
154 | 
155 | # Another somewhat controversial issue is whether you should "stem" words.
156 | # Stemming means taking a word like "gladly" and transforming it into
157 | # its root word, which is "glad."
158 | 
159 | # This is actually a fairly complex task that requires some sophisticated
160 | # databases; fortunately, the tm package handles all of that for us
161 | # (under the hood it relies on the SnowballC stemming package)
162 | 
163 | blog_corpus <- tm_map(blog_corpus, content_transformer(stemDocument), language = "english")
164 | 
165 | # Ok, we are now finally ready to create our document-term matrix.
166 | # The command for this in the tm package is:
167 | 
168 | Blog_DTM <- DocumentTermMatrix(blog_corpus, control = list(wordLengths = c(2, Inf)))
169 | 
170 | # I've asked R to only create the matrix for words that are at least 2
171 | # characters long. This is to get rid of some messy stuff that was created
172 | # throughout the text pre-processing stages described above.
173 | 
174 | # Betcha can't wait to look at your first Document-Term matrix, huh?
175 | 
176 | inspect(Blog_DTM[300:310,1000:1002])
177 | 
178 | # some words hardly ever appear in any documents. In order to handle such
179 | # words, we can drop them from our document term matrix because it
180 | # makes our topic models perform a bit better (they don't struggle
181 | # to figure out what to do with these rare terms):
182 | 
183 | DTM <- removeSparseTerms(Blog_DTM, 0.990)
184 | 
185 | # I've now removed terms that appear in fewer than 1% of all documents.
186 | 
187 | # Now that all of our words are properly cleaned, let's take a look
188 | # at some of the most popular terms. The following line finds all
189 | # the words that occur more than 3,000 times in the dataset:
190 | 
191 | findFreqTerms(DTM, 3000)
192 | 
193 | # This is a good step to get a sense of whether or not there
194 | # are still words in your document term matrix that you do
195 | # not want to exert undue influence upon your topic models.
196 | 
197 | # Ok, now we are ready to run a topic model.... finally!
198 | 
199 | # One downside of topic models is they do not automatically
200 | # figure out how many topics exist in a corpus. Ideally,
201 | # we would have a sense of how many there might be. Let's
202 | # take a wild guess and say there are seven topics-
203 | # just for the purpose of illustration.
204 | 
205 | # the number of topics in the LDA model is controlled
206 | # by a parameter called k:
207 | 
208 | k<-7
209 | 
210 | # Now we need to set a bunch of additional parameters. We
211 | # don't have time to walk through what each of these means
212 | # right now, unfortunately. Some of them help us ensure
213 | # that we can get reproducible results; others help us
214 | # assess the fit of our model.
215 | 
216 | control_LDA_Gibbs <- list(alpha = 50/k, estimate.beta = T,
217 |                           verbose = 0, prefix = tempfile(),
218 |                           save = 0,
219 |                           keep = 50,
220 |                           seed = 980, # for reproducibility
221 |                           nstart = 1, best = T,
222 |                           delta = 0.1,
223 |                           iter = 2000,
224 |                           burnin = 100,
225 |                           thin = 2000)
226 | 
227 | # Ok, now let's create a topic model using the "Gibbs" sampling method, and the
228 | # "control" parameters we just declared. Also, we need to install the "topicmodels"
229 | # package
230 | 
231 | #install.packages("topicmodels")
232 | library(topicmodels)
233 | 
234 | my_first_topic_model <- LDA(DTM, k, method = "Gibbs", control = control_LDA_Gibbs)
235 | 
236 | # And then we can look at which words are associated with which topic. Here we look at the top 30 words by topic.
237 | 
238 | terms(my_first_topic_model, 30)
239 | 
240 | 
241 | # I mentioned earlier that there is no way to figure out the appropriate number
242 | # of topics, but we can look at shifts in the log likelihoods produced by
243 | # the LDA and try to identify the point where the curve flattens out.
244 | 
245 | # To do this, however, we have to repeat our LDA again and again. For example,
246 | # this code will run models that have everywhere from 2 to 35 topics
247 | library(parallel)  # mclapply() below comes from the "parallel" package
248 | many_models <- mclapply(seq(2, 35, by = 1), function(x) {LDA(Blog_DTM, x, method = "Gibbs", control = control_LDA_Gibbs)} )
249 | 
250 | # Hat tip to Achim Edelman for this nice function!
251 | 
252 | many_models.logLik <- as.data.frame(as.matrix(lapply(many_models, logLik)))
253 | 
254 | # We can then plot the results to see where we get decreasing returns for
255 | # increasing the number of topics:
256 | 
257 | plot(2:35, unlist(many_models.logLik), xlab="Number of Topics", ylab="Log-Likelihood")
258 | 
259 | # Once we choose the best number of topics, we can change k and
260 | # run our model again
261 | 
262 | k<-10
263 | 
264 | my_first_topic_model <- LDA(Blog_DTM, k, method = "Gibbs", control = control_LDA_Gibbs)
265 | 
266 | # And if we want to see how each document gets assigned to each
267 | # topic, we can simply write:
268 | 
269 | topic_assignments_by_docs <- topics(my_first_topic_model)
270 | 
271 | 
272 | 
273 | #STRUCTURAL TOPIC MODELS
274 | 
275 | # Topic models perform much better with meta-data.
Here is how to use Brandon Stewart's stm package 276 | # I borrow his example verbatim here: 277 | 278 | #install.packages("stm") 279 | library(stm) 280 | processed <- textProcessor(blog_data$documents, metadata = blog_data) 281 | #structure and index for usage in the stm model. Verify no-missingness. 282 | out <- prepDocuments(processed$documents, processed$vocab, processed$meta) 283 | #output will have object meta, documents, and vocab 284 | docs <- out$documents 285 | vocab <- out$vocab 286 | meta <-out$meta 287 | 288 | # use the utility function prepDocuments to process the loaded data to make sure it is in the right format 289 | plotRemoved(processed$documents, lower.thresh=seq(1,200, by=100)) 290 | 291 | # and run the model 292 | poliblogPrevFit <- stm(out$documents,out$vocab,K=20, 293 | prevalence =~ rating+ s(day), max.em.its=75, 294 | data=out$meta,seed=5926696) 295 | 296 | # see the vignette at http://structuraltopicmodel.com/ for many more helpful tools 297 | # for model interpretation and validation (including visualization and identifying illustrative quotes) 298 | -------------------------------------------------------------------------------- /Class R Code/Class #6 R Code.R: -------------------------------------------------------------------------------- 1 | # SCRIPT FOR CLASS #6, COMPUTATIONAL SOCIOLOGY 2 | # Instructor: Chris Bail 3 | # Copyright, Chris Bail 4 | 5 | # Visualizing and Analyzing Data 6 | 7 | # Base R has a variety of routines for visualizing data. Many 8 | # of these are fairly good, but there is widespread consensus 9 | # that the "ggplot2" package provides the most sophisticated 10 | # visualization capacities. Let's take a quick peak at 11 | # some of ggplot's capabilities: 12 | # http://shinyapps.stat.ubc.ca/r-graph-catalog/ 13 | 14 | 15 | # Let's install ggplot2 16 | 17 | install.packages("ggplot2") 18 | 19 | # The data we are going to use today come "built in" with 20 | # the ggplot package. These data describe various 21 | # characteristics of a large sample of diamonds (e.g. their 22 | # size, cut, clarity) 23 | 24 | # to load a built in dataset we need to call ggplot2 and then 25 | # run the data() command 26 | 27 | library(ggplot2) 28 | data("diamonds") 29 | 30 | #2.1 Scatterplots 31 | 32 | #Let's try a basic scatterplot in ggplot2: 33 | 34 | ggplot(diamonds, aes(x=carat, y=price)) + geom_point() 35 | 36 | #`diamonds` is the data set we want to plot 37 | 38 | #"aes" refers to the x,y coordinates we want to plot. 39 | #Note that we did not need to use the 40 | #`$` operator to specify the variable names geom_point() 41 | #describes the type of plot. The `+` indicates this 42 | #is a "layer." We can add many different types of layers 43 | #to a ggplot2 graph, as we will soon see. 44 | 45 | #Not bad, but the graph could be much more informative if 46 | # we added some color. Lets color the points of the graph 47 | # according to the clarity of the diamonds. This variable 48 | # is a factor variable 49 | 50 | ggplot(diamonds, aes(x=carat, y=price, color=clarity)) + geom_point() 51 | 52 | # Conventiently, ggplot automatically creates a legend 53 | # on the right hand side of the graph. 54 | 55 | # But we can go even further by manipulating the size of the 56 | # points. 57 | 58 | ggplot(diamonds, aes(x=carat, y=price, color=clarity, size=cut)) + geom_point() 59 | 60 | # Note that we can also use different kinds of shapes (instead of 61 | # circles) by specifying `shape=` within our "aes" command. 62 | 63 | # Earlier I mentioned that ggplot uses layers. 
Let's go even 64 | #further and add a some smoothing to further illustrate 65 | # the relationship between price and carat 66 | 67 | ggplot(diamonds, aes(x=carat, y=price)) + geom_point() + geom_smooth() 68 | 69 | # The blue line is the result of the smoothing, and the 70 | # grey bars are the standard errors Here we can see 71 | # there is not a 1 to 1 relationship between carat and price. 72 | 73 | # we can put a variety of "options" within geom_smooth if we 74 | # want. For example, if we want to use linear regression to 75 | # draw the trendline, we can write: 76 | 77 | ggplot(diamonds, aes(x=carat, y=price)) + geom_point() + geom_smooth(method="lm") 78 | 79 | # Or, if we want to look at different trend lines for different 80 | # variables, we could run: 81 | 82 | ggplot(diamonds, aes(x=carat, y=price, color=clarity)) + geom_smooth() 83 | 84 | #Note that the points disappeared from our chart because we 85 | #removed the geom_point() layer. 86 | 87 | # Many R users not only like to add layers to individual plots, 88 | # but create many plots beside each other in order to 89 | # communicate even more information. 90 | 91 | #3.2 Facet Wraps 92 | 93 | # To do this, we add something called a "facet_wrap:" 94 | 95 | ggplot(diamonds, aes(x=carat, y=price)) + geom_point() + facet_wrap(~ cut) 96 | 97 | # The tilde here tells R which variable it should use to make 98 | # the separate plots 99 | 100 | # We could of course bring back in color, or change the size 101 | # of the points again. 102 | 103 | # This is really just the very beginning of ggplot's capability 104 | # If this were a course on visualization alone, we could go 105 | # into much greater depth about how to customize titles (ggtitle, 106 | # xlab, ylab), Or how to change the range of an axis (e.g. xlim(0,2)) 107 | 108 | # But for now, let's look at some other types of graphs ggplot 109 | # can produce. Here is a line graph: 110 | 111 | ggplot(diamonds, aes(x=carat, y=price, color=clarity)) + geom_line() 112 | 113 | # 3.3 Histograms, Boxplots, Violin Plots 114 | 115 | # Here is a histogram 116 | 117 | ggplot(diamonds, aes(x=price)) + geom_histogram() 118 | 119 | ## once again, each layer has a range of different options, let's 120 | # say we want the bars to represent more unique values of price. 121 | # to do this, we'd use the bindwidth option 122 | 123 | ggplot(diamonds, aes(x=price)) + geom_histogram(binwidth=100) 124 | 125 | # once again, we could add facet_wraps or other types of 126 | # functionality to this graph, but in the interest of time 127 | # let's keep going and look at some other types of plots 128 | 129 | # For example, ggplot's boxplots are fairly popular 130 | 131 | ggplot(diamonds, aes(x=color, y=price)) + geom_boxplot() 132 | 133 | # We can get a bit more information about the standard 134 | # errors via a "violin plot": 135 | 136 | ggplot(diamonds, aes(x=color, y=price)) + geom_violin() 137 | 138 | 139 | ## Now You Try It: 140 | # 1) Load the `mtcars` **data 141 | # 2) plot the relationship between the `mpg` and `hp` variables in 142 | # the form of a scatterplot with facets for the `gear` variable 143 | # 3) Bonus points: add a title to the graph 144 | 145 | 146 | # Solution: 147 | 148 | ggplot(mtcars, aes(mpg, hp)) + geom_point() +facet_wrap(~gear) +ggtitle("Relationship Between MPG and Horsepower by Number of Gears") 149 | 150 | 151 | ##OTHER VISUALIZATION PACKAGES 152 | 153 | ## Though we have focused on ggplot, There are so many 154 | ## other great packages for visualization in R. 
Check 155 | ## out this "tabplot" package, for example: 156 | 157 | install.packages("tabplot") 158 | library(tabplot) 159 | tableplot(diamonds) 160 | 161 | # Or check out this beautiful map of geo-tagged tweets 162 | # created using "ggmap" a spin-off of ggplot 163 | 164 | 165 | # Finally, I'll show you how to create the heatmap 166 | # I showed you earlier 167 | 168 | nba <- read.csv("http://datasets.flowingdata.com/ppg2008.csv", sep=",") 169 | nba <- nba[order(nba$PTS),] 170 | row.names(nba) <- nba$Name 171 | nba <- nba[,2:20] 172 | nba_matrix <- data.matrix(nba) 173 | nba_heatmap <- heatmap(nba_matrix, Rowv=NA, Colv=NA, col = cm.colors(256), scale="column", margins=c(5,10)) 174 | 175 | # I briefly showed you this example to note how many 176 | # visualization tasks involve some data cleaning 177 | # or re-structuring before they can be performed. 178 | # In this case, we sorted the data, changed row names, 179 | # and transformed the data into a matrix. 180 | 181 | # But many regular applications require even more 182 | # Challenging tasks such as reshaping your data from 183 | # wide to long, or re-formatting variables from 184 | # character to factor, or factor to numeric. 185 | 186 | 187 | ## Exporting plots from R is fairly straitforward. If you are working with a ggplot object, 188 | ## the ggsave command is quite useful: 189 | 190 | myplot<-ggplot(diamonds, aes(x=color, y=price)) + geom_violin() 191 | ggsave(file="organ donation.png", plot=myplot, width=5, height=5, dpi=300) 192 | 193 | 194 | #If you are using another package, wrap your plot in between the png() function 195 | # and the dev.off() function as follows 196 | 197 | # We do this by sandwhich our code in between two new lines: 198 | png(file="nba_heatmap", width=480, height=480) 199 | nba <- read.csv("http://datasets.flowingdata.com/ppg2008.csv", sep=",") 200 | nba <- nba[order(nba$PTS),] 201 | row.names(nba) <- nba$Name 202 | nba <- nba[,2:20] 203 | nba_matrix <- data.matrix(nba) 204 | nba_heatmap <- heatmap(nba_matrix, Rowv=NA, Colv=NA, col = cm.colors(256), scale="column", margins=c(5,10)) 205 | dev.off() 206 | 207 | # A png file entitled nba_heatmap is now saved in our working directory 208 | # we can change the size using the "width" and "height" options 209 | # within the png command above (and you can do the same in the ggsave command) 210 | 211 | 212 | # Because the Twitter API Script takes a long time to run, I ran earlier and saved the data for you in a 213 | # .Rdata file that is in our class dropbox. It is called "Twitter 214 | # Network Data.Rdata". If you have set your working directory 215 | # to be the same folder that you downloaded from Dropbox, 216 | # you can write: 217 | 218 | load("Twitter Network Data.Rdata") 219 | 220 | # Once we have the complete dataset, we could do any number 221 | # of things. Because I'm guessing that many of you are interested 222 | # in network analysis, let's just make a quick network plot using 223 | # the igraph package: 224 | 225 | # First let's install and load the package: 226 | install.packages("igraph") 227 | library(igraph) 228 | 229 | #Next, let's convert our data frame to an "igraph object" 230 | # which is necessary to do any network analysis in this 231 | # package. 
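# (A minimal sketch, not from the original script, of what graph.data.frame()
# expects: the first two columns of the data frame are treated as the edge
# list -- i.e. "from" and "to" -- and any remaining columns become edge
# attributes. The toy column names below are made up.)

toy_edges <- data.frame(from = c("a", "b", "c"),
                        to   = c("b", "c", "a"))
toy_graph <- graph.data.frame(toy_edges, directed = FALSE)
toy_graph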
232 | 233 | twitter_igraph <- graph.data.frame(twitter_network_data, directed=FALSE) 234 | 235 | #Calculating network stats is extremely easy using igraph: 236 | 237 | twitter_betweennes<-betweenness(twitter_igraph) 238 | twitter_closeness<-closeness(twitter_igraph) 239 | twitter_clustering_coefficient<-transitivity(twitter_igraph) 240 | 241 | #...and there are many, many more. Working with two-mode, weighted, and dynamic 242 | # network data is R is also very easy because of its sophisticated database manipulation tools 243 | # as well as a number of different packages such as `sna tnet SoNIA` 244 | 245 | 246 | # Now, let's plot it: 247 | 248 | plot(twitter_igraph) 249 | 250 | # If you got an error message that says "figure margins too large" 251 | # run this code which resets the allowable limits of visualizations: 252 | 253 | par(mar = rep(2, 4)) 254 | 255 | # what a mess! First, we are simply trying to plot too much 256 | # info, we need to drop all the people who have very few 257 | # network ties from the dataset in order to get a cleaner 258 | # picture: 259 | 260 | only_cool_kids<-delete.vertices(twitter_igraph,which(degree(twitter_igraph)<20)) 261 | plot(only_cool_kids) 262 | 263 | # We are still getting an error message about the labels not working. This 264 | # is most likely because some of the characters in the data we read in 265 | # were in a foreign language (remember, our data describe the most popular 266 | # Twitter users around the entire world) 267 | 268 | #To fix this I'm going to change character encoding as follows: 269 | 270 | twitter_network_data$Source<-iconv(twitter_network_data$Source, "latin1", "ASCII", sub="") 271 | twitter_network_data$Target<-iconv(twitter_network_data$Target, "latin1", "ASCII", sub="") 272 | 273 | # now we need to repeat all the steps: 274 | twitter_igraph <- graph.data.frame(twitter_network_data, directed=FALSE) 275 | 276 | # We are still getting a warning message that says some strings were read 277 | # in as NA. This is probably because the character conversion did not 278 | # work for every single language that Twitter users can use: 279 | 280 | # once again, let's prune the network 281 | only_cool_kids<-delete.vertices(twitter_igraph,which(degree(twitter_igraph)<20)) 282 | plot(only_cool_kids) 283 | 284 | # What are those funny loops? Those are people who are following themselves? 285 | # must be a data building error. A quick fix would be to use igraph's 286 | # simplify command which removes these self references 287 | 288 | only_cool_kids<-simplify(only_cool_kids) 289 | plot(only_cool_kids) 290 | 291 | #looking better, but hardly beautiful. Let's try a different layout: 292 | 293 | plot(only_cool_kids, layout=layout.reingold.tilford) 294 | 295 | #Katy Perry rules all! 296 | 297 | plot(only_cool_kids, layout=layout.circle) 298 | 299 | #Everyone is cool! 300 | 301 | plot(only_cool_kids) 302 | 303 | #USING GEPHI FOR NETWORK VISUALIZTION 304 | 305 | # You can spend a long time making network plots look pretty in igraph, but I prefer to use Gephi, because it is 306 | # much more interactive and has a better graphics engine. It also works well with large network datasets, and it can handle 307 | # both node and edge attributes, as well as dynamic/longitudinal network data. 308 | 309 | # See the slides associated with this lecture for instructions about how to install Gephi, import data, and 310 | # visualize/analyze your data. 
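# (A hedged sketch, not from the original script, of one way to move a network
# from R into Gephi: igraph can write interchange formats such as GraphML,
# which Gephi opens directly. The file name here is just an example.)

write.graph(only_cool_kids, file = "twitter_network.graphml", format = "graphml")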
311 | 312 | -------------------------------------------------------------------------------- /Class R Code/Class #7 R Code.R: -------------------------------------------------------------------------------- 1 | # SCRIPT FOR CLASS #7, COMPUTATIONAL SOCIOLOGY 2 | # Instructor: Chris Bail 3 | # Copyright, Chris Bail 4 | 5 | # LINEAR MODELING 6 | 7 | # Most of you are not only interested in visualizing 8 | # your data, but understanding it at a much deeper level. 9 | # We can learn quite a lot from visualization. Everything 10 | # from identifying patterns of missing data to three and 11 | # four way interactions between variables. 12 | 13 | # But most of us also care about "statistical significance," 14 | # to some degree. While there are new debates about the 15 | # meaning of significance in the age of big data, I do not 16 | # think we are going to abandon P-values over night. 17 | 18 | # In this class, we will 19 | # focus on some very basic statistics one can do in R. 20 | # Once again, I will try to show you enough that you 21 | # could learn more on your own, but because R is used 22 | # by everyone from economists to ecologists, the range 23 | # of statistical routines available is truly astounding 24 | # and therefore too broad to cover in one morning- let 25 | # alone one week! 26 | 27 | # As always, I'm going to focus on messy, "real-world" 28 | # examples. These are particularly important this week 29 | # because linear models can create very big problems 30 | # with big data. This includes not only issues related to 31 | # p-values, but also outlier detection, equifinality, 32 | # and a range of other issues. So today we will look 33 | # at some of the data I've collected and make some 34 | # mistakes that I hope will help you avoid similar 35 | # issues in your own work. 36 | 37 | # The other reason I'm teaching you how to run models 38 | # in R is that often computational social science requires 39 | # an iterative approach to data cleaning, data classification, 40 | # and data analysis. If you keep moving data back and forth 41 | # between R and STATA Or SAS, you will waste a lot of time 42 | # converting file formats, etc. Plus, I will try to convince 43 | # you that R is a much better platform for statistical analyses 44 | # because it allows you to use state of the art techniques. 45 | # This is also the reason why I advocate R over Python (stats 46 | # packages in the latter pale in comparison to the former) 47 | 48 | # Let's start by some of the most basic statistical 49 | # analysis available to us in R. We are going to return 50 | # to base R for the time being, but I'll show you 51 | # a few packages later on. 52 | 53 | # First, let's load our Pew data from yesterday using the 54 | # load command: 55 | 56 | load("Pew Data.Rdata") 57 | 58 | # Remember that you will need to specify a file path if the file 59 | # is not in your working directory. To check, write "getwd()" 60 | 61 | # let's try to predict whether people support or oppose 62 | # the ground zero mosque (this variable is labelled pew10) 63 | # 1=oppose construction, 2= support construction 64 | 65 | table(pewdata$pew10) 66 | 67 | # let's create a new factor variable that describes 68 | # whether someone is Republican 69 | 70 | pewdata$Republican<-0 71 | pewdata$Republican[pewdata$partlyn=="Republican"]<-1 72 | 73 | 74 | #Now, let's run a ttest to see if being a republican 75 | # shapes your opinion of the ground zero mosque. To 76 | # do this we use the t.test command. 
Note that we can 77 | # specify our data as an option in this command, or 78 | # write them out using $. 79 | # The ~ in r generally stands for "explained with." 80 | 81 | t.test(pew10 ~ Republican, data=pewdata) 82 | 83 | # here we see a clear cut difference. 84 | 85 | # but was is the direction? To figure this out we 86 | # need to run a correlation. We can use the cor.test 87 | # command 88 | 89 | # CORRELATIONS 90 | 91 | cor.test(pewdata$pew10, pewdata$Republican) 92 | 93 | # we see a negative, significant relationship, so 94 | # this suggests that being republican is negatively 95 | # associated with supporting the Ground Zero Mosque 96 | 97 | 98 | # One of the most useful things about R is that it 99 | # can very easily and efficiently combine statistics 100 | # and visualization. 101 | 102 | # Let's say we wanted to quickly identify variables 103 | # with significant correlations with each other. The 104 | # corrgram package produces a "heatmap-style" correlation 105 | # matrix: 106 | 107 | install.packages("corrgram") 108 | library(corrgram) 109 | corrgram(pewdata) 110 | 111 | 112 | 113 | # MULTIPLE LINEAR REGRESSION 114 | 115 | # but what if we are concerned that age is a confounding 116 | # factor in this relationship. That is older people might 117 | # be less accepting of Muslims. 118 | 119 | # To figure this out, we'd need to run a multiple linear 120 | # regression model. In R, the syntax for this is: 121 | 122 | lm(pew10~Republican+age, pewdata) 123 | 124 | # But this doesn't give us the output we want. This 125 | # is because R stores the output as an object. 126 | # so we need to write something like: 127 | 128 | results<-lm(pew10~Republican+age, pewdata) 129 | 130 | # and then to obtain the results, we need to write 131 | 132 | summary(results) 133 | 134 | # This is a bit annoying, but it can come in handy when 135 | # you want to extract different parts of the results 136 | # and put them into tables or plot them. 137 | 138 | # For example, if we want to get the coefficiences, we 139 | # can use the $ operator to get the coefficients 140 | 141 | x<-results$coefficients 142 | y<-results$residuals 143 | 144 | z<-cbind(x, y) 145 | write.csv(z, file="myresults.csv") 146 | 147 | 148 | # we can also plot the residuals values easily 149 | 150 | plot(results) 151 | 152 | #You need to hit return to see the different types 153 | # of plots available 154 | 155 | #The scatterplot matrix is also pretty cool. Let's 156 | # try it out with the built-in "mtcars" data, because 157 | # it has lots of continuous variables unlike these 158 | # pew data: 159 | 160 | pairs(~ mpg + hp + cyl, data=mtcars) 161 | 162 | #R also includes pretty much every single diagnostic test 163 | #available. For exmaple, here is the command for evaluating 164 | #MultiCollinearity/Variance Inflation Factors: 165 | 166 | install.packages("car") 167 | library(car) 168 | vif(results) 169 | 170 | # Here is the Bonferonni p-value for the most extreme observations 171 | outlierTest(results) 172 | 173 | #comes up negative. Some of them are even interactive: 174 | 175 | influencePlot(results, id.method="identify", main="Influence Plot", sub="Circle size is proportial to Cook's Distance" ) 176 | 177 | 178 | #There are also popular tools for assessing non-normality: 179 | 180 | qqPlot(results) 181 | 182 | # It looks a little funky because it is bimodal. 
Normally we use a qqplot to 183 | #asses wether the data fit a normal distribution 184 | 185 | #See also 186 | leveragePlots(results) 187 | avPlots(results) 188 | 189 | #We can also evaluate homoscedasticity using a non-constant error 190 | #variance test: 191 | 192 | ncvTest(results) 193 | 194 | # look for non-linearity 195 | crPlots(results) 196 | 197 | # looks ok. 198 | 199 | # What about missing data? 200 | install.packages("VIM") 201 | library(VIM) 202 | aggr(pewdata) 203 | 204 | #extremely efficient, huh? 205 | 206 | # we can also combine the matrix scatterplot with a missing data 207 | #analysis as follows: 208 | 209 | #first let's take a subset 210 | subsample<-pewdata[,c("age","sex","pew10")] 211 | marginmatrix(subsample) 212 | 213 | # If you get the "figure margins too large" warning, try making 214 | # your plot window large in RStudio. 215 | 216 | # There are many more ways of doing this, and many more great 217 | # visualization techniques in the VIM package to help you 218 | # identify even more subtle missing data patterns 219 | 220 | 221 | # But from this brief glance we know we have 222 | # Lots of missing data. I'll show you how 223 | # to use multiple imputation very quickly. 224 | # A word of warning, however: multiple imputation 225 | # can make things worse if your model is not 226 | # properly specified. Also, make sure you are 227 | # only passing numeric variables or factor variables 228 | # to the multiple imputation commands: 229 | 230 | # Let's just try imputing for the small 231 | # data set we created above called "subsample": 232 | 233 | library(mice) 234 | 235 | 236 | # now impute! 237 | mice.dat <- mice(subsample,m=10,seed=3) 238 | ## combine datasets 239 | mice.dat <- complete(mice.dat,action=10) 240 | # now we could re-run our analysis if we so choose. 241 | 242 | ## **Now You Try It:** 243 | 244 | #1) Determine whether the relationship between mpg and 245 | #hp of a car is significant when controlling for the number 246 | # of cylinders, quarter second time (qsec), and whether or not 247 | # a car is automatic (using the "am" variable)" 248 | 249 | # solution: summary(lm(mpg~hp+cyl+qsec+am, data=mtcars)) 250 | 251 | 252 | # FIXED-EFFECTS 253 | 254 | # To run a fixed effects model you can simply run: 255 | 256 | fixed<-(lm(pew10~Republican+age+factor(state), data=pewdata)) 257 | 258 | # RANDOM-EFFECTS 259 | 260 | install.packages("lme4") 261 | library(lme4) 262 | random<-lmer(pew10~Republican+age+ (1|state), data=pewdata) 263 | 264 | 265 | # To specify a different distribution of the outcome, use the "family" argument: 266 | 267 | random2<-lmer(pew10~Republican+age+ (1|state), family=poisson, data=pewdata) 268 | 269 | # We get some funky error messages here because we chose a family that 270 | #do not fit the data: 271 | 272 | # To determine whether or not to use random effects we can use 273 | # the Breusch-Pagan test- if it is significant this suggests the 274 | #random model is better way to describe the data. 275 | 276 | install.packages("lmtest") 277 | library(lmtest) 278 | bptest(fixed, random) 279 | 280 | # this suggests we should use the random effects model. 281 | 282 | 283 | #TIME-SERIES 284 | 285 | # R also shines when it comes to time series analysis. 
We are 286 | # going to use the plm package, and some sample data on 287 | # employment in the UK: 288 | 289 | install.packages("plm") 290 | library(plm) 291 | data("EmplUK", package="plm") 292 | head(EmplUK) 293 | 294 | #These "conditioning plots help us see if the relationships 295 | # between sectors really vary across year: 296 | 297 | coplot(wage ~ year|firm, type="l", data=EmplUK) 298 | coplot(wage ~ year|firm, type="l", data=EmplUK) 299 | 300 | #it looks like they do. 301 | 302 | 303 | #I also like to plot the means across organizations 304 | install.packages("gplots") 305 | library(gplots) 306 | plotmeans(wage ~ year, main="Heterogeineity across time", data=EmplUK) 307 | plotmeans(wage ~ firm, main="Heterogeineity across Employment sectors", data=EmplUK) 308 | 309 | # so we are seeing differences across both time and sectors 310 | 311 | # now lets run some models. First, let's run fixed effects: 312 | 313 | fixed1 <- plm(wage ~ capital+output+emp, index=c("firm", "year"), data=EmplUK, model="within") 314 | summary(fixed1) 315 | 316 | #and here's how we would run a random effects model 317 | 318 | random1 <- plm(wage ~ capital+output+emp, index=c("firm", "year"), data=EmplUK, model="random") 319 | summary(random1) 320 | 321 | #test for serial correlation 322 | library(lmtest) 323 | pbgtest(random1) 324 | 325 | #We should also test for cross-sectional dependence/contemporaneous causation 326 | pcdtest(random1, test = c("lm")) 327 | 328 | #And of course, we could test for much, much more... 329 | 330 | 331 | 332 | -------------------------------------------------------------------------------- /Class R Code/Class #8 R Code.R: -------------------------------------------------------------------------------- 1 | # SCRIPT FOR CLASS #8, COMPUTATIONAL SOCIOLOGY 2 | # Instructor: Chris Bail 3 | # Copyright, Chris Bail 4 | 5 | # MACHINE LEARNING 6 | 7 | # Last class we noted a number of problems that plague analysis of large, 8 | # complex datasets. These include issues about the meaning of p-values 9 | # an statistical significance, non-linear relationships between variables 10 | # the high likelihood of interaction effects given the large number 11 | # of variables that can be collected but not analyzed in every possible 12 | # iteration, and- in general- causal complexity, or the likelihood that 13 | # many of the outcomes we are interested in as social scientists do not 14 | # involve a single causal recipe. 15 | 16 | # Today we are going to talk about some exciting new (or new-ish) tools 17 | # that help address some of these problems. Some of them are replacements 18 | # for linear models, but others are probably unlikely to displace linear 19 | # models. Given how deeply engrained linear models are within sociology, 20 | # I think the best way to think about these new techniques are alternative 21 | # ways of analyzing/classifying/modeling your data that will help you 22 | # identify interactions that you might have missed and/or important 23 | # subsets of the data that you might want to code and include in the 24 | # linear models you ultimately use. 25 | 26 | # Once again, you could spend an entire semester on so-called "machine-learning" 27 | # sometimes it is also called "statistical learning." Most of these techniques 28 | # come out of statistics, but they are applied broadly by computer scientists, 29 | # and people in industry in order to both classify data, and- more often- to make 30 | # predictions about individual behavior. 
If you are really taken by the ideas 31 | # we talk about today, I recommend checking out this 15 hour class, which will 32 | # give you a comprehensive overview of the techniques, the math and notation behind 33 | # them, and hands on info about how to implement them in R: 34 | 35 | #http://www.r-bloggers.com/in-depth-introduction-to-machine-learning-in-15-hours-of-expert-videos/ 36 | 37 | # In fact, one of the teachers in this video invented one of the techniques we 38 | # will be talking about today (GAMs). For a less technical overview, check out 39 | #this extremely cool visual tutorial: 40 | 41 | #http://www.r2d3.us/visual-intro-to-machine-learning-part-1/ 42 | 43 | # Instead of going in depth into each technique, we are going to do what we 44 | # always do in this class: try to give you enough of an overview that you 45 | # could pursue this in depth on your own. We will also work through messy 46 | # real-world examples that I hope will help you see the strengths and 47 | # weaknesses of these new models. 48 | 49 | # We are going to focus on three methods in particular: Generalized 50 | # Additive Models, Regression Trees, and Random Forests. 51 | 52 | # GENERALIZED ADDITIVE MODELS 53 | 54 | # One huge problem with many linear models is that they are parametric, 55 | # or defined by functions that describe the data using a very small set of 56 | # parameters. By contrast, GAMs are non-parametric, meaning that the shape 57 | # of the predictor functions is fully determined by the data 58 | 59 | # We can of course build in non-linear transformations of variables 60 | # to account for this problem, but this assumes that we know about the 61 | # problem in the first place. Also, interpretation of such transformations 62 | # can be difficult. For example, do you know what it means if a polynomial 63 | # term is positive or negative? What if there are multiple polynomial 64 | # predictors in your model? To make matters worse, transforming multiple 65 | # predictors within the same model can create colinearity issues. Even 66 | # if we can get around all of these problems, it would take us a ton 67 | # of time- we'd have to try every single type of transformation for 68 | # every single type of variable until we found the one that is the 69 | # best fit for the data. 70 | 71 | # Generalized Additive Models are a unique approach developed in 1986 72 | # by two statisticians. GAMs look a lot like traditional regression models, 73 | # but they are much "smarter." GAMs automatically select the best 74 | # transformation of each variable FOR YOU- these can be both non-linear 75 | # or linear. After "Smoothing" or transforming each variable (where necessary) 76 | # GAMS simultaneously estimate the relationship between each predictor 77 | # and the outcome, outputting coefficients that are very similar to 78 | # a regular linear model. 79 | 80 | # Note that this not only gets us around the issue of having to try 81 | # out different linear combinations of variables we have hunches about 82 | # but also helps us account for potential non-linearities that we did 83 | # not even know to look for. 84 | 85 | # GAMS can also support any type of link function, or in other words, 86 | # any type of dependent variable that might be put into a generalized 87 | # linear model (e.g. binary, continuous, etc.) 
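# (A small runnable sketch of that point, not from the original script: the
# family argument in mgcv's gam() works just like in glm(), so a binary
# outcome can be modeled with smooth terms. The simulated data below are made
# up purely for illustration; mgcv itself is installed a bit further down.)

library(mgcv)
set.seed(1)
sim <- data.frame(x = runif(200))
sim$y <- rbinom(200, 1, plogis(sin(2 * pi * sim$x)))  # non-linear effect of x
logit_gam <- gam(y ~ s(x), family = binomial, data = sim)
summary(logit_gam)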
88 | 89 | # GAMs are kind of a compromise between conventional linear models, 90 | # which are almost always very biased but easy to interpret; and 91 | # new machine learning techniques such as random forests, which are 92 | # very good at representing/classifying relationships between 93 | # variables but very difficult to interpret. This is one of the 94 | # main reasons I don't think we will see random forests in ASR/AJS 95 | # any time soon, but we may well see GAMs. 96 | 97 | # Let's look at an example that was put together by Michael Clark at Notre Dame; 98 | # these are science test scores from the PISA cross-national education data. 99 | 100 | data = read.csv("http://www.nd.edu/~mclark19/learn/data/pisasci2006.csv") 101 | 102 | # and let's take a look at the data: 103 | library(car) 104 | scatterplotMatrix(data) 105 | 106 | 107 | # the red smoothed lines show us there is a lot of potential non-linearity 108 | # in the data. 109 | 110 | # First, let's run a simple linear model so that we can get a baseline. 111 | # We use the gam command, but unless we enclose each predictor in an 112 | # s() it assumes it has a linear relationship with the outcome. 113 | 114 | install.packages("mgcv") 115 | library(mgcv) 116 | first_model <- gam(Overall ~ Income + Edu + Health, data = data) 117 | summary(first_model) 118 | 119 | # Now let's try a model that applies a smoother function to each predictor 120 | second_model <- gam(Overall ~ s(Income) + s(Edu) + s(Health), data = data) 121 | summary(second_model) 122 | 123 | # and now we can plot the relationship between each predictor and the outcome 124 | plot(second_model, pages=1, residuals=T, pch=19, cex=0.25, 125 | scheme=1, col='#FF8000', shade=T, shade.col='gray90') 126 | 127 | # and this neat contour plot lets us look at multiple variables at once 128 | vis.gam(second_model, type = "response", plot.type = "contour") 129 | 130 | # The contour lines (the "lassos") describe the predicted values of the outcome. That is, 131 | # the sweet spot is having both high income and high education. 132 | 133 | # we can also do something called "tensor product smoothing" here, which 134 | # you can think of as a smooth of the smoothers of multiple variables 135 | # at once. 136 | 137 | third_model <- gam(Overall ~ te(Income, Edu), data = data) 138 | summary(third_model) 139 | 140 | # and we can plot it again 141 | vis.gam(third_model, type='response', plot.type='persp', 142 | phi=30, theta=30, n.grid=500, border=NA) 143 | 144 | # Once again, I'm not sure why GAMs don't have more 145 | # influence in sociology. At the very least, I encourage 146 | # you to use them to triangulate your work with linear 147 | # models. And I suppose that with enough work a linear 148 | # model can "resemble" a GAM, so perhaps people are using 149 | # them and then making the relevant transformations within 150 | # the linear models they present in papers? Probably not :) 151 | 152 | # REGRESSION TREES 153 | 154 | # The idea behind regression and classification trees is to group 155 | # datasets into different subsets and use these different subsets 156 | # to predict different pathways to an outcome. Basically, the 157 | # algorithm breaks down the dataset into different subsets or 158 | # regions using a stopping rule (for example, the region must 159 | # include at least five observations). Then, the model simply takes 160 | # the mean outcome for each subset or region, and predicts this 161 | # will be the value of the outcome for this subset of the data.
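# To see what "taking the mean outcome for each region" looks like, here is a
# toy illustration (an addition to the original script) using mtcars and one
# hypothetical split on weight. A real tree searches for the best split itself;
# this just shows what the prediction for each resulting region would be.

light_cars <- mtcars$mpg[mtcars$wt < 3]
heavy_cars <- mtcars$mpg[mtcars$wt >= 3]
mean(light_cars) # the predicted mpg for every car in the "light" region
mean(heavy_cars) # the predicted mpg for every car in the "heavy" region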
162 | # The results take the form of a "tree" which describes how different 163 | # subsets of the data lead one to expect different values for 164 | # the outcome variable. This visualization is arguably much 165 | # easier to read than a regression model and can often 166 | # be a better fit for the data (because it can help identify 167 | # non-linearity or causal complexity in the data). 168 | 169 | # There are two common types of trees used in machine 170 | # learning: regression trees and classification 171 | # trees. Regression trees are for quantitative outcomes, classification 172 | # trees are for categorical outcomes, and both can be fit 173 | # with the "tree" package. 174 | 175 | # Let's take a look at an example which will help make this more 176 | # clear. 177 | 178 | install.packages("tree") 179 | library(tree) 180 | 181 | # the key function we are going to use is called "tree". 182 | # The "mpg~." indicates we want to treat mpg as the outcome 183 | # but look at every other variable in the dataset as a 184 | # possible predictor: 185 | 186 | tree.cars <- tree(mpg~., mtcars) 187 | 188 | # One of the great things about regression trees is that 189 | # you can plot them- they kind of resemble a "flowchart" 190 | # within an organization, or a tree used to classify 191 | # biological organisms. The main difference, in terms 192 | # of interpretation, is that we are examining causal 193 | # pathways to the outcome (in this case mpg), and the 194 | # ways in which different predictors combine to shape 195 | # the outcome (both high and low levels of the outcome). 196 | 197 | plot(tree.cars) 198 | # This adds labels 199 | text(tree.cars, pretty=0) 200 | 201 | # The way to read these trees is to focus on the 202 | # '<' sign. The branches to the left are less than 203 | # and the branches on the right are greater than. 204 | # The "wt" variable here describes weight (in 1,000s of pounds), 205 | # so the tree shows us that this is a big factor in mpg, 206 | # which is not surprising. In fact, all cars with wt below 2.26 207 | # (about 2,260 pounds) get about 30mpg. Those above that cutoff fall into two 208 | # categories: those with fewer than six cylinders and those 209 | # with eight cylinders. The V8 cars with the highest horsepower 210 | # (hp) also have the lowest mpg. Again, this makes sense. 211 | 212 | # To further interpret how well this regression 213 | # tree fits the data, we can summarize the object: 214 | 215 | summary(tree.cars) 216 | 217 | # The number we want to focus on is the "residual mean deviance" 218 | # which tells us how far off the regression tree estimates may 219 | # be. If we run a classification tree- which we are about to do 220 | # -we will get another metric: the misclassification rate, which 221 | # tells us the share of cases that the tree sorts into the wrong category. 222 | 223 | # Finding the best fit for the data usually requires "pruning" 224 | # the tree, or removing branches of the tree that add more 225 | # complexity but less explanatory power. To do this we use the 226 | # prune.tree command (a short cross-validation sketch follows this example). 227 | 228 | prune.cars <- prune.tree(tree.cars, best=5) 229 | plot(prune.cars) 230 | text(prune.cars, pretty=0) 231 | 232 | # In this case, our tree was already so simple that pruning it 233 | # did not add parsimony. Note that the same pruning process 234 | # can be used for classification trees, as I will soon discuss.
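# Here is the cross-validation sketch mentioned above (an addition, not part of
# the original script): cv.tree() refits the tree on folds of the data and
# reports the deviance for each candidate tree size, which gives a more
# principled way to decide how far to prune.

set.seed(123)
cv.cars <- cv.tree(tree.cars)
cv.cars$size # candidate numbers of terminal nodes
cv.cars$dev  # cross-validated deviance for each size (lower is better)
plot(cv.cars$size, cv.cars$dev, type="b",
     xlab="Number of terminal nodes", ylab="Cross-validated deviance")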
235 | 236 | # Let's run through a quick classification tree example: we'll 237 | # use the Pew data from a previous class in order to classify 238 | # those who did and did not support the construction of the 239 | # ground zero mosque. 240 | 241 | load("Pew Data.Rdata") 242 | install.packages("tree") 243 | library(tree) 244 | tree.groundzero <- tree(pew10~educ+sex+age+inc+partyln, pewdata) 245 | plot(tree.groundzero) 246 | text(tree.groundzero, pretty=0) 247 | 248 | # The only difference in this process is that the algorithm selects 249 | # the most common value of the outcome given the subset or "branch" 250 | # of the tree instead of the mean value, which is what regression 251 | # trees use. 252 | 253 | # Another neat option for creating trees is the "party" package. 254 | # In addition to the same tree structure generated by the "tree" 255 | # package above, it adds plots of the cases within each branch 256 | # of the tree. Let's take a look: 257 | 258 | install.packages("party") 259 | library(party) 260 | prettycartree <- ctree(mpg~., data=mtcars) 261 | plot(prettycartree) 262 | 263 | 264 | # These aren't the best datasets to use. I find that regression 265 | # and classification trees are most useful when you have 266 | # very large datasets with many different variables. I encourage 267 | # you to try this out on different datasets- unfortunately 268 | # I did not have a better dataset on hand. 269 | 270 | 271 | 272 | # RANDOM FORESTS 273 | 274 | # The main downside of regression trees is that they are not the most 275 | # accurate tool available. In fact, if all relationships between 276 | # predictors and outcomes are linear, then a linear model will 277 | # be more efficient. 278 | 279 | # The principal reason that regression trees are not the most efficient way 280 | # of making predictions about data is that they only use one "round" 281 | # of subsetting the data. Therefore, regions with high variance 282 | # create problems. But what if instead of partitioning the data 283 | # only once, we did it hundreds or even thousands of times in slightly 284 | # different ways and then averaged the results? This is what is known 285 | # as bootstrapping, and in the language of regression trees, it is 286 | # often described as "bagging." Random forests are an extension of 287 | # bagging that includes a small tweak that "decorrelates" the bagged 288 | # trees. It does this by taking a random sample of predictor variables 289 | # at each split, and then choosing the best one among them as the branch. The 290 | # random selection of predictors helps keep one very strong predictor 291 | # from crowding out other predictors that have a more 292 | # moderate yet still meaningful association with the outcome. 293 | 294 | # Unfortunately, when we extract a large number of subsets 295 | # from the data using bagging or random forests, we sacrifice 296 | # the interpretability of a single regression tree. That is, we 297 | # cannot construct a tree that visualizes all the bagged/randomly 298 | # partitioned datasets. 299 | 300 | 301 | # On the other hand, we can still measure how important each predictor 302 | # is, or how important that "branch" is across the trees, by 303 | # examining how much it reduces the residual sum of squares. 304 | 305 | # Let's try it out.
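# (A quick aside, added here and not part of the original script: "bagging," as
# described above, is just a random forest that is allowed to consider every
# predictor at every split. With the randomForest package, which the class code
# installs and loads immediately below, you get plain bagging by setting mtry
# to the number of predictors.)

library(randomForest) # loaded here so this aside runs on its own
set.seed(123)
bag.cars <- randomForest(mpg ~ ., data=mtcars,
                         mtry=ncol(mtcars) - 1, # consider all 10 predictors at each split
                         importance=TRUE)
bag.cars # compare its error to the random forest fit below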
306 | install.packages("randomForest") 307 | library(randomForest) 308 | 309 | set.seed(123) 310 | rf.cars <- randomForest(mpg~., data=mtcars, importance=TRUE) 311 | 312 | # To create the variable importance plot, we use this line: 313 | varImpPlot(rf.cars) 314 | 315 | 316 | # The plot on the left shows us how much error we would add to our predictions if we 317 | # scrambled (permuted) that variable, measured by the increase in mean squared error. The 318 | # plot on the right describes 319 | # the total decrease in "node impurity" that comes from splitting on the variable- 320 | # this is another measure of how important it is to have 321 | # the variable in the model that explains the outcome (and more specifically 322 | # how consistently the outcome is correct for each observation 323 | # within that branch; a perfectly "pure" node or branch would 324 | # describe all cases exactly). 325 | 326 | # But the best way to figure out how solid your predictions are is to 327 | # use a training dataset or subset of the data to create the model, and 328 | # then see how well it predicts the outcomes in the rest of the data. 329 | 330 | 331 | # The most recent advance in regression trees is the concept of 332 | # boosting, which is similar to bagging but allows trees to grow 333 | # sequentially. If you want to learn more about these models, 334 | # check out the book "An Introduction to Statistical Learning" 335 | # and the gbm package. 336 | 337 | 338 | 339 | 340 | 341 | -------------------------------------------------------------------------------- /Class R Code/Class #9 R Code.R: -------------------------------------------------------------------------------- 1 | # SCRIPT FOR CLASS #9, COMPUTATIONAL SOCIOLOGY 2 | # Instructor: Chris Bail 3 | # Copyright, Chris Bail 4 | 5 | # The vast majority of the code we are using this week is for the program "NetLogo." While it is possible to 6 | # run NetLogo from R using the RNetLogo package, I opted not to do this because this package is currently 7 | # unstable. Instead, I provided more detailed annotation in the class slides for this week than I usually do, 8 | # so most of the activities for this class will not involve this code. 9 | 10 | # At the very end of the class, I explain how to write a simulation from scratch in R. The code below is adapted 11 | # from James Kitts and it implements the Threshold Model of Collective Behavior developed by Granovetter. In 12 | # this model, riots emerge because of a critical threshold wherein the risks of participating in the riot 13 | # decrease given a sufficient number of people who participate. (A bare-bones sketch of this threshold logic appears just below, before the full example.) 14 | 15 | # As in NetLogo, we must first initialize our simulation. But in R, we need to be a little more explicit (NetLogo 16 | # knows we may define parameters using sliders, but R does not know this, because there are no 17 | # sliders or buttons!). Here's what the Schelling Model looks like (note the large number of functions 18 | # within functions- just like NetLogo).
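# Before the Kitts code, here is the bare-bones sketch of the threshold model
# described above (an addition, not part of the original script): each person
# has a personal threshold and joins the riot once the share of current rioters
# reaches it. With thresholds spread evenly from 0 to 99/100, a single
# instigator tips the next person, who tips the next, and the whole population
# cascades.

n_people   <- 100
thresholds <- (0:(n_people - 1)) / n_people   # person i tips once i% of the population riots
rioting    <- thresholds == 0                 # the lone instigator starts things off

repeat {
  share       <- mean(rioting)                # current share of the population rioting
  new_rioting <- thresholds <= share          # everyone whose threshold is now met
  if (identical(new_rioting, rioting)) break  # nobody new tipped: we are at equilibrium
  rioting <- new_rioting
}

mean(rioting) # final share rioting (here 1; raise the second-lowest threshold and the cascade dies)

# Now, on to the full example adapted from Kitts: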
19 | 20 | 21 | 22 | # Number of people 23 | n <- 1000 24 | 25 | # The people will live in a square with area side^2 26 | side <- ceiling(sqrt(n)) 27 | 28 | df <- data.frame(x=((0:(n-1)) %% side), 29 | y=floor((0:(n-1)) / side), row.names=0:(n-1)) 30 | 31 | # Most lots will have a race; some will be empty, ie uninhabited 32 | races <- c("forestgreen", "dodgerblue", "darkred") 33 | 34 | # Assign races iid uniformly; leave roughly 10% of lots empty 35 | df$race <- sample(c(races, "empty"), n, replace=TRUE, 36 | prob=c(rep(0.90 / length(races), length(races)), 37 | 0.10)) 38 | 39 | PlotNeighborhood <- function() { 40 | 41 | with(subset(df, race != "empty"), 42 | plot(x, y, col=race, pch=20, axes=FALSE, 43 | xlab="", ylab="", xlim=c(0, side), ylim=c(0, side), 44 | main="A Schelling-esque Neighborhood")) 45 | 46 | } 47 | 48 | dev.new(height=8, width=8) 49 | par(mar=rep(1, 4), oma=rep(1, 4)) 50 | PlotNeighborhood() 51 | savePlot("neighborhood_before_movement.png") 52 | 53 | # Neighbors are counted within a Chebyshev distance <= depth 54 | depth <- 3 55 | 56 | CountNeighbors <- function(i) { 57 | 58 | # Count people of each race in person i's immediate neighborhood 59 | curr.x <- i %% side 60 | curr.y <- floor(i / side) 61 | neighbors <- subset(df, x %in% (curr.x - depth):(curr.x + depth) & 62 | y %in% (curr.y - depth):(curr.y + depth) & 63 | !(curr.x == x & curr.y == y)) 64 | 65 | return(sapply(races, function(x) { sum(neighbors$race == x) })) 66 | 67 | } 68 | 69 | # Apply CountNeighbors to the entire data frame; append results to df 70 | df <- cbind(df, t(sapply(0:(n - 1), CountNeighbors))) 71 | 72 | df$num.neighbors <- rowSums(df[ , races]) 73 | 74 | # Minimum fraction of own-race neighbors 75 | fraction <- 0.30 76 | 77 | GetPeopleWhoWantToMove <- function(curr.race) { 78 | 79 | subset.who.want.to.move <- 80 | subset(df, race == curr.race & 81 | get(curr.race) / num.neighbors < fraction) 82 | 83 | return(rownames(subset.who.want.to.move)) 84 | 85 | } 86 | 87 | AdjustNeighborCounts <- function(row, race.of.mover, delta) { 88 | 89 | curr.x <- as.integer(row) %% side 90 | curr.y <- floor(as.integer(row) / side) 91 | 92 | df[df$x %in% (curr.x - depth):(curr.x + depth) & 93 | df$y %in% (curr.y - depth):(curr.y + depth) & 94 | !(curr.x == df$x & curr.y == df$y), 95 | c(race.of.mover, "num.neighbors")] <<- 96 | df[df$x %in% (curr.x - depth):(curr.x + depth) & 97 | df$y %in% (curr.y - depth):(curr.y + depth) & 98 | !(curr.x == df$x & curr.y == df$y), 99 | c(race.of.mover, "num.neighbors")] + delta 100 | 101 | } 102 | 103 | MoveOnePerson <- function() { 104 | 105 | # Returns 1 if a person was successfully moved, and 0 otherwise 106 | 107 | people.who.want.to.move <- c(lapply(races, GetPeopleWhoWantToMove), 108 | recursive=TRUE) 109 | 110 | if (!length(people.who.want.to.move) >= 1) return(0) 111 | 112 | # Of people who want to move, pick one uniformly at random 113 | person.who.will.move <- sample(people.who.want.to.move, size=1) 114 | 115 | race.of.mover <- df$race[rownames(df) == person.who.will.move] 116 | 117 | possible.new.homes <- rownames( 118 | subset(df, race == "empty" & 119 | get(race.of.mover) / num.neighbors >= fraction)) 120 | 121 | if (!length(possible.new.homes) >= 1) return(0) 122 | 123 | # Of acceptable new homes, choose one uniformly at random 124 | new.home <- sample(possible.new.homes, size=1) 125 | 126 | df[rownames(df) == new.home, ]$race <<- race.of.mover 127 | df[rownames(df) == person.who.will.move, ]$race <<- "empty" 128 | 129 | AdjustNeighborCounts(person.who.will.move, 
race.of.mover, -1) 130 | AdjustNeighborCounts(new.home, race.of.mover, +1) 131 | 132 | return(1) 133 | 134 | } 135 | 136 | RunSimulation <- function(max.movements = 5000, plots.in.loop=TRUE) { 137 | 138 | par(mar=rep(1, 4), oma=rep(1, 4)) 139 | 140 | for(i in 1:max.movements) { 141 | 142 | if (!MoveOnePerson()) break 143 | if (plots.in.loop & (i %% 50) == 0) PlotNeighborhood() 144 | 145 | } 146 | 147 | } 148 | 149 | library(animation) 150 | 151 | saveVideo(RunSimulation(), 152 | video.name="schelling_neighborhood_model.mp4", 153 | interval=0.20, outdir=getwd()) 154 | 155 | dev.new(height=8, width=8) 156 | par(mar=rep(1, 4), oma=rep(1, 4)) 157 | PlotNeighborhood() 158 | savePlot("neighborhood_after_movement.png") -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Computational-Sociology 2 | 3 | This repository is for materials and group projects associated with my Computational Sociology course at Duke University. 4 | 5 | 6 | 7 | 8 | --------------------------------------------------------------------------------