├── .DS_Store
├── .Rhistory
├── .gitignore
├── Archive
│   ├── .DS_Store
│   ├── R-Programs-lessons-2021
│   │   ├── .DS_Store
│   │   ├── Day01_Intro_programs
│   │   │   ├── R_Demo_Intro_Program_Challenge_two_sum.R
│   │   │   └── R_Demo_Intro_start_here.R
│   │   ├── Day02_Data_Analysis_Managment
│   │   │   ├── R_Demo_Into_linear_model_predict.R
│   │   │   ├── R_Demo_Intro_data_visulization.R
│   │   │   ├── R_Demo_Intro_forloop_vs_lapply.R
│   │   │   ├── R_Demo_Intro_functions.R
│   │   │   ├── R_Demo_Intro_lists_dataframes_tables.R
│   │   │   ├── R_Demo_Intro_loops_logic_lists_dataframes_tables.R
│   │   │   └── R_Demo_Intro_vectors_matrices.R
│   │   ├── Day03_Simulation_Learning
│   │   │   ├── R_Demo_Intro_probability_review.R
│   │   │   ├── R_Demo_Simulation_Learning_Estimate_Mean.R
│   │   │   ├── R_Demo_Simulation_Learning_Estimate_linear_model.R
│   │   │   ├── R_Demo_Simulation_Learning_Potential_Outcomes_Framework.R
│   │   │   └── R_Demo_Simulation_Learning_Probability_Distributions.R
│   │   ├── Day04_Simulation_Inference
│   │   │   ├── R_Demo_Simulation_Inference_10_Fold_Cross_Validation.R
│   │   │   ├── R_Demo_Simulation_Inference_2_Sample_Cross_Validation.R
│   │   │   ├── R_Demo_Simulation_Inference_2_Sample_Hold_Out.R
│   │   │   ├── R_Demo_Simulation_Inference_Bootstrap.R
│   │   │   ├── R_Demo_Simulation_Inference_MCMC_Estimate_Mean.R
│   │   │   ├── R_Demo_Simulation_Inference_naive_Bayes.R
│   │   │   └── R_Demo_Simulation_Inference_naive_Bayes_10_Fold_Cross_Validation.R
│   │   ├── Day05_Measurement
│   │   │   ├── .Rhistory
│   │   │   ├── R_Demo_Measurement_SVD.R
│   │   │   ├── R_Demo_Measurement_factor_analysis.R
│   │   │   └── R_Demo_logit_transformation_latent_variable.R
│   │   ├── Day05_Measurement_RSTAN
│   │   │   ├── RSTAN_binary_IRT_NAs_simulation.R
│   │   │   ├── RSTAN_binary_IRT_simulation.R
│   │   │   ├── RSTAN_dyanamic_binary_IRT_NAs_simulation.R
│   │   │   ├── RSTAN_dynamic_binary_IRT_simulation.R
│   │   │   ├── RSTAN_linear_model_simulation.R
│   │   │   ├── RSTAN_logistic_regression_simulation.R
│   │   │   ├── RSTAN_mean_simulation.R
│   │   │   ├── RSTAN_normal_distribution_simulation.R
│   │   │   ├── RSTAN_ordered_IRT_simulation.R
│   │   │   └── StarTrek20210730.R
│   │   ├── Day06_Text_as_data_programs
│   │   │   ├── R_Demo_Ternary_Diriclet.R
│   │   │   ├── R_Demo_text_as_data_DTM.R
│   │   │   ├── R_Demo_text_as_data_DTM_stm_package.R
│   │   │   ├── R_Demo_text_as_data_DTM_tm_package.R
│   │   │   ├── R_Demo_text_as_data_NYT_Text_Process_v03.R
│   │   │   ├── R_Demo_text_as_data_package_description.R
│   │   │   ├── R_Demo_text_as_data_regular_expressions.R
│   │   │   ├── R_Demo_text_as_data_twitteR_get_twitter_users.R
│   │   │   ├── R_Demo_text_as_data_twitteR_setup.R
│   │   │   ├── R_Demo_text_as_data_word_probabilities_and_STM.R
│   │   │   └── R_Demo_text_as_wikip_stm.R
│   │   ├── Day07_Applied_Machine_Learning_AML
│   │   │   ├── R_Demo_AML_keras_example.R
│   │   │   ├── R_Demo_AML_neuralnet_gradiant_decent_glm.R
│   │   │   ├── R_Demo_AML_neuralnet_gradiant_decent_lm.R
│   │   │   ├── R_Demo_AML_neuralnet_gradiant_decent_mu.R
│   │   │   ├── R_Demo_AML_neuralnet_interaction_example.R
│   │   │   └── R_Demo_AML_neuralnet_squared_term_example.R
│   │   └── Day08_Model_Evaluation
│   │       ├── .DS_Store
│   │       ├── R_Demo_Model_Evaluation_false_discovery_rate.R
│   │       └── R_Demo_Model_Evaluation_precision_recall_accuracy.R
│   └── R-Programs-lessons-2022
│       ├── .DS_Store
│       ├── Day01_Intro_programs
│       │   ├── .Rhistory
│       │   ├── R_Demo_Intro_Program_Challenge_two_sum.R
│       │   └── R_Demo_Intro_start_here.R
│       ├── Day02_Data_Analysis_Managment
│       │   ├── .DS_Store
│       │   ├── .Rhistory
│       │   ├── R_Demo_Into_linear_model_predict.R
│       │   ├── R_Demo_Intro_data_visulization.R
│       │   ├── R_Demo_Intro_forloop_vs_lapply.R
│       │   ├── R_Demo_Intro_functions.R
│       │   ├── R_Demo_Intro_lists_dataframes_tables.R
│       │   ├── R_Demo_Intro_loops_logic_lists_dataframes_tables.R
│       │   ├── R_Demo_Intro_mean.R
│       │   ├── R_Demo_Intro_variance.R
│       │   └── R_Demo_Intro_vectors_matrices.R
│       ├── Day03_Simulation_Learning
│       │   ├── .DS_Store
│       │   ├── R_Demo_Intro_probability_review.R
│       │   ├── R_Demo_Simulation_Learning_Dice_Rolls.R
│       │   ├── R_Demo_Simulation_Learning_Estimate_Mean.R
│       │   ├── R_Demo_Simulation_Learning_Estimate_linear_model.R
│       │   ├── R_Demo_Simulation_Learning_Potential_Outcomes_Framework.R
│       │   ├── R_Demo_Simulation_Learning_ProbCalc_BayesRule.R
│       │   ├── R_Demo_Simulation_Learning_Probability_Continuous_Distributions.R
│       │   └── R_Demo_Simulation_Learning_Probability_Discrete_Distributions.R
│       ├── Day04_Simulation_Inference
│       │   ├── .DS_Store
│       │   ├── R_Demo_Simulation_Inference_10_Fold_Cross_Validation.R
│       │   ├── R_Demo_Simulation_Inference_2_Sample_Cross_Validation.R
│       │   ├── R_Demo_Simulation_Inference_2_Sample_Hold_Out.R
│       │   ├── R_Demo_Simulation_Inference_Bootstrap.R
│       │   ├── R_Demo_Simulation_Inference_MCMC_Estimate_Mean.R
│       │   ├── R_Demo_Simulation_Inference_naive_Bayes.R
│       │   └── R_Demo_Simulation_Inference_naive_Bayes_10_Fold_Cross_Validation.R
│       ├── Day05_Measurement_Intro
│       │   ├── .Rhistory
│       │   ├── R_Demo_Measurement_SVD.R
│       │   ├── R_Demo_Measurement_factor_analysis.R
│       │   └── R_Demo_logit_transformation_latent_variable.R
│       ├── Day05_Measurement_RSTAN
│       │   ├── RSTAN_binary_IRT_NAs_simulation.R
│       │   ├── RSTAN_binary_IRT_simulation.R
│       │   ├── RSTAN_dyanamic_binary_IRT_NAs_simulation.R
│       │   ├── RSTAN_dynamic_binary_IRT_simulation.R
│       │   ├── RSTAN_linear_model_simulation.R
│       │   ├── RSTAN_logistic_regression_simulation.R
│       │   ├── RSTAN_mean_simulation.R
│       │   ├── RSTAN_normal_distribution_simulation.R
│       │   ├── RSTAN_ordered_IRT_simulation.R
│       │   ├── StarTrek20210730.R
│       │   └── StarTrekQuiz20220829.csv
│       ├── Day06_Text_as_data_programs
│       │   ├── .DS_Store
│       │   ├── R_Demo_Ternary_Diriclet.R
│       │   ├── R_Demo_google_trends_Human_Rights.R
│       │   ├── R_Demo_text_as_data_DTM.R
│       │   ├── R_Demo_text_as_data_DTM_stm_package.R
│       │   ├── R_Demo_text_as_data_DTM_tm_package.R
│       │   ├── R_Demo_text_as_data_NYT_Text_Process_v03.R
│       │   ├── R_Demo_text_as_data_package_description.R
│       │   ├── R_Demo_text_as_data_regular_expressions.R
│       │   ├── R_Demo_text_as_data_twitteR_get_twitter_users.R
│       │   ├── R_Demo_text_as_data_twitteR_get_twitter_users_Oauth_v2.R
│       │   ├── R_Demo_text_as_data_word_probabilities_and_STM.R
│       │   ├── R_Demo_text_as_wikip_stm.R
│       │   └── google_trends_stuff
│       │       ├── R_Demo_google_trends_Texas.R
│       │       ├── R_Demo_google_trends_trump_brexit.R
│       │       ├── google_search_trends_paired_comparisons.R
│       │       └── groundhog_library_func.R
│       ├── Day07_Applied_Machine_Learning_AML_Intro
│       │   ├── R_Demo_AML_keras_example.R
│       │   ├── R_Demo_AML_neuralnet_gradiant_decent_glm.R
│       │   ├── R_Demo_AML_neuralnet_gradiant_decent_lm.R
│       │   ├── R_Demo_AML_neuralnet_gradiant_decent_mu.R
│       │   ├── R_Demo_AML_neuralnet_interaction_example.R
│       │   └── R_Demo_AML_neuralnet_squared_term_example.R
│       ├── Day08_Applied_Machine_Learning_AML_HuggingFace
│       │   └── R_Demo_huggingface_intro.R
│       └── Day09_Model_Evaluation_Review
│           ├── .DS_Store
│           ├── R_Demo_Model_Evaluation_false_discovery_rate.R
│           ├── R_Demo_Model_Evaluation_precision_recall_accuracy.R
│           └── R_Demo_Model_Evaluation_standard_errors_tstats_pvalues_CIs.R
├── Datasets
│   ├── .DS_Store
│   ├── Anscombes_quartet.csv
│   ├── Myobject.Rdata
│   ├── Myworkspace.Rdata
│   ├── NYT_Text_Articles
│   │   ├── .DS_Store
│   │   ├── metadata.csv
│   │   └── unProcessed_Files
│   │       ├── NYT_1981
│   │       ├── NYT_1982.txt
│   │       ├── NYT_1983
│   │       └── NYT_1984
│   ├── SIMpoliticalTweets.txt
│   ├── macro.csv
│   ├── ny_stop_frisk.csv
│   ├── ny_stop_frisk_black.csv
│   ├── stopwords_twitter.txt
│   ├── tweet_data.Rdata
│   ├── tweet_data.csv
│   └── users-by-social-media-platform.csv
├── R-Program-Lessons
│   ├── .DS_Store
│   ├── Day01_Intro_programs
│   │   ├── .DS_Store
│   │   ├── R_Demo_Intro_Program_Challenge_two_sum.R
│   │   └── R_Demo_Intro_start_here.R
│   ├── Day02_Data_Analysis_Managment
│   │   ├── .DS_Store
│   │   ├── R_Demo_Into_linear_model_predict.R
│   │   ├── R_Demo_Intro_data_visulization.R
│   │   ├── R_Demo_Intro_forloop_vs_lapply.R
│   │   ├── R_Demo_Intro_functions_part1.R
│   │   ├── R_Demo_Intro_functions_part2.R
│   │   ├── R_Demo_Intro_lists_dataframes_tables.R
│   │   ├── R_Demo_Intro_loops_logic_lists_dataframes_tables.R
│   │   ├── R_Demo_Intro_mean.R
│   │   ├── R_Demo_Intro_variance.R
│   │   └── R_Demo_Intro_vectors_matrices.R
│   ├── Day03_Simulation_Learning
│   │   ├── .DS_Store
│   │   ├── R_Demo_Intro_probability_review.R
│   │   ├── R_Demo_Simulation_Learning_Dice_Rolls.R
│   │   ├── R_Demo_Simulation_Learning_Estimate_Mean.R
│   │   ├── R_Demo_Simulation_Learning_Estimate_linear_model.R
│   │   ├── R_Demo_Simulation_Learning_Potential_Outcomes_Framework.R
│   │   ├── R_Demo_Simulation_Learning_ProbCalc_BayesRule.R
│   │   ├── R_Demo_Simulation_Learning_Probability_Continuous_Distributions.R
│   │   └── R_Demo_Simulation_Learning_Probability_Discrete_Distributions.R
│   ├── Day04_Simulation_Inference
│   │   ├── .DS_Store
│   │   ├── R_Demo_Simulation_Inference_10_Fold_Cross_Validation.R
│   │   ├── R_Demo_Simulation_Inference_2_Sample_Cross_Validation.R
│   │   ├── R_Demo_Simulation_Inference_2_Sample_Hold_Out.R
│   │   ├── R_Demo_Simulation_Inference_Bootstrap.R
│   │   ├── R_Demo_Simulation_Inference_MCMC_Estimate_Mean.R
│   │   ├── R_Demo_Simulation_Inference_naive_Bayes.R
│   │   └── R_Demo_Simulation_Inference_naive_Bayes_10_Fold_Cross_Validation.R
│   ├── Day05_Measurement_Intro
│   │   ├── .Rhistory
│   │   ├── R_Demo_Measurement_SVD.R
│   │   ├── R_Demo_Measurement_factor_analysis.R
│   │   └── R_Demo_logit_transformation_latent_variable.R
│   ├── Day05_Measurement_RSTAN
│   │   ├── .DS_Store
│   │   ├── RSTAN_binary_IRT_NAs_simulation.R
│   │   ├── RSTAN_binary_IRT_simulation.R
│   │   ├── RSTAN_dyanamic_binary_IRT_NAs_simulation.R
│   │   ├── RSTAN_dynamic_binary_IRT_simulation.R
│   │   ├── RSTAN_linear_model_simulation.R
│   │   ├── RSTAN_logistic_regression_simulation.R
│   │   ├── RSTAN_mean_simulation.R
│   │   ├── RSTAN_normal_distribution_simulation.R
│   │   ├── RSTAN_ordered_IRT_simulation.R
│   │   ├── StarTrek20210730.R
│   │   └── StarTrekQuiz20220829.csv
│   ├── Day06_Text_as_data_programs
│   │   ├── .DS_Store
│   │   ├── R_Demo_Ternary_Diriclet_visulaization.R
│   │   ├── R_Demo_google_search_trends_paired_comparisons.R
│   │   ├── R_Demo_google_trends_Human_Rights.R
│   │   ├── R_Demo_google_trends_Texas.R
│   │   ├── R_Demo_text_as_data_DTM.R
│   │   ├── R_Demo_text_as_data_DTM_stm_package.R
│   │   ├── R_Demo_text_as_data_DTM_tm_package.R
│   │   ├── R_Demo_text_as_data_NYT_Text_Process_v03.R
│   │   ├── R_Demo_text_as_data_package_description.R
│   │   ├── R_Demo_text_as_data_regular_expressions.R
│   │   ├── R_Demo_text_as_data_word_probabilities_and_STM.R
│   │   ├── R_Demo_text_as_wikip_stm.R
│   │   ├── R_twitter_examples
│   │   │   ├── R_Demo_text_as_data_twitteR_get_twitter_users.R
│   │   │   └── R_Demo_text_as_data_twitteR_get_twitter_users_Oauth_v2.R
│   │   └── groundhog_library_func.R
│   ├── Day07_Applied_Machine_Learning_AML_Intro
│   │   ├── R_Demo_AML_MNIST_example.R
│   │   ├── R_Demo_AML_keras_example.R
│   │   ├── R_Demo_AML_neuralnet_gradiant_decent_glm.R
│   │   ├── R_Demo_AML_neuralnet_gradiant_decent_lm.R
│   │   ├── R_Demo_AML_neuralnet_gradiant_decent_mu.R
│   │   ├── R_Demo_AML_neuralnet_interaction_example.R
│   │   └── R_Demo_AML_neuralnet_squared_term_example.R
│   ├── Day08_Applied_Machine_Learning_AML_HuggingFace
│   │   └── R_Demo_huggingface_intro.R
│   ├── Day09_Model_Evaluation_Review
│   │   ├── .DS_Store
│   │   ├── R_Demo_Model_Evaluation_false_discovery_rate.R
│   │   ├── R_Demo_Model_Evaluation_precision_recall_accuracy.R
│   │   └── R_Demo_Model_Evaluation_standard_errors_tstats_pvalues_CIs.R
│   └── Day10_Additional_Programs
│       ├── dynamic_IRT_practice_v1.R
│       ├── dynamic_IRT_practice_v2.R
│       ├── dynamic_IRT_practice_v3.R
│       ├── dynamic_IRT_practice_v4.R
│       └── dynamic_IRT_practice_v5.R
├── README.md
└── Rplots
    └── Google_search_term_pairs_longlist.pdf

/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/.DS_Store
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.Rproj
.Rproj.user/
--------------------------------------------------------------------------------
/Archive/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/Archive/.DS_Store
--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/Archive/R-Programs-lessons-2021/.DS_Store
--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day01_Intro_programs/R_Demo_Intro_Program_Challenge_two_sum.R:
--------------------------------------------------------------------------------
## R_Demo_Intro_Program_Challenge_two_sum.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2021
##
## Date: 2021-07-24
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
## Instructions:
##
## These challenges are meant to be just that, challenging.
## They should also be fun. I encourage you to think creatively and collaboratively.
## Getting stuck or not finishing all the steps is expected and encouraged.
## This is how learning works!
## Learn to program, program to learn.
##
## Always start with step (1) and then continue to each step as time permits.
## Don't worry about completing each step. Document your code for each step.
## You may wish to come back to some of the harder steps as you progress through the course.
## Note that some of the steps may ask you to use skills we have not yet covered in the course.
## Don't worry about these steps now but definitely think through the programming logic if you are stuck and make plans to come back to try them once you feel ready.
##
##########################################################################
##
## Steps for the Challenge
##
## (1) create a vector of integers and a scalar integer target
## (2) write a program in R that determines (returns) two numbers from the vector that add up to the target scalar
## (3) how many combinations of numbers in the vector of integers add up to the target scalar?
## (4) write a function to complete steps 1-3 (hint: wrap the program from (3) within a function)
## (5) write a simulation that explores the relationship between (a) the vector of integers and (b) the scalar integer target sum
## (6) re-write the program or function so that it takes fewer steps to calculate the number of numeric combinations that add up to the scalar target
##
##########################################################################

## step 1: create a vector of integers
int_numbers <- 1:10
int_numbers

## step 1: and a scalar integer target
target_number <- 6
target_number

## step 2: write a program in R that determines (returns) two numbers that add up to the target scalar
count <- 1
count

value <- c()
value

mat <- matrix(NA, nrow=length(int_numbers), ncol=length(int_numbers))
mat

for(i in 1:length(int_numbers)){
    for(j in 1:length(int_numbers)){
        mat[i,j] <- int_numbers[i] + int_numbers[j]
        value[count] <- ifelse(mat[i,j]==target_number, TRUE, FALSE)
        count <- count + 1
    }
}
count
mat
value

## step 3: how many combinations of numbers in the vector of integers add up to the target scalar?
sum(value)
sum(value==TRUE)


## step 4: write a function to complete steps 1-3

sum_func <- function(int_numbers=1:10, target_number=2){
    count <- 1
    value <- c()
    mat <- matrix(NA, nrow=length(int_numbers), ncol=length(int_numbers))
    for(i in 1:length(int_numbers)){
        for(j in 1:length(int_numbers)){
            mat[i,j] <- int_numbers[i] + int_numbers[j]
            value[count] <- ifelse(mat[i,j]==target_number, TRUE, FALSE)
            count <- count + 1
        }
    }
    mat
    value
    return(sum(value==TRUE))
}

## print function definition to screen
sum_func

## use the function with its default arguments
sum_func()

## use the function with other arguments
sum_func(int_numbers=1:10, target_number=6)

sum_func(int_numbers=1:100, target_number=6)

sum_func(int_numbers=1:10, target_number=60)

sum_func(int_numbers=1:100, target_number=60)


## try to figure out step 5 and step 6 here
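## one possible sketch for step 6 (there are several ways to do this):
## replace the double for-loop with a single call to outer(), which computes
## all pairwise sums at once; like the loop above, it counts ordered pairs,
## including pairs where i equals j
sum(outer(int_numbers, int_numbers, FUN="+") == target_number)

## the same idea wrapped as a function for comparison with sum_func()
sum_func2 <- function(int_numbers=1:10, target_number=2){
    sum(outer(int_numbers, int_numbers, FUN="+") == target_number)
}
sum_func2(int_numbers=1:10, target_number=6)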
--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day02_Data_Analysis_Managment/R_Demo_Intro_forloop_vs_lapply.R:
--------------------------------------------------------------------------------
## R_Demo_Intro_forloop_vs_lapply.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2021
##
## Date: 2021-07-24
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
## Introduction to tutorial:
##
## in-class coding of loops and lapply

1:10

length(1:10)


## for loop version
output <- c()
for(i in 1:10){

    local_var <- i*i ## the simple but core calculation of the iterative process
    output[i] <- local_var

}
output


## lapply version
output <- lapply(1:10, function(i){

    local_var <- i*i ## the simple but core calculation of the iterative process
    return(local_var)
})
unlist(output)




mat <- matrix(NA, nrow=10, ncol=5)
mat

for(i in 1:10){
    for(j in 1:5){
        local_var <- i*j
        mat[i,j] <- local_var
    }
}
mat

mat_list <- lapply(1:10, function(i){

    local_vec <- c()
    for(j in 1:5){
        local_var <- i*j
        local_vec[j] <- local_var
    }

    return(local_vec)
})

unlist(mat_list)
mat <- matrix(unlist(mat_list), nrow=10, ncol=5, byrow=T)
mat


mat_list <- lapply(1:10, function(i){
    local_out <- lapply(1:5, function(j){
        temp <- i*j
        return(temp)
    })
    return(local_out)
})

mat_list
unlist(mat_list)
mat <- matrix(unlist(mat_list), nrow=10, ncol=5, byrow=T)
mat
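## a quick timing comparison (a sketch, not part of the lesson above): for a
## toy calculation like this the difference is negligible, but system.time()
## makes the for-loop vs lapply comparison concrete
n_reps <- 100000
system.time({
    output <- c()
    for(i in 1:n_reps) output[i] <- i*i
})
system.time({
    output <- unlist(lapply(1:n_reps, function(i) i*i))
})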
--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day04_Simulation_Inference/R_Demo_Simulation_Inference_10_Fold_Cross_Validation.R:
--------------------------------------------------------------------------------
#### R_Demo_Simulation_Inference_10_Fold_Cross_Validation.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2021
##
## Date: 2021-07-24
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
##
## Goal: Improve the predictive power or predictive validity of a model when applied to new observed values
##
##########################################################################
## Introduction to tutorial:
##
## For this R tutorial we will simulate a dataset and then randomly divide it into ten subsets.
##
## We will then fit a model using the observations from 9 of the 10 subsets of data (the training data) and use the model estimates to predict the value of the dependent variable for the remaining out-of-sample subset (the test data).
##
## After that step, we rotate: each subset takes one turn as the test data while the model is fit to the other nine subsets.
##
## In this way, every observation in the data is predicted using a model estimated with data not used in the fitting.
##
## This process is increasingly common and required in almost all Machine Learning and predictive tasks in data science and increasingly so in the social sciences.
##
##########################################################################


#rm(list = ls())

set.seed(940)

## set number of observations for simulation
n <- 100

## number of folds (randomly created sub samples of data)
k <- 10

## simulation of variables
x <- sample(4:14,n,replace=TRUE)
y <- -5.996 + 2.781*x -0.127*x^2 + rnorm(n,0,1)
#y <- -5.996 + 2.781*x -0.127*x^2 + rnorm(n,0,2)

plot(x,y)

## create a subject/unit ID variable with one value for each unit
## here the indicator takes on the k-fold values {1,2,...,10}
folds <- sample(rep(1:k, k), n, replace=FALSE)

table(folds)

## create a data frame with the dependent variable, independent variable, and randomly created ID
dat <- data.frame(y, x, folds)

## create vectors for storing predictions
dat$y.hat1 <- NA
dat$y.hat2 <- NA

#test <- matrix(NA, nrow=k, ncol=2)

## loop over each of the k folds, holding out fold i as the test set
for(i in 1:k){

    ## fit a linear model
    fit1 <- lm(y ~ x, data=subset(dat, folds!=i))
    pred1 <- predict(fit1, newdata=subset(dat, folds==i))
    y.hat1 <- as.numeric(pred1)

    dat$y.hat1[dat$folds==i] <- y.hat1


    ## fit a linear model with a squared term
    fit2 <- lm(y ~ x + I(x^2), data=subset(dat, folds!=i))
    pred2 <- predict(fit2, newdata=subset(dat, folds==i))
    y.hat2 <- as.numeric(pred2)

    dat$y.hat2[dat$folds==i] <- y.hat2

    print(summary(dat))
}

rmse.fit1 <- sqrt(mean((dat$y.hat1-dat$y)^2))
rmse.fit1

rmse.fit2 <- sqrt(mean((dat$y.hat2-dat$y)^2))
rmse.fit2


cor.fit1 <- cor(dat$y.hat1, dat$y, method="spearman")
cor.fit1

cor.fit2 <- cor(dat$y.hat2, dat$y, method="spearman")
cor.fit2

c(cor.fit1, cor.fit2)
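## a quick visual check (a sketch using the objects defined above): fit2
## includes the squared term that generated the data, so its out-of-fold
## predictions should track the observed y more closely than fit1's
plot(dat$y, dat$y.hat2, xlab="observed y", ylab="out-of-fold prediction")
abline(a=0, b=1, col=2)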
--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day04_Simulation_Inference/R_Demo_Simulation_Inference_2_Sample_Hold_Out.R:
--------------------------------------------------------------------------------
#### R_Demo_Simulation_Inference_2_Sample_Hold_Out.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2021
##
## Date: 2021-07-24
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
##
## Goal: Improve the predictive power or predictive validity of a model
##
##########################################################################
## Introduction to tutorial:
##
## (1) Begin building towards and learning about cross-validation
## (NOTE: There is no "crossing" yet)
##
## For this R tutorial we will simulate a dataset and then randomly divide it into two subsets.
##
## We will fit a model using the observations from one of the subsets of data (training data).
##
## We will then use the model estimates to predict the value of the dependent variable for the remaining out-of-sample data subset (testing data).
##
##########################################################################


set.seed(940)


## set number of observations for simulation
n <- 100

## simulation of variables (this model is based on one of the datasets from Anscombe's quartet)
x <- sample(4:14,n,replace=TRUE)
y <- -5.996 + 2.781*x -0.127*x^2 + rnorm(n,0,1)
## note: the line below overwrites y using a larger error standard deviation
y <- -5.996 + 2.781*x -0.127*x^2 + rnorm(n,0,2)

## plot the simulated relationship
par(mfrow=c(1,1))
plot(x=x, y=y)

## create a subject/unit ID variable with one value for each unit
## here the indicator takes on the 2-fold values {1,2}
folds <- sample(rep(1:2, n/2), n, replace=FALSE)
folds

table(folds)

## create a data frame with the dependent variable, independent variable, and randomly created ID
dat <- data.frame(y, x, folds)

summary(dat)

head(dat)

## fit a linear model to the full dataset
model <- lm(y ~ x, data=dat)
summary(model)


## subset the full dataset into two subsets based on the ID variable
train <- subset(dat, folds==1)
test <- subset(dat, folds==2)

## the bracket-based subsets below are equivalent to the subset() calls above
train <- dat[dat$folds==1,]
test <- dat[dat$folds==2,]

nrow(train)
nrow(test)


## Model 0: fit an intercept-only linear model
fit <- lm(y ~ 1, data=train)
pred <- predict(fit, newdata=test)
rmse <- sqrt(mean((as.numeric(pred)-test$y)^2))
rmse

## Model 1: fit a linear model
fit <- lm(y ~ x, data=train)
pred <- predict(fit, newdata=test)
rmse <- sqrt(mean((as.numeric(pred)-test$y)^2))
rmse

## Model 2: fit a linear model with a squared term
fit <- lm(y ~ x + I(x^2), data=train)
pred <- predict(fit, newdata=test)
rmse <- sqrt(mean((as.numeric(pred)-test$y)^2))
rmse


##################################################
## Model 2 again: the same model evaluated in-sample on the training data (for comparison)
fit <- lm(y ~ x + I(x^2), data=train)
pred <- predict(fit, newdata=train)
rmse <- sqrt(mean((as.numeric(pred)-train$y)^2))
rmse
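## a hedged extension (sketch): a single random split can be noisy, so
## repeat the 2-sample hold-out many times and average the test RMSE of the
## squared-term model for a more stable estimate
rmse_reps <- replicate(100, {
    new_folds <- sample(rep(1:2, n/2), n, replace=FALSE)
    fit <- lm(y ~ x + I(x^2), data=dat[new_folds==1,])
    pred <- predict(fit, newdata=dat[new_folds==2,])
    sqrt(mean((as.numeric(pred) - dat$y[new_folds==2])^2))
})
mean(rmse_reps)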
--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day04_Simulation_Inference/R_Demo_Simulation_Inference_naive_Bayes.R:
--------------------------------------------------------------------------------
#### R_Demo_Simulation_Inference_naive_Bayes.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2021
##
## Date: 2021-07-24
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
## Introduction to tutorial:
##
## Overview of Bayes rule and the calculation of conditional probability. Introduces the naive Bayes classifier function from the e1071 library.
##
## P(outcome | evidence) = P(outcome) * P(evidence | outcome) / P(evidence)
##
## Below is Bayes' theorem:
## P(A | B) = P(A) * P(B | A) / P(B)
##
## Which can be derived from the general multiplication formula for AND events:
## P(A and B) = P(A) * P(B | A)
## P(B | A) = P(A and B) / P(A)
## P(B | A) = P(B) * P(A | B) / P(A)
## P(y|x) = P(x|y) * P(y) / P(x)
## P(x|y) = P(x AND y) / P(x)
##
## Pr(A[1]) = Pr(y==0)
## Pr(A[2]) = Pr(y==1)
## Pr(B | A[1]) = Pr(Data | y==0)
## Pr(B | A[2]) = Pr(Data | y==1)
##
##########################################################################


## load libraries
library(e1071)
library(LaplacesDemon)


## example code from BayesTheorem() function
PrA <- c(0.75,0.25)
PrBA <- c(6/9, 5/7)
BayesTheorem(PrA, PrBA)


## create fake data
n <- 10
x <- c(rep(0,n/2), rep(1,n/2))
y <- c(0,0,0,1,1,0,0,1,1,1)

## inspect data
cbind(y,x)

## inspect tabulation of data
table(y,x)


## calculate the probability of the evidence/data
## (the evidence Pr(x==1) is the same for both elements of the vector)
PrX <- NA
PrX[1] <- sum(as.numeric(x==1)) / n
PrX[2] <- sum(as.numeric(x==1)) / n

## calculate the probability of the outcome
PrY <- NA
PrY[1] <- sum(as.numeric(y==0))/n
PrY[2] <- sum(as.numeric(y==1))/n
PrY

## calculate the probability of the data conditional on the value of y (the likelihood)
PrXY <- NA
PrXY[1] <- sum(x[y==0])/length(as.numeric(x[y==0]))
PrXY[2] <- sum(x[y==1])/length(as.numeric(x[y==1]))
PrXY

## apply Bayes rule
PrXY * PrY / PrX

## apply Bayes rule with the BayesTheorem() function
BayesTheorem(PrA=PrY, PrBA=PrXY)


## apply Bayes rule with the naiveBayes() function
fit <- naiveBayes(y~x, data=data.frame(y,x))
fit
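## one way to verify the fit (a sketch; df and fit2 are illustrative names):
## the posterior returned by predict.naiveBayes() with type="raw" for an
## observation with x==1 should match the hand-computed Bayes rule result above
df <- data.frame(y=as.factor(y), x=as.factor(x))
fit2 <- naiveBayes(y ~ x, data=df)
predict(fit2, newdata=data.frame(x=factor(1, levels=levels(df$x))), type="raw")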
--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day04_Simulation_Inference/R_Demo_Simulation_Inference_naive_Bayes_10_Fold_Cross_Validation.R:
--------------------------------------------------------------------------------
#### R_Demo_Simulation_Inference_naive_Bayes_10_Fold_Cross_Validation.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2021
##
## Date: 2021-07-24
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
## Introduction to tutorial:
##
## (1) Program simulates count data that is then used to predict a binary outcome variable.
##
## (2) Three models are evaluated using the count data to predict the outcome:
## (2a) linear model
## (2b) generalized linear model with a logit link function
## (2c) naive Bayes classifier.
##
##########################################################################


## load library
library(e1071)


## set the number of observations for the simulation
n <- 100

## unobserved
x <- runif(n,0,1)

## observed counts
x1 <- rpois(n, lambda=x)
x2 <- rpois(n, lambda=x)
x3 <- rpois(n, lambda=2*x)
x4 <- rpois(n, lambda=2*x)
x5 <- rpois(n, lambda=4*x)

## systematic component of the model based on observed counts
xb <- -2 + x1 + x2 + x3 + x4 + x5

## transform the linear term xb using
## the inverse logit function
## so that pi is bound from 0 to 1
pi <- 1 / (1 + exp(-xb))

## generate the dependent variable y with probability pi and measurement error from a Bernoulli trial
y <- rbinom(n, size=1, prob=pi)


## make data frame
dat <- data.frame(y, x1, x2, x3, x4, x5)


## summarize fit using linear model
summary(lm(y ~ x1 + x2 + x3 + x4 + x5, data=dat))


## summarize fit using glm using the logit link function
summary(glm(y ~ x1 + x2 + x3 + x4 + x5, family=binomial(link="logit"), data=dat))


## summarize fit using naiveBayes model
naiveBayes(as.factor(y) ~ x1 + x2 + x3 + x4 + x5, data=dat)


## create vectors for storing predictions
dat$y.hat1 <- NA
dat$y.hat2 <- NA
dat$y.hat3 <- NA

## select number of folds
k <- 10

## create vector of folds for cross validation
dat$folds <- sample(rep(1:k, k), n, replace=FALSE)

## loop over each of the k folds, holding out fold i as the test set
for(i in 1:k){

    ## fit a linear model
    fit1 <- lm(y ~ x1 + x2 + x3 + x4 + x5, data=subset(dat, folds!=i))
    pred1 <- predict(fit1, newdata=subset(dat, folds==i))
    y.hat1 <- as.numeric(pred1)

    dat$y.hat1[dat$folds==i] <- y.hat1


    ## fit a glm model (type="response" returns predicted probabilities rather than values on the link scale)
    fit2 <- glm(y ~ x1 + x2 + x3 + x4 + x5, binomial(link="logit"), data=subset(dat, folds!=i))
    pred2 <- predict(fit2, newdata=subset(dat, folds==i), type="response")
    y.hat2 <- as.numeric(pred2)

    dat$y.hat2[dat$folds==i] <- y.hat2


    ## fit a naiveBayes classifier model
    fit3 <- naiveBayes(as.factor(y) ~ x1 + x2 + x3 + x4 + x5, data=subset(dat, folds!=i))
    pred3 <- predict(fit3, newdata=subset(dat, folds==i))
    ## convert the factor prediction back to its underlying 0/1 values
    y.hat3 <- as.numeric(as.character(pred3))

    dat$y.hat3[dat$folds==i] <- y.hat3

    #print(summary(dat))
}

rmse.fit1 <- sqrt(mean((dat$y.hat1-dat$y)^2))
rmse.fit1

cor.fit1 <- cor(dat$y.hat1, dat$y, method="spearman")

rmse.fit2 <- sqrt(mean((dat$y.hat2-dat$y)^2))
rmse.fit2

cor.fit2 <- cor(dat$y.hat2, dat$y, method="spearman")

rmse.fit3 <- sqrt(mean((dat$y.hat3-dat$y)^2))
rmse.fit3

cor.fit3 <- cor(dat$y.hat3, dat$y, method="spearman")

c(cor.fit1, cor.fit2, cor.fit3)
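## RMSE is an odd yardstick for a binary outcome, so as a hedged complement
## (sketch) compute simple classification accuracy at a 0.5 threshold
mean(as.numeric(dat$y.hat1 > 0.5) == dat$y) ## linear model
mean(as.numeric(dat$y.hat2 > 0.5) == dat$y) ## logistic regression
mean(dat$y.hat3 == dat$y) ## naive Bayes (predictions are already 0/1)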
--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day05_Measurement_RSTAN/RSTAN_linear_model_simulation.R:
--------------------------------------------------------------------------------
## RSTAN_linear_model_simulation.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: A Crash Course in Measurement and Latent Variable Modeling with RSTAN
##
## Please e-mail me if you find any errors or have any suggestions
## e-mail: cjf0006@gmail.com
##
##########################################################################
##
## Introduction to tutorial:
##
## For this R tutorial we will estimate a linear model with normally distributed errors using the STAN program. The slope, intercept, and error-scale parameters will be estimated based on the likelihood function that links them to the data contained in the y and x variables, which are simulated observed data. The model produces the slope and intercept from a standard linear model, which is also estimated using lm() in R.
##
##########################################################################

## load library
library(rstan) # load rstan library
library(MASS) # load library with truehist function

## -------------------------------------------------- #
## define STAN model as a character
## -------------------------------------------------- #
model <- "
data {
    // declare the data in memory
    int n;
    vector[n] y;
    vector[n] x;
}
parameters {
    // declare the parameters in memory
    real alpha;
    real beta;
    real<lower=0> sigma; // scale parameters should be declared with a lower bound of 0
}
model {
    // priors (the second argument to normal() is a standard deviation, not a variance or a precision)
    alpha ~ normal(0,10);
    beta ~ normal(0,10);

    // likelihood (link data to some combination of parameters and more data)
    for(i in 1:n){
        y[i] ~ normal(alpha + beta * x[i], sigma);
    }
}
generated quantities {
    // posterior predictions
    vector[n] y_predict;

    // the loop is necessary within the generated quantities block
    for(i in 1:n){
        y_predict[i] = normal_rng(alpha + beta * x[i], sigma);
    }
}
"
## -------------------------------------------------- #


## set data for simulation
n <- 100
x <- rnorm(n,0,1)
alpha <- 1.25
beta <- 2.50

## simulate a dependent variable with normally distributed error using the data and parameter values defined above
error <- rnorm(n)
y <- alpha + beta * x + error

plot(x=x, y=y)

## fit linear model
summary(lm(y~x))

## create data list
data_list <- list(y = y, x=x, n=n)

## set time start variable
time1 <- Sys.time()

## fit stan model
fit <- stan(model_code = model, data = data_list, iter = 1000, chains = 4)

## calculate the duration of the program file up to this point
print(Sys.time() - time1)

## extract draws from stan model object (creates a list object)
output <- extract(fit, permuted = TRUE)

## print names of each element/slot in the list
names(output)

## print model fit object
fit

## there are a number of methods to subset and summarize parameters
## keep in mind that the output object is a list that contains vectors or matrices of posterior estimates for each of the named parameters defined in the model statement above
## lapply (list-apply) a function to all of the objects in the list
lapply(output, mean)[1:3]
lapply(output, sd)[1:3]

## create a matrix using some of the named slots in the list
model_parameters <- as.matrix(fit, pars = c("alpha", "beta", "sigma"))
model_predictions <- as.matrix(fit, pars = "y_predict")

## check the dimensions (they should be the same)
dim(model_predictions)
dim(output$y_predict)

## plot the simulated y variable and the estimated posterior means
plot(y, apply(model_predictions,2,mean))
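## a hedged follow-up using the objects defined above (sketch): summarize the
## posterior predictive draws with 95% intervals around each observation's
## posterior mean prediction
y_hat_mean <- apply(model_predictions, 2, mean)
y_hat_lo <- apply(model_predictions, 2, quantile, probs=0.025)
y_hat_hi <- apply(model_predictions, 2, quantile, probs=0.975)
plot(y, y_hat_mean, ylim=range(c(y_hat_lo, y_hat_hi)))
segments(x0=y, y0=y_hat_lo, y1=y_hat_hi, col="grey")
abline(a=0, b=1, col=2)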
--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day05_Measurement_RSTAN/RSTAN_logistic_regression_simulation.R:
--------------------------------------------------------------------------------
## RSTAN_logistic_regression_simulation.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: A Crash Course in Measurement and Latent Variable Modeling with RSTAN
##
## Please e-mail me if you find any errors or have any suggestions
## e-mail: cjf0006@gmail.com
##
##########################################################################
##
## Introduction to tutorial:
##
## For this R tutorial we will simulate a binary dependent variable and then estimate the parameters that generate the variable. These parameters will be estimated based on the likelihood function that links them to the data contained in the y and x variables, which are simulated observed data. The model produces the slope and intercept from a standard logistic regression model, which is also estimated using glm() in R.
##
##########################################################################

## load library
library(rstan) # load rstan library
library(MASS) # load library with truehist function

## -------------------------------------------------- ##
## define STAN model
## -------------------------------------------------- ##
model <- "
data {
    // declare the data in memory
    int n;
    int y[n];
    vector[n] x;
}
// declare the parameters in memory
parameters {
    real alpha;
    real beta;
}
model {
    // priors (the second argument to normal() is a standard deviation, not a variance or a precision)
    alpha ~ normal(0,10);
    beta ~ normal(0,10);

    // likelihood (link data to some combination of parameters and more data)
    y ~ bernoulli_logit(alpha + beta * x);
}
generated quantities {
    // posterior predictions
    vector[n] y_predict;

    // the loop is necessary within the generated quantities block
    for(i in 1:n){
        y_predict[i] = bernoulli_logit_rng(alpha + beta * x[i]);
    }
}
"
## -------------------------------------------------- ##


## simulate x1 and set the "true" population values alpha and beta
n <- 100
x <- rnorm(n,0,1)
alpha <- 1.25
beta <- 2.50

## systematic component of the model
xb <- alpha + beta * x

## transform the linear term xb using
## the inverse logit function
## so that eta is bound from 0 to 1
eta <- 1 / (1 + exp(-xb))

## generate the dependent variable y with probability eta and measurement error from a Bernoulli trial
y <- rbinom(n, size=1, prob=eta)

table(y)

## create data list
data_list <- list(y = y, x=x, n=n)

## fit logistic regression model with glm
summary(glm(y~x, family=binomial(link="logit")))

## fit stan model
fit <- stan(model_code = model, data = data_list, iter = 1000, chains = 4)

## extract draws from stan model object
output <- extract(fit, permuted = TRUE)

## print names
names(output)

## there are a number of methods to subset and summarize parameters
## keep in mind that the output object is a list that contains vectors or matrices of posterior estimates for each of the named parameters defined in the model statement above
## lapply (list-apply) a function to all of the objects in the list
lapply(output, mean)
lapply(output, sd)

## tabulate the simulated binary dependent variable; it should be very close to the mean value of the predicted y
table(y)

## create a matrix using some of the named slots in the list
model_parameters <- as.matrix(fit, pars = c("alpha", "beta"))
model_predictions <- as.matrix(fit, pars = "y_predict")

## check the dimensions (they should be the same)
dim(model_predictions)
dim(output$y_predict)

## plot the simulated y variable and the estimated posterior means
plot(apply(model_predictions,2,mean), y)
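## a hedged classification check (sketch): convert each observation's
## posterior predictive mean into a 0/1 prediction and cross-tabulate it
## with the simulated y
y_hat_prob <- apply(model_predictions, 2, mean)
table(observed=y, predicted=as.numeric(y_hat_prob > 0.5))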
--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day05_Measurement_RSTAN/RSTAN_mean_simulation.R:
--------------------------------------------------------------------------------
## RSTAN_mean_simulation.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: A Crash Course in Measurement and Latent Variable Modeling with RSTAN
##
## Please e-mail me if you find any errors or have any suggestions
## e-mail: cjf0006@gmail.com
##
##########################################################################
##
## Introduction to tutorial:
##
## For this R tutorial we will use the normal distribution in STAN to estimate the mean value for an observed variable y. The parameter for the mean will be selected based on the likelihood function that links it to the data contained in the y variable.
##
##########################################################################

## load library
library(rstan) # load rstan library
library(MASS) # load library with truehist function

## -------------------------------------------------- #
## define STAN model as a character
## -------------------------------------------------- #
model <- "
data {
    // declare the data in memory
    int n;
    vector[n] y;
}
parameters {
    // declare the parameters in memory
    real mu;
    real<lower=0> sigma;
}
model {
    // mu is given a standard normal prior below;
    // sigma has no prior statement, so by default its prior is flat unless we provide more information (see the other examples)
    // likelihood (link data to some combination of parameters and more data)

    mu ~ normal(0,1);

    for(i in 1:n){
        y[i] ~ normal(mu, sigma);
    }
}
generated quantities {
    // posterior predictions
    vector[n] y_predict;

    // the loop is necessary within the generated quantities block
    for(i in 1:n){
        y_predict[i] = normal_rng(mu, sigma);
    }
}
"
## -------------------------------------------------- #


## set data for simulation
y <- 1:5
y <- rep(1:5,200)

n <- length(y)
y
n

## create data list
data_list <- list(y = y, n=n)
data_list

## set time start variable
time1 <- Sys.time()

## fit stan model
fit <- stan(model_code = model, data = data_list, iter = 1000, chains = 4)

## calculate the duration of the program file up to this point
print(Sys.time() - time1)

## extract draws from stan model object (creates a list object)
output <- extract(fit, permuted = TRUE)

## print names of each element/slot in the list
names(output)

## print model fit object
fit

## lapply (list-apply) a function to all of the objects in the list
lapply(output, mean)
lapply(output, sd)

truehist(output$mu)
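## a quick sanity check (sketch): the posterior mean of mu should sit near
## the sample mean of y, with the normal(0,1) prior pulling it only slightly toward 0
abline(v=mean(y), col=2, lwd=2)
c(sample_mean=mean(y), posterior_mean=mean(output$mu))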
--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day05_Measurement_RSTAN/RSTAN_normal_distribution_simulation.R:
--------------------------------------------------------------------------------
## RSTAN_normal_distribution_simulation.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: A Crash Course in Measurement and Latent Variable Modeling with RSTAN
##
## Please e-mail me if you find any errors or have any suggestions
## e-mail: cjf0006@gmail.com
##
##########################################################################
##
## Introduction to tutorial:
##
## For this R tutorial we will draw random samples from the normal distribution using the STAN program.
## This program is equivalent to using the rnorm function in R.
##
##########################################################################


## load library
library(rstan) # load rstan library
library(MASS) # load library with truehist function


## -------------------------------------------------- #
## define STAN model
## -------------------------------------------------- #
model <- "

parameters {
    real mu;
}

model {
    mu ~ normal(0,1);
}
"
## -------------------------------------------------- #


## set time start variable
time1 <- Sys.time()

## fit stan model
fit <- stan(model_code = model, iter = 1000, chains = 4)

## calculate the duration of the program file up to this point
print(Sys.time() - time1)

## extract draws from stan model object (creates a list object)
output <- extract(fit, permuted = TRUE)

## print names of each element/slot in the list
names(output)

## print model fit object
fit

## there are a number of methods to subset and summarize parameters
## keep in mind that the output object is a list that contains vectors or matrices of posterior estimates for each of the named parameters defined in the model statement above
## lapply (list-apply) a function to all of the objects in the list
lapply(output, mean)
lapply(output, sd)

## create a matrix using some of the named slots in the list
model_parameters <- as.matrix(fit, pars = c("mu"))

## check the length of the extracted parameter vector
length(output$mu)

## make a nice plot
#par(mfrow=c(2,2))
truehist(output$mu)
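## since the STAN program above is equivalent to rnorm(), the draws should
## match the standard normal density; overlay it as a quick visual check (sketch)
curve(dnorm(x), from=-4, to=4, add=TRUE, col=2, lwd=2)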
--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day06_Text_as_data_programs/R_Demo_Ternary_Diriclet.R:
--------------------------------------------------------------------------------
## R_Demo_Ternary_Diriclet.R
library(Ternary)
library(MCMCpack)

value1 <- value2 <- value3 <- value4 <- value5 <- value6 <- matrix(NA, nrow=1000, ncol=3)

value1[,1:3] <- rdirichlet(1000, alpha=c(1,1,5))
value2[,1:3] <- rdirichlet(1000, alpha=c(1,5,1))
value3[,1:3] <- rdirichlet(1000, alpha=c(10,10,1))

value4[,1:3] <- rdirichlet(1000, alpha=c(1,1,1))
value5[,1:3] <- rdirichlet(1000, alpha=c(10,10,10))
value6[,1:3] <- rdirichlet(1000, alpha=c(100,100,100))

## coordinates within the simplex sum to 1
## (round first so that floating point error does not split the table into many cells)
table(round(apply(value4[,1:3],1,sum), 10))


## unicode for right arrow: "\u2192"
## unicode for left arrow: "\u2190"

TernaryPlot(alab="Redder \u2192", blab="\u2190 Greener", clab="Bluer\u2192",
    point='right', lab.cex=0.8, grid.minor.lines = 0,
    grid.lty='solid', col=rgb(0.9, 0.9, 0.9), grid.col='white',
    axis.col=rgb(0.6, 0.6, 0.6), ticks.col=rgb(0.6, 0.6, 0.6),
    padding=0.08, main="simplex")
TernaryPoints(value1, col=4)
TernaryPoints(value2, col=3)
TernaryPoints(value3, col=2)

TernaryPoints(value4, col=1)
TernaryPoints(value5, col=4)
TernaryPoints(value6, col=2)
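## a numerical companion to the plots (sketch): the expected value of a
## Dirichlet draw is alpha / sum(alpha), so the column means of many draws
## should be close to that ratio
colMeans(rdirichlet(10000, alpha=c(1,1,5)))
c(1,1,5) / sum(c(1,1,5))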
--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day06_Text_as_data_programs/R_Demo_text_as_data_DTM_stm_package.R:
--------------------------------------------------------------------------------
#### R_Demo_text_as_data_DTM_stm_package.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2021
##
## Date: 2021-07-24
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
##
## Introduction to tutorial:
##
## This tutorial replicates the processes demonstrated in the R_Demo_text_as_data_DTM.R file.
##
## This code uses functions available in the stm package, which build on functions from the tm package.
##
## The code is much, much faster at processing the text data, though some of the steps are difficult to learn from these functions alone.
##
##########################################################################


## load libraries
library(stm)
library(tm)
library(SnowballC)

## load data
data <- read.csv("SIMpoliticalTweets.txt", header=FALSE)
data
names(data) <- "text"
data

#trumptweets <- fromJSON("trump_json_files_20190707.txt")
#data <- trumptweets

##
#data <- data[1:1000,]


## preprocess the documents
## this function uses functions from the tm package (see the tm demo for more details)
## stem words and remove stop words
prep <- textProcessor(documents=data$text, meta=data)

## list attributes
attributes(prep)

## inspect
head(prep$documents)
head(prep$vocab)
head(prep$meta)
prep$docs.removed


## prepDocuments
## additional processing (removes some documents because of word frequencies greater than .99 or less than .01)
out <- prepDocuments(prep$documents, prep$vocab, prep$meta)
out

## inspect
head(out$documents)
head(out$vocab)
head(out$meta)

## fit a structural topic model
fit <- stm(documents=out$documents, vocab=out$vocab, data=out$meta, K=2)

## inspect attributes
attributes(fit)

dim(fit$theta)

## display topic probabilities
fit$theta

apply(fit$theta, 1, sum)

out$meta
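## a quick look inside the fitted topic model (sketch): labelTopics() prints
## the highest probability words associated with each of the K topics
labelTopics(fit)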
--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day06_Text_as_data_programs/R_Demo_text_as_data_DTM_tm_package.R:
--------------------------------------------------------------------------------
#### R_Demo_text_as_data_DTM_tm_package.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2021
##
## Date: 2021-07-24
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
##
## Introduction to tutorial:
##
## This tutorial replicates the processes demonstrated in the R_Demo_text_as_data_DTM.R file.
##
## This code uses functions available in the tm package.
##
## The code is much, much faster at processing the text data, though some of the steps are difficult to learn from these functions alone.
##
##########################################################################

## load libraries
library(stm)
library(tm)
library(SnowballC)

## load data
data <- read.csv("SIMpoliticalTweets.txt", header=FALSE)
data
names(data) <- "text"
data

#trumptweets <- fromJSON("trump_json_files_20190707.txt")
#data <- trumptweets

##
##data <- data[1:1000,]

## create character vector for processing
newtext <- as.character(data$text)
length(newtext)


## use gsub to remove special characters that usually cause errors if left for later
newtext <- gsub("[^0-9A-Za-z///' ]", "", newtext)
newtext <- gsub("[^[:alnum:]///' ]", "", newtext)
newtext <- gsub("[^\x20-\x7F\x0D\x0A]", "", newtext) # remove all non-ascii characters
newtext <- gsub("http.*", "", newtext) # replace all of the urls
newtext <- gsub("www.*", "", newtext) #

## data$newtext
data$newtext <- newtext

## convert to corpus object using additional functions from the tm package
## the tm_map function takes the corpus as its first argument and a transformation function as its second argument
corpus <- Corpus(VectorSource(newtext))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, content_transformer(tolower)) ## wrap base R functions in content_transformer() so the result stays a corpus object
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, stemDocument)

## print to screen
inspect(corpus[1:10])


## make document by term matrix
DTM <- DocumentTermMatrix(corpus)
DTM


## print DTM to screen
inspect(DTM)
inspect(DTM[1:10,1:25])
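## two small follow-ups with the tm API (sketch): list terms that appear at
## least twice across the documents, and peek at the first few terms
findFreqTerms(DTM, lowfreq=2)
head(Terms(DTM))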
--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day06_Text_as_data_programs/R_Demo_text_as_data_package_description.R:
--------------------------------------------------------------------------------
##
## text adapted from these references:
## https://hub.packtpub.com/9-useful-r-packages-for-nlp-text-mining/
## https://analyticsindiamag.com/top-10-r-packages-for-natural-language-processing-nlp/

## koRpus
## koRpus is an R package for analysing texts.
## It includes a diverse collection of functions for automatic language detection.
## It also includes indices of lexical diversity, such as type token ratio, MTLD, etc.
## koRpus also provides a plugin for R GUI as well as the IDE RKWard that assists in providing graphical dialogs for its basic features.

## lsa
## Latent Semantic Analysis or lsa is an R package that provides routines for performing a latent semantic analysis with R.
## The basic idea of this package is that text does have a higher-order or latent semantic structure which is obscured by word usage, e.g. through the use of synonyms or polysemy.

## OpenNLP
## OpenNLP provides an R interface to Apache OpenNLP, which is a collection of natural language processing tools written in Java.
## OpenNLP supports common natural language processing tasks such as tokenisation, sentence segmentation, part-of-speech tagging, named entity extraction, chunking, parsing and coreference resolution.

## Quanteda
## Quanteda is an R package for managing and analysing text.
## It is a fast, flexible, and comprehensive framework for quantitative text analysis in R.
## Quanteda provides functionality for corpus management, creating and manipulating tokens and ngrams, exploring keywords in context, forming and manipulating sparse matrices of documents by features and more.

## RWeka
## RWeka is an interface to Weka, which is a collection of machine learning algorithms for data mining tasks written in Java.
## It contains tools for data pre-processing, clustering, association rules, visualisation and more.
## This package contains the interface code; the Weka jar resides in a separate package called 'RWekajars'.

## Spacyr
## Spacyr is an R wrapper to the Python spaCy NLP library.
## The package is designed to provide easy access to the functionality of the spaCy library in a simple format.
## One of the easiest methods to install spaCy and spacyr is through the spacyr function spacy_install().

## Stringr
## Stringr is a consistent, simple and easy to use R package that provides consistent wrappers for the stringi package and therefore simplifies the manipulation of character strings in R.
## It includes a set of internally consistent tools for working with character strings, i.e. sequences of characters surrounded by quotation marks.

## Text2vec
## Text2vec is an R package which provides an efficient framework with a concise API for text analysis and natural language processing (NLP).
## Some of its important features include allowing users to easily solve complex tasks, maximise efficiency per single thread, transparently scale to multiple threads on multicore machines, use streams and iterators, among others.

## TM
## TM or Text Mining Package is a framework for text mining applications within R.
## The package provides a set of predefined sources, such as DirSource, DataframeSource, etc., which handle a directory, a vector interpreting each component as a document, or data-frame-like structures (such as CSV files), and more.

## Wordcloud
## Wordcloud is an R package that creates pretty word clouds, visualises differences and similarity between documents, and avoids overplotting in scatter plots with text.
## The word cloud is a commonly used plot to visualise a speech or set of documents in a clear way.

## STM
## STM is an R package that efficiently estimates the Latent Dirichlet Allocation model (LDA) and the related Structural Topic Model (STM).

## maxent
## maxent is an R package for low-memory multinomial logistic regression, also known as maximum entropy classification, which is often applied to sparse text term matrices.

## RKEA
## RKEA provides an R interface to KEA (Keyword Extraction Algorithm), a tool for extracting keywords and keyphrases from text.

## languageR
## languageR provides datasets and functions for the statistical analysis of linguistic data.

## RcmdrPlugin.temis
## RcmdrPlugin.temis is an R Commander plug-in that provides an integrated graphical solution for text mining tasks.

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day06_Text_as_data_programs/R_Demo_text_as_data_twitteR_get_twitter_users.R:
--------------------------------------------------------------------------------
## R_Demo_text_as_data_twitteR_get_twitter_users.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2021
##
## Date: 2021-07-24
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
## Introduction to tutorial:
##
## setup to access the twitter API using R
##
## Create a Twitter application at http://dev.twitter.com. Make sure to give the app read, write and direct message authority.
## Take note of the following values from the Twitter app page: "API key", "API secret", "Access token", and "Access token secret".

##########################################################################

#install.packages("twitteR")
library(twitteR)

## grab the most recent 100 tweets from Barack Obama
Obama_ut <- userTimeline('barackobama', n=100)

## grab the max number of tweets for the President of Senegal Macky Sall
Macky_Sall_ut <- userTimeline('Macky_Sall', n=3200)

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day06_Text_as_data_programs/R_Demo_text_as_data_twitteR_setup.R:
--------------------------------------------------------------------------------
## R_Demo_text_as_data_twitteR_setup.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2021
##
## Date: 2021-07-24
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
## Introduction to tutorial:
##
## setup to access the twitter API using R
##
##########################################################################

install.packages("twitteR")
library(twitteR)

## go to twitter's developer page: https://developer.twitter.com/
## click to create an App
## for the description, you can say something like this: "learn to use the twitteR API"
## once you have successfully set up your App, you will be able to get the four strings that you will fill in below

## set keys and tokens to access the twitter API
consumer_key <- "your_consumer_key"
consumer_secret <- "your_consumer_secret"
access_token <- "your_access_token"
access_secret <- "your_access_secret"

setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)

## the function can also be called with just the consumer key and secret
setup_twitter_oauth("API key", "API secret")
"API secret") 36 | -------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2021/Day06_Text_as_data_programs/R_Demo_text_as_wikip_stm.R: -------------------------------------------------------------------------------- 1 | ########################################################################## 2 | ## INSTRUCTOR: Christopher Fariss 3 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 4 | ## University of Essex Summer School 2021 5 | ## 6 | ## Date: 2021-07-24 7 | ## 8 | ## Please e-mail me if you find any errors or have and suggestions (either email is fine) 9 | ## e-mail: cjf0006@gmail.com 10 | ## e-mail: cjfariss@umich.edu 11 | ########################################################################## 12 | ## Introduction to tutorial: 13 | ## 14 | ## This tutorial demonstrates how the Structural Topic Model functions for a real set of documents. 15 | ## 16 | ## Each unit in the dataset representes a randomly selected sentence from wikipedia. 17 | ## 18 | ########################################################################## 19 | 20 | library(stm) 21 | 22 | wikip_word_dat <- readLines("one_meelyun_sentences.txt") 23 | head(wikip_word_dat) 24 | 25 | n <- length(wikip_word_dat) 26 | n 27 | 28 | wikip_word_dat <- data.frame(id=1:n, text=wikip_word_dat) 29 | head(wikip_word_dat) 30 | 31 | 32 | wikip_word_dat$human_rights_test <- as.numeric(grepl("human rights", wikip_word_dat$text)) 33 | table(wikip_word_dat$human_rights_test) 34 | 35 | wikip_word_dat$civil_rights_test <- as.numeric(grepl("civil rights", wikip_word_dat$text)) 36 | table(wikip_word_dat$civil_rights_test) 37 | 38 | ## preprocess the documents 39 | ## This function uses function from the tm package (see the tm Demo for more details) 40 | prep <- textProcessor(documents=wikip_word_dat$text, meta=wikip_word_dat) 41 | 42 | save(out, file="wikip_word_dat_stm_prep.Rdata") 43 | 44 | ## inspect 45 | head(prep$documents) 46 | head(prep$vocab) 47 | head(prep$meta) 48 | 49 | 50 | ## pre Documents 51 | ## stem words and remove stop words 52 | out <- prepDocuments(prep$documents, prep$vocab, prep$meta) 53 | 54 | save(out, file="wikip_word_dat_stm_out.Rdata") 55 | 56 | ## inspect 57 | head(out$documents) 58 | head(out$vocab) 59 | head(out$meta) 60 | 61 | ## fit a structural topic model 62 | #fit <- stm(documents=out$documents, vocab=out$vocab, data=out$meta, K=20) 63 | #fit <- stm(documents=out$documents, vocab=out$vocab, data=out$meta, K=40) 64 | fit <- stm(documents=out$documents, vocab=out$vocab, data=out$meta, K=100) 65 | 66 | #fit_2 <- fit 67 | #fit_3 <- fit 68 | #fit_10 <- fit 69 | #fit_20 <- fit 70 | fit_40 <- fit 71 | 72 | ## display topic probabilities 73 | head(fit$theta) 74 | 75 | dim(fit$theta) 76 | summary(head(fit$theta, 1000)) 77 | 78 | #save(fit, file="wikip_word_dat_stm20.Rdata") 79 | #save(fit, file="wikip_word_dat_stm40.Rdata") 80 | save(fit, file="wikip_word_dat_stm100.Rdata") 81 | 82 | 83 | -------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2021/Day07_Applied_Machine_Learning_AML/R_Demo_AML_neuralnet_gradiant_decent_glm.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_neuralnet_gradiant_decent_glm.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 5 | ## University of 
##
## Date: 2021-07-24
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
## Introduction to tutorial:
##
## Overview of gradient descent for a generalized linear model example.
##
## Gradient descent is an iterative process used to find the best parameter value(s) in any model and especially a neural network.
##
##########################################################################

## load libraries
library(MASS)
library(boot)
library(gtools)

## set the learning rate; this varies on the unit interval (0 to 1]
lr <- .2

n <- 100
x <- rnorm(n)

alpha <- -1
beta <- -2

y <- rbinom(n, 1, prob=inv.logit(alpha + beta*x))
#y <- rbinom(n, 1, prob=inv.logit(alpha + beta*x + rnorm(n)))

## Start with a random guess
X_mat <- cbind(1,x)

alpha_hat <- 4
beta_hat <- 4
iterations <- 1000

loss <- variance <- NA

y_hat <- matrix(NA, nrow=n, ncol=iterations)
y_error <- matrix(NA, nrow=n, ncol=iterations)
delta <- matrix(NA, nrow=2, ncol=iterations)

## sequential iterations to evaluate the loss function
for (j in 1:iterations){

    ## calculate the predicted y_hat based on the observed x variable and the best guess of alpha and beta
    y_hat[,j] <- inv.logit(alpha_hat[j] + beta_hat[j] * x)

    ## this works too but not as well as above
    #y_hat[,j] <- rbinom(n, 1, prob=inv.logit(alpha_hat[j] + beta_hat[j] * x))

    ## difference between the predicted y (y_hat) and y is the error for y
    y_error[,j] <- y_hat[,j] - y

    ## the estimated error is used to calculate the unexplained variance between y and y_hat, which is the sum of squared errors
    loss[j] <- sum(y_error[,j]^2)

    ## calculate the gradient at that point (this works because the errors are independent of the column vectors in X)
    ##delta[1:2,j] <- (t(X_mat) %*% y_error[,j]) * (1/n)
    delta[1,j] <- sum(X_mat[,1] * y_error[,j])/n
    delta[2,j] <- sum(X_mat[,2] * y_error[,j])/n

    ## shift estimates along the gradient (+ function if y-y_hat; - function if y_hat-y)
    alpha_hat[j+1] <- alpha_hat[j] - (lr*delta[1,j])
    beta_hat[j+1] <- beta_hat[j] - (lr*delta[2,j])
}

## estimate glm model
fit <- glm(y~x, family=binomial("logit"))
fit

## print the last value of the sequence of parameter estimates
alpha_hat[length(alpha_hat)]
beta_hat[length(beta_hat)]

## graph the values as a function of the loss statistic
ALPHA_seq <- seq(from=-4,4,.05)
BETA_seq <- seq(from=-4,4,.05)

LOSS <- matrix(NA, nrow=length(ALPHA_seq), ncol=length(BETA_seq))

## grid search for graphing
for(i in 1:length(ALPHA_seq)){
    for(j in 1:length(BETA_seq)){
        Y_hat <- inv.logit(ALPHA_seq[i] + BETA_seq[j] * X_mat[,2])
        LOSS[i,j] <- sum((y-Y_hat)^2)
    }
}

## plot the loss surface and the gradient-descent path
contour(ALPHA_seq, BETA_seq, LOSS, xlab=expression(hat(alpha)), ylab=expression(hat(beta)), cex.lab=1.5)
lines(alpha_hat, beta_hat, col=2, lwd=4)
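
## Added cross-check (not part of the original lesson): optim() can minimize the
## same sum-of-squared-errors loss directly, so its solution should land close to
## the final alpha_hat and beta_hat values from the gradient-descent loop above.
## This sketch assumes y, x, and inv.logit() (from the boot library) are still in memory.
sse_loss <- function(par, y, x){
    ## par[1] is the intercept (alpha), par[2] is the slope (beta)
    y_hat <- inv.logit(par[1] + par[2] * x)
    sum((y - y_hat)^2)
}

## start from the same initial guess used by the gradient-descent loop
optim_fit <- optim(par=c(4,4), fn=sse_loss, y=y, x=x)

## compare with alpha_hat and beta_hat from the final gradient-descent iteration above
optim_fit$par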

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day07_Applied_Machine_Learning_AML/R_Demo_AML_neuralnet_gradiant_decent_lm.R:
--------------------------------------------------------------------------------
#### R_Demo_AML_neuralnet_gradiant_decent_lm.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2021
##
## Date: 2021-07-24
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
## Introduction to tutorial:
##
## Overview of gradient descent for a linear model with two parameters (intercept and slope).
##
## Gradient descent is an iterative process used to find the best parameter value(s) in any model and especially a neural network.
##
##########################################################################

## load libraries
library(MASS)

## set the learning rate; this varies on the unit interval (0 to 1]
lr <- .2

n <- 100
x <- rnorm(n)

alpha <- -1
beta <- -2

y <- alpha + beta*x + rnorm(n)
plot(x=x, y=y)
fit <- lm(y ~ x)
summary(fit)

X_mat <- cbind(1,x)
head(X_mat)

## Start with a random guess
alpha_hat <- 1
beta_hat <- 4

iterations <- 100

loss <- NA
variance <- NA

y_hat <- matrix(NA, nrow=n, ncol=iterations)
y_error <- matrix(NA, nrow=n, ncol=iterations)
delta <- matrix(NA, nrow=2, ncol=iterations)

## sequential iterations to evaluate the loss function
for (j in 1:iterations){

    ## calculate the predicted y_hat based on the observed x variable and the best guess of alpha and beta
    y_hat[,j] <- alpha_hat[j] + beta_hat[j] * x

    ## difference between the predicted y (y_hat) and y is the error for y
    y_error[,j] <- y_hat[,j] - y

    ## the estimated error is used to calculate the unexplained variance between y and y_hat, which is the sum of squared errors
    loss[j] <- sum(y_error[,j]^2)

    ## calculate the gradient at that point (this works because the errors are independent of the column vectors in X)
    ##delta[1:2,j] <- (t(X_mat) %*% y_error[,j]) * (1/n)
    delta[1,j] <- sum(X_mat[,1] * y_error[,j])/n
    delta[2,j] <- sum(X_mat[,2] * y_error[,j])/n

    ## shift estimates along the gradient (+ function if y-y_hat; - function if y_hat-y)
    alpha_hat[j+1] <- alpha_hat[j] - (lr*delta[1,j])
    beta_hat[j+1] <- beta_hat[j] - (lr*delta[2,j])
}

## print lm summary from above
summary(fit)

## print the last value of the sequence of parameter estimates
alpha_hat[length(alpha_hat)]
beta_hat[length(beta_hat)]

## the least squares solution
solve(t(X_mat) %*% X_mat) %*% t(X_mat) %*% y

## note that these give distinct regression models because the covariance between alpha and beta is not included (so they are different from the one above)
solve(t(X_mat[,2]) %*% X_mat[,2]) %*% t(X_mat[,2]) %*% y
solve(t(X_mat[,1]) %*% X_mat[,1]) %*% t(X_mat[,1]) %*% y

## graph the values as a function of the loss statistic
ALPHA_seq <- seq(from=-4,4,.05)
BETA_seq <- seq(from=-4,4,.05)

LOSS <- matrix(NA, nrow=length(ALPHA_seq), ncol=length(BETA_seq))

## grid search for graphing
for(i in 1:length(ALPHA_seq)){
    for(j in 1:length(BETA_seq)){
        Y_hat <- ALPHA_seq[i] + BETA_seq[j] * X_mat[,2]
        LOSS[i,j] <- sum((y-Y_hat)^2)
    }
}

## plot the log of the loss surface and the gradient-descent path
par(mar=c(5,5,1,1))
contour(ALPHA_seq, BETA_seq, log(LOSS), xlab=expression(hat(alpha)), ylab=expression(hat(beta)), cex.lab=1.5)
lines(alpha_hat, beta_hat, col=2, lwd=4)

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day07_Applied_Machine_Learning_AML/R_Demo_AML_neuralnet_gradiant_decent_mu.R:
--------------------------------------------------------------------------------
## R_Demo_AML_neuralnet_gradiant_decent_mu.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2021
##
## Date: 2021-07-24
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
## Introduction to tutorial:
##
## Overview of gradient descent for a model with one parameter for a mean.
##
## Gradient descent is an iterative process used to find the best parameter value(s) in any model and especially a neural network.
##
##########################################################################

## load libraries
library(MASS)

## set the learning rate; this varies on the unit interval (0 to 1]
lr <- .2
#lr <- .01
#lr <- 1

y <- c(1,2,3,4,5)
##y <- rnorm(1000, pi, 1)
n <- length(y)

y
n

iterations <- 100

loss <- NA

mu_hat <- -1.5

y_hat <- matrix(NA, nrow=n, ncol=iterations)
y_error <- matrix(NA, nrow=n, ncol=iterations)

delta_mu_hat <- NA

## sequential iterations to evaluate the loss function
for (j in 1:iterations){

    ## y_hat: calculate the predicted y_hat based on the best guess of mu
    y_hat[,j] <- mu_hat[j]

    ## residual: the difference between the predicted y (y_hat) and y is the error for y
    y_error[,j] <- y_hat[,j] - y

    ## loss: the estimated error is used to calculate the unexplained variance between y and y_hat, which is the sum of squared errors
    loss[j] <- sum(y_error[,j]^2)

    ## the average of the unexplained differences gives the gradient at that point
    delta_mu_hat[j] <- sum(y_error[,j])/n

    ## shift estimates along the gradient (+ function if y-y_hat; - function if y_hat-y)
    mu_hat[j+1] <- mu_hat[j] - (lr*delta_mu_hat[j])
}

mu_hat[length(mu_hat)]

plot(mu_hat)

plot(mu_hat[-1], delta_mu_hat)

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day08_Model_Evaluation/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/Archive/R-Programs-lessons-2021/Day08_Model_Evaluation/.DS_Store

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2022/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/Archive/R-Programs-lessons-2022/.DS_Store

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2022/Day01_Intro_programs/.Rhistory:
--------------------------------------------------------------------------------
## what is the current working directory?
getwd()
## set the working directory to an object using the assignment operator <- or = (more on this later)
wd <- getwd()
## print to screen
wd
## if you remember the function name use a single question mark
?rm

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2022/Day01_Intro_programs/R_Demo_Intro_Program_Challenge_two_sum.R:
--------------------------------------------------------------------------------
## R_Demo_Intro_Program_Challenge_two_sum.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2022
##
## Date: 2022-08-08
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
## Instructions:
##
## These challenges are meant to be just that, challenging.
## They should also be fun. I encourage you to think creatively and collaboratively.
## Getting stuck or not finishing all the steps is expected and encouraged.
## This is how learning works!
## Learn to program, program to learn.
##
## Always start with step (1) and then continue to each step as time permits.
## Don't worry about completing each step. Document your code for each step.
## You may wish to come back to some of the harder steps as you progress through the course.
## Note that some of the steps may ask you to use skills we have not yet covered in the course.
## Don't worry about these steps now but definitely think through the programming logic if you are stuck and make plans to come back to try them once you feel ready.
##
##########################################################################
##
## Steps for the Challenge
##
## (1) create a vector of integer numbers and a scalar integer target
## (2) write a program in R that determines (returns) two numbers from the vector that add up to the target scalar
## (3) how many combinations of numbers in the vector of integers add up to the target scalar?
## (4) write a function to complete steps 1-3 (hint: wrap the program from (3) within a function)
## (5) write a simulation that explores the relationship between (a) the vector of integer numbers and (b) the scalar integer target sum
## (6) re-write the program or function so that it takes fewer steps to calculate the number of numeric combinations that add up to the scalar target
##
##########################################################################

## (1) create a vector of integer numbers and a scalar integer target
x <- c(1,2,3,4,5)
y <- 5

## (2) write a program in R that determines (returns) two numbers from the vector that add up to the target scalar

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2022/Day02_Data_Analysis_Managment/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/Archive/R-Programs-lessons-2022/Day02_Data_Analysis_Managment/.DS_Store

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2022/Day02_Data_Analysis_Managment/R_Demo_Intro_forloop_vs_lapply.R:
--------------------------------------------------------------------------------
## R_Demo_Intro_forloop_vs_lapply.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2022
##
## Date: 2022-08-08
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
## Introduction to tutorial:
##
## in-class coding of loops and lapply (see also the timing sketch at the end of this file)

1:10

length(1:10)

## for loop version
output <- c()
for(i in 1:10){
    local_var <- i*i ## the simple but core calculation of the iterative process
    output[i] <- local_var
}
output

## lapply version
output <- lapply(1:10, function(i){
    local_var <- i*i ## the simple but core calculation of the iterative process
    return(local_var)
})
unlist(output)

## nested for loop version
mat <- matrix(NA, nrow=10, ncol=5)
mat

for(i in 1:10){
    for(j in 1:5){
        local_var <- i*j
        mat[i,j] <- local_var
    }
}
mat

## lapply version with an inner for loop (replicates the nested for loop structure above)
mat_list <- lapply(1:10, function(i){
    local_vec <- c()
    for(j in 1:5){
        local_var <- i*j
        local_vec[j] <- local_var
    }
    return(local_vec)
})

unlist(mat_list)
mat <- matrix(unlist(mat_list), nrow=10, ncol=5, byrow=T)
mat

## nested lapply version (replicates both versions above)
mat_list <- lapply(1:10, function(i){
    local_out <- lapply(1:5, function(j){
        temp <- i*j
        return(temp)
    })
    return(local_out)
})

mat_list
unlist(mat_list)
mat <- matrix(unlist(mat_list), nrow=10, ncol=5, byrow=T)
mat
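
## Added timing sketch (not part of the original lesson): system.time() gives a
## rough sense of how the for loop and lapply() versions above compare on a larger
## input. Exact timings vary by machine and by R version.
n_big <- 100000

## for loop version (note: output grows inside the loop, which slows it down)
system.time({
    output <- c()
    for(i in 1:n_big){
        output[i] <- i*i
    }
})

## lapply version
system.time({
    output <- unlist(lapply(1:n_big, function(i) i*i))
})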

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2022/Day03_Simulation_Learning/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/Archive/R-Programs-lessons-2022/Day03_Simulation_Learning/.DS_Store

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2022/Day03_Simulation_Learning/R_Demo_Simulation_Learning_Dice_Rolls.R:
--------------------------------------------------------------------------------
## R_Demo_Simulation_Learning_Dice_Rolls.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2022
##
## Date: 2022-08-28
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##
##########################################################################
## For this R tutorial, we will learn how to:
##
## (1) Simulate the roll of a D6 die.
## (2) Repeatedly simulate the roll of a D6 die and see how close the average value is to the expected value.
## (3) Learn about the central limit theorem from the simulation.
##
## Notes: The Central Limit Theorem (CLT) establishes that when independently generated variables (iid: independent and identically distributed random variables) are added together, the sums or averages of these variables (when normalized) converge towards a normal distribution.
##
## This property emerges even if the original variables are not individually normally distributed, as with the roll of a die.
##
## The probability of any one value from a single roll of a die is equivalent to the probability of any other value for the same-sided die in the limit (when the number of rolls approaches infinity).
##
##########################################################################

library(MASS)

## simulate 20 randomly generated rolls from a D6 (6-sided die)
sample(1:6, size=20, replace=TRUE)

## true mean is 3.5
(1 + 2 + 3 + 4 + 5 + 6) / 6

## or
mean(1:6)

## true variance is approximately 2.916667 or exactly 70/24
(1 - 3.5)^2 * (1/6) + (2 - 3.5)^2 * (1/6) + (3 - 3.5)^2 * (1/6) + (4 - 3.5)^2 * (1/6) + (5 - 3.5)^2 * (1/6) + (6 - 3.5)^2 * (1/6)

## or
sum((1:6 - mean(1:6))^2 * (1/6))

## repeat the simulation n_sims times (2,000 here) and calculate the average
n_sims <- 2000

## number of samples to roll each iteration
n_samples <- 10

## create two objects to hold the calculated mean and variance from each simulated sample
sim_mean_values <- c()
sim_var_values <- c()

## iterate/repeat the simulation n_sims times
for(i in 1:n_sims){

    ## create output
    sample_output <- sample(1:6, size=n_samples, replace=TRUE)

    ## save the output in the i_th position of the objects
    sim_mean_values[i] <- mean(sample_output)
    sim_var_values[i] <- var(sample_output)

}

## calculate the mean and variance of the n_sims sample means
mean(sim_mean_values)
mean(sim_var_values)

## set graphical parameters
par(mfrow=c(1,2), mar=c(4,3,1,1))

## plot histograms
truehist(sim_mean_values, main="Mean Estimate")
truehist(sim_var_values, main="Variance Estimate")

## calculate and plot the converging average using increasing sample sizes starting at 1 and ending at all the samples
## set graphical parameters
par(mfrow=c(1,2))

## running mean of the sample means
plot(0,0, ylim=c(2.5,4), xlim=c(0,n_sims), type="n", main="Mean Estimate")
value <- c()
for(i in 1:n_sims){
    value[i] <- mean(sim_mean_values[1:i])
}
lines(value)
abline(h=3.5, col="orange", lwd=2, lty=2)

## running mean of the sample variances
plot(0,0, ylim=c(2.5,4), xlim=c(0,n_sims), type="n", main="Variance Estimate")
value <- c()
for(i in 1:n_sims){
    value[i] <- mean(sim_var_values[1:i])
}
lines(value)
abline(h=2.916667, col="orange", lwd=2, lty=2)

sqrt(2.916667)

## the values converge towards normality
summary((sim_mean_values - 3.5))
mean((sim_mean_values - 3.5))
var((sim_mean_values - 3.5))

var(sim_mean_values)
var(sim_var_values)

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2022/Day03_Simulation_Learning/R_Demo_Simulation_Learning_ProbCalc_BayesRule.R:
--------------------------------------------------------------------------------
## R_Demo_Simulation_Learning_ProbCalc_BayesRule.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2022
##
## Date: 2022-08-28
##
## Please e-mail me if you find any errors or have any suggestions
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
## Introduction to tutorial:
##
## Overview of Bayes rule and the calculation of conditional probability. Introduces the naive Bayes classifier function from the e1071 library.
##
## P(outcome | evidence) = P(outcome) * P(evidence | outcome) / P(evidence)
##
## Below is Bayes' Theorem:
## P(A | B) = P(A) * P(B | A) / P(B)
##
## Which can be derived from the general multiplication formula for AND events:
## P(A and B) = P(A) * P(B | A)
## P(B | A) = P(A and B) / P(A)
## P(B | A) = P(B) * P(A | B) / P(A)
## P(y|x) = P(x|y) * P(y) / P(x)
## P(x|y) = P(x AND y) / P(y)
##
## Pr(A[1]) = Pr(y==0)
## Pr(A[2]) = Pr(y==1)
## Pr(B | A[1]) = Pr(Data | y==0)
## Pr(B | A[2]) = Pr(Data | y==1)
##
##########################################################################

A <- c(0,1,0,1)
B <- c(0,0,0,1)

ProbCalc <- function(A,B){

    len <- length(A)
    data <- as.data.frame(cbind(A,B, A*B, A+B-A*B))
    names(data) <- c("A", "B", "A and B", "A or B")

    pr.A <- sum(A)/len
    pr.B <- sum(B)/len
    pr.NotA <- 1 - pr.A
    pr.NotB <- 1 - pr.B
    pr.A.B <- sum(A*B)/len
    pr.NotA.B <- sum(abs(A-1)*B)/len
    pr.A.NotB <- sum(A*abs(B-1))/len
    pr.NotA.NotB <- sum(abs(A-1)*abs(B-1))/len

    pr.A.or.B <- pr.A + pr.B - pr.A.B

    pr.A.condB <- pr.A.B / pr.B
    pr.B.condA <- pr.A.B / pr.A

    out <- list(data=data, pr.A=pr.A, pr.B=pr.B, pr.NotA=pr.NotA, pr.NotB=pr.NotB, pr.A.B=pr.A.B, pr.NotA.B=pr.NotA.B, pr.A.NotB=pr.A.NotB, pr.NotA.NotB=pr.NotA.NotB, pr.A.or.B= pr.A.or.B, pr.A.condB=pr.A.condB, pr.B.condA=pr.B.condA)
    return(out)

}

ProbCalc(A,B)

## note: indexing the table with the logical vectors A==1, B==1, etc. works in this specific example only because the length and ordering of A and B happen to line up with the cells of the 2x2 table
ProbTabCalc <- function(A,B){

    tab <- table(A,B)/length(A)

    pr.A <- sum(tab[A==1])
    pr.B <- sum(tab[B==1])
    pr.NotA <- sum(tab[A==0])
    pr.NotB <- sum(tab[B==0])
    pr.A.B <- sum(tab[A==1 & B==1])
    pr.NotA.B <- sum(tab[A==0 & B==1])
    pr.A.NotB <- sum(tab[A==1 & B==0])
    pr.NotA.NotB <- sum(tab[A==0 & B==0])

    pr.A.or.B <- sum(tab[A==1 | B==1])

    pr.A.condB <- pr.A.B / pr.B
    pr.B.condA <- pr.A.B / pr.A

    out <- list(table=tab, pr.A=pr.A, pr.B=pr.B, pr.NotA=pr.NotA, pr.NotB=pr.NotB, pr.A.B=pr.A.B, pr.NotA.B=pr.NotA.B, pr.A.NotB=pr.A.NotB, pr.NotA.NotB=pr.NotA.NotB, pr.A.or.B= pr.A.or.B, pr.A.condB=pr.A.condB, pr.B.condA=pr.B.condA)
    return(out)

}

ProbTabCalc(A,B)

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2022/Day04_Simulation_Inference/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/Archive/R-Programs-lessons-2022/Day04_Simulation_Inference/.DS_Store

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2022/Day04_Simulation_Inference/R_Demo_Simulation_Inference_10_Fold_Cross_Validation.R:
--------------------------------------------------------------------------------
#### R_Demo_Simulation_Inference_10_Fold_Cross_Validation.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2022
##
## Date: 2022-08-28
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
##
## Goal: Improve the predictive power or predictive validity of a model when applied to new observed values
##
##########################################################################
## Introduction to tutorial:
##
## For this R tutorial we will simulate a dataset and then randomly divide it into ten subsets.
##
## We will then fit a model using the observations from nine of the ten subsets of data (training data) and then use the model estimates to predict the value of the dependent variable for the remaining out-of-sample subset (test data).
##
## After that step, we will rotate which subset is held out, so that each of the ten subsets serves once as the test data.
##
## In this way, every observation in the data will be predicted using a model estimated from other data not used in the fitting.
##
## This process is increasingly common and required in almost all Machine Learning and predictive tasks in data science and increasingly so in the social sciences.
##
##########################################################################

#rm(list = ls())

set.seed(940)

## set number of observations for simulation
n <- 100

## number of folds (randomly created sub-samples of data)
k <- 10

## simulation of variables
x <- sample(4:14,n,replace=TRUE)
y <- -5.996 + 2.781*x -0.127*x^2 + rnorm(n,0,1)
#y <- -5.996 + 2.781*x -0.127*x^2 + rnorm(n,0,2)

plot(x,y)

## create a fold ID variable with one value for each unit
## here the indicator takes on the k-fold values {1,...,k}
folds <- sample(rep(1:k, k), n, replace=FALSE)

table(folds)

## create a data frame with the dependent variable, independent variable, and randomly created ID
dat <- data.frame(y, x, folds)

## create vectors for storing predictions
dat$y.hat0 <- NA
dat$y.hat1 <- NA
dat$y.hat2 <- NA

#test <- matrix(NA, nrow=k, ncol=2)

## loop over the k folds: fit each model using the other k-1 folds and predict the held-out fold
for(i in 1:k){

    ## fit an intercept-only linear model
    fit0 <- lm(y ~ 1, data=subset(dat, folds!=i))
    pred0 <- predict(fit0, newdata=subset(dat, folds==i))
    y.hat0 <- as.numeric(pred0)

    dat$y.hat0[dat$folds==i] <- y.hat0

    ## fit a linear model
    fit1 <- lm(y ~ x, data=subset(dat, folds!=i))
    pred1 <- predict(fit1, newdata=subset(dat, folds==i))
    y.hat1 <- as.numeric(pred1)

    dat$y.hat1[dat$folds==i] <- y.hat1

    ## fit a linear model with a squared term
    fit2 <- lm(y ~ x + I(x^2), data=subset(dat, folds!=i))
    pred2 <- predict(fit2, newdata=subset(dat, folds==i))
    y.hat2 <- as.numeric(pred2)

    dat$y.hat2[dat$folds==i] <- y.hat2

    print(summary(dat))
}

rmse.fit0 <- sqrt(mean((dat$y.hat0-dat$y)^2))
rmse.fit0

rmse.fit1 <- sqrt(mean((dat$y.hat1-dat$y)^2))
rmse.fit1

rmse.fit2 <- sqrt(mean((dat$y.hat2-dat$y)^2))
rmse.fit2

c(rmse.fit0, rmse.fit1, rmse.fit2)

cor.fit0 <- cor(dat$y.hat0, dat$y, method="spearman")
cor.fit0

cor.fit1 <- cor(dat$y.hat1, dat$y, method="spearman")
cor.fit1

cor.fit2 <- cor(dat$y.hat2, dat$y, method="spearman")
cor.fit2
c(cor.fit0, cor.fit1, cor.fit2)

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2022/Day04_Simulation_Inference/R_Demo_Simulation_Inference_2_Sample_Hold_Out.R:
--------------------------------------------------------------------------------
#### R_Demo_Simulation_Inference_2_Sample_Hold_Out.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2022
##
## Date: 2022-08-28
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
##
## Goal: Improve the predictive power or predictive validity of a model
##
##########################################################################
## Introduction to tutorial:
##
## (1) Begin building towards and learning about cross-validation
## (NOTE: There is no "crossing" yet)
##
## For this R tutorial we will simulate a dataset and then randomly divide it into two subsets.
##
## We will fit a model using the observations from one of the subsets of data (training data).
##
## We will then use the model estimates to predict the value of the dependent variable for the remaining out-of-sample data subset (testing data).
##
##########################################################################

set.seed(940)

## set number of observations for simulation
n <- 100

## simulation of variables (this model is based on one of the datasets from Anscombe's quartet)
x <- sample(4:14,n,replace=TRUE)
#y <- -5.996 + 2.781*x -0.127*x^2 + rnorm(n,0,1)
y <- -5.996 + 2.781*x -0.127*x^2 + rnorm(n,0,2)

## plot the simulated relationship
par(mfrow=c(1,1))
plot(x=x, y=y)

## create a subject/unit ID variable with one value for each unit
## here the indicator takes on the 2-fold values {1,2}
folds <- sample(rep(1:2, n/2), n, replace=FALSE)
folds

table(folds)

## create a data frame with the dependent variable, independent variable, and randomly created ID
dat <- data.frame(y, x, folds)

summary(dat)

head(dat)

## fit a linear model to the full dataset
model <- lm(y ~ x, data=dat)
summary(model)

## subset the full dataset into two subsets based on the ID variable
train <- subset(dat, folds==1)
test <- subset(dat, folds==2)

## equivalently, using bracket indexing
train <- dat[dat$folds==1,]
test <- dat[dat$folds==2,]

nrow(train)
nrow(test)

## Model 0: fit an intercept-only linear model
fit <- lm(y ~ 1, data=train)
pred <- predict(fit, newdata=test)
rmse <- sqrt(mean((as.numeric(pred)-test$y)^2))
rmse

## Model 1: fit a linear model
fit <- lm(y ~ x, data=train)
pred <- predict(fit, newdata=test)
rmse <- sqrt(mean((as.numeric(pred)-test$y)^2))
rmse

## Model 2: fit a linear model with a squared term
fit <- lm(y ~ x + I(x^2), data=train)
pred <- predict(fit, newdata=test)
rmse <- sqrt(mean((as.numeric(pred)-test$y)^2))
rmse

##################################################
## Model 2 again: fit the linear model with a squared term, but predict in-sample on the training data (compare this RMSE to the out-of-sample RMSE above)
fit <- lm(y ~ x + I(x^2), data=train)
pred <- predict(fit, newdata=train)
rmse <- sqrt(mean((as.numeric(pred)-train$y)^2))
rmse

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2022/Day04_Simulation_Inference/R_Demo_Simulation_Inference_naive_Bayes.R:
--------------------------------------------------------------------------------
#### R_Demo_Simulation_Inference_naive_Bayes.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2022
##
## Date: 2022-08-28
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
## Introduction to tutorial:
##
## Overview of Bayes rule and the calculation of conditional probability. Introduces the naive Bayes classifier function from the e1071 library.
##
## P(outcome | evidence) = P(outcome) * P(evidence | outcome) / P(evidence)
##
## Below is Bayes' Theorem:
## P(A | B) = P(A) * P(B | A) / P(B)
##
## Which can be derived from the general multiplication formula for AND events:
## P(A and B) = P(A) * P(B | A)
## P(B | A) = P(A and B) / P(A)
## P(B | A) = P(B) * P(A | B) / P(A)
## P(y|x) = P(x|y) * P(y) / P(x)
## P(x|y) = P(x AND y) / P(y)
##
## Pr(A[1]) = Pr(y==0)
## Pr(A[2]) = Pr(y==1)
## Pr(B | A[1]) = Pr(Data | y==0)
## Pr(B | A[2]) = Pr(Data | y==1)
##
##########################################################################

## load libraries
library(e1071)
library(LaplacesDemon)

## example code from the BayesTheorem() function
PrA <- c(0.75,0.25)
PrBA <- c(6/9, 5/7)
BayesTheorem(PrA, PrBA)

## create fake data
n <- 10
x <- c(rep(0,n/2), rep(1,n/2))
y <- c(0,0,0,1,1,0,0,1,1,1)

## inspect data
cbind(y,x)

## inspect tabulation of data
table(y,x)

## calculate the probability of the evidence/data
## note: Pr(x==1) is the denominator for both outcome classes, so the same value is stored twice
PrX <- NA
PrX[1] <- sum(as.numeric(x==1)) / n
PrX[2] <- sum(as.numeric(x==1)) / n

## calculate the probability of the outcome
PrY <- NA
PrY[1] <- sum(as.numeric(y==0))/n
PrY[2] <- sum(as.numeric(y==1))/n
PrY

## calculate the probability of the data conditional on the value of y (the likelihood)
PrXY <- NA
PrXY[1] <- sum(x[y==0])/length(as.numeric(x[y==0]))
PrXY[2] <- sum(x[y==1])/length(as.numeric(x[y==1]))
PrXY

## apply Bayes Rule
PrXY * PrY / PrX

## apply Bayes Rule with the BayesTheorem() function
BayesTheorem(PrA=PrY, PrBA=PrXY)

## apply Bayes Rule with the naiveBayes() function
fit <- naiveBayes(y~x, data=data.frame(y,x))
fit

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2022/Day04_Simulation_Inference/R_Demo_Simulation_Inference_naive_Bayes_10_Fold_Cross_Validation.R:
--------------------------------------------------------------------------------
#### R_Demo_Simulation_Inference_naive_Bayes_10_Fold_Cross_Validation.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2022
##
## Date: 2022-08-28
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
## Introduction to tutorial:
##
## (1) Program simulates count data that is then used to predict a binary outcome variable.
##
## (2) Three models are evaluated using the count data to predict the outcome:
## (2a) linear model
## (2b) generalized linear model with a logit link function
## (2c) naive Bayes classifier.
##
##########################################################################

## load library
library(e1071)

## simulate x1 and set the "true" population values alpha and beta
n <- 100

## unobserved
x <- runif(n,0,1)

## observed counts
x1 <- rpois(n, lambda=x)
x2 <- rpois(n, lambda=x)
x3 <- rpois(n, lambda=2*x)
x4 <- rpois(n, lambda=2*x)
x5 <- rpois(n, lambda=4*x)

## systematic component of the model based on observed counts
xb <- -2 + x1 + x2 + x3 + x4 + x5

## transform the linear term xb using
## the inverse logit function
## so that pi is bound from 0 to 1
pi <- 1 / (1 + exp(-xb))

## generate the dependent variable y with probability pi and measurement error from a Bernoulli trial
y <- rbinom(n, size=1, prob=pi)

## make data frame
dat <- data.frame(y, x1, x2, x3, x4, x5)

## summarize fit using linear model
summary(lm(y ~ x1 + x2 + x3 + x4 + x5, data=dat))

## summarize fit using glm using the logit link function
summary(glm(y ~ x1 + x2 + x3 + x4 + x5, family=binomial(link="logit")))

## summarize fit using naiveBayes model
naiveBayes(as.factor(y) ~ x1 + x2 + x3 + x4 + x5, data=dat)

## create vectors for storing predictions
dat$y.hat1 <- NA
dat$y.hat2 <- NA
dat$y.hat3 <- NA

## select number of folds
k <- 10

## create vector of folds for cross validation
dat$folds <- sample(rep(1:k, k), n, replace=FALSE)

## loop over the k folds: fit each model using the other k-1 folds and predict the held-out fold
for(i in 1:k){

    ## fit a linear model
    fit1 <- lm(y ~ x1 + x2 + x3 + x4 + x5, data=subset(dat, folds!=i))
    pred1 <- predict(fit1, newdata=subset(dat, folds==i))
    y.hat1 <- as.numeric(pred1)

    dat$y.hat1[dat$folds==i] <- y.hat1

    ## fit a glm model
    fit2 <- glm(y ~ x1 + x2 + x3 + x4 + x5, binomial(link="logit"), data=subset(dat, folds!=i))
    ## predict on the response (probability) scale so the predictions are comparable to y
    pred2 <- predict(fit2, newdata=subset(dat, folds==i), type="response")
    y.hat2 <- as.numeric(pred2)

    dat$y.hat2[dat$folds==i] <- y.hat2

    ## fit a naiveBayes classifier model
    fit3 <- naiveBayes(as.factor(y) ~ x1 + x2 + x3 + x4 + x5, data=subset(dat, folds!=i))
    pred3 <- predict(fit3, newdata=subset(dat, folds==i))
    ## convert the factor predictions back to 0/1 values (as.numeric alone would return the factor level codes 1/2)
    y.hat3 <- as.numeric(as.character(pred3))

    dat$y.hat3[dat$folds==i] <- y.hat3

    #print(summary(dat))
}

rmse.fit1 <- sqrt(mean((dat$y.hat1-dat$y)^2))
rmse.fit1
cor.fit1 <- cor(dat$y.hat1, dat$y, method="spearman")

rmse.fit2 <- sqrt(mean((dat$y.hat2-dat$y)^2))
rmse.fit2

cor.fit2 <- cor(dat$y.hat2, dat$y, method="spearman")

rmse.fit3 <- sqrt(mean((dat$y.hat3-dat$y)^2))
rmse.fit3

cor.fit3 <- cor(dat$y.hat3, dat$y, method="spearman")

c(cor.fit1, cor.fit2, cor.fit3)

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2022/Day05_Measurement_RSTAN/RSTAN_linear_model_simulation.R:
--------------------------------------------------------------------------------
## RSTAN_linear_model_simulation.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: A Crash Course in Measurement and Latent Variable Modeling with RSTAN
##
## Please e-mail me if you find any errors or have any suggestions
## e-mail: cjf0006@gmail.com
##
##########################################################################
##
## Introduction to tutorial:
##
## For this R tutorial we will estimate a linear model using the STAN program. The parameters will be estimated based on the likelihood function that links them to the data contained in the y and x variables, which are simulated observed data. The model produces the slope and intercept from a standard linear model, which is also estimated using lm() in R.
##
##########################################################################

## load library
library(rstan) # load rstan library
library(MASS) # load library with truehist function

## -------------------------------------------------- #
## define STAN model as a character
## -------------------------------------------------- #
model <- "
data {
    // declare the data in memory
    int n;
    vector[n] y;
    vector[n] x;
}
parameters {
    // declare the parameters in memory
    real alpha;
    real beta;
    real sigma;
}
model {
    // priors (the second argument of normal() is a standard deviation, not a precision)
    alpha ~ normal(0,10);
    beta ~ normal(0,10);

    // likelihood (link data to some combination of parameters and more data)
    for(i in 1:n){
        y[i] ~ normal(alpha + beta * x[i], sigma);
    }
}
generated quantities {
    // posterior predictions
    vector[n] y_predict;

    // the loop is necessary within the generated quantities block
    for(i in 1:n){
        y_predict[i] = normal_rng(alpha + beta * x[i], sigma);
    }
}
"
## -------------------------------------------------- #

## set data for simulation
n <- 100
x <- rnorm(n,0,1)
alpha <- 1.25
beta <- 2.50

## simulate a dependent variable with normally distributed error using the data and parameter values defined above
error <- rnorm(n)
y <- alpha + beta * x + error

plot(x=x, y=y)

## fit linear model
summary(lm(y~x))

## create data list
data_list <- list(y = y, x=x, n=n)

## set time start variable
time1 <- Sys.time()

## fit stan model
fit <- stan(model_code = model, data = data_list, iter = 1000, chains = 4)

## calculate the duration of the program file up to this point
print(Sys.time() - time1)

## extract draws from stan model object (creates a list object)
output <- extract(fit, permuted = TRUE)

## print names of each element/slot in the list
names(output)

## print model fit object
fit

## there are a number of methods to subset and summarize parameters
## keep in mind that the output object is a list that contains vectors or matrices of posterior estimates for each of the named parameters defined in the model statement above
## lapply (list-apply) a function to all of the objects in the list
lapply(output, mean)[1:3]
lapply(output, sd)[1:3]

## create a matrix using some of the named slots in the list
model_parameters <- as.matrix(fit, pars = c("alpha", "beta", "sigma"))
model_predictions <- as.matrix(fit, pars = "y_predict")

## check the dimensions (they should be the same)
dim(model_predictions)
dim(output$y_predict)

## plot the simulated y variable and the estimated posterior means
plot(y, apply(model_predictions,2,mean))

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2022/Day05_Measurement_RSTAN/RSTAN_logistic_regression_simulation.R:
--------------------------------------------------------------------------------
## RSTAN_logistic_regression_simulation.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: A Crash Course in Measurement and Latent Variable Modeling with RSTAN
##
## Please e-mail me if you find any errors or have any suggestions
## e-mail: cjf0006@gmail.com
##
##########################################################################
##
## Introduction to tutorial:
##
## For this R tutorial we will simulate a binary dependent variable and then estimate the parameters that generate the variable. The parameters will be estimated based on the likelihood function that links them to the data contained in the y and x variables, which are simulated observed data. The model produces the slope and intercept from a standard logistic regression model, which is also estimated using glm() in R.
##
##########################################################################

## load library
library(rstan) # load rstan library
library(MASS) # load library with truehist function

## -------------------------------------------------- ##
## define STAN model
## -------------------------------------------------- ##
model <- "
data {
    // declare the data in memory
    int n;
    int y[n];
    vector[n] x;
}
// declare the parameters in memory
parameters {
    real alpha;
    real beta;
}
model {
    // priors (the second argument of normal() is a standard deviation, not a precision)
    alpha ~ normal(0,10);
    beta ~ normal(0,10);

    // likelihood (link data to some combination of parameters and more data)
    y ~ bernoulli_logit(alpha + beta * x);
}
generated quantities {
    // posterior predictions
    vector[n] y_predict;

    // the loop is necessary within the generated quantities block
    for(i in 1:n){
        y_predict[i] = bernoulli_logit_rng(alpha + beta * x[i]);
    }
}
"
## -------------------------------------------------- ##

## simulate x1 and set the "true" population values alpha and beta
n <- 100
x <- rnorm(n,0,1)
alpha <- 1.25
beta <- 2.50

## systematic component of the model
xb <- alpha + beta * x

## transform the linear term xb using
## the inverse logit function
## so that eta is bound from 0 to 1
eta <- 1 / (1 + exp(-xb))

## generate the dependent variable y with probability eta and measurement error from a Bernoulli trial
y <- rbinom(n, size=1, prob=eta)

table(y)

## create data list
data_list <- list(y = y, x=x, n=n)

## fit a logistic regression model with glm for comparison
summary(glm(y~x, family=binomial(link="logit")))

## fit stan model
fit <- stan(model_code = model, data = data_list, iter = 1000, chains = 4)

## extract draws from stan model object
output <- extract(fit, permuted = TRUE)

## print names
names(output)

## there are a number of methods to subset and summarize parameters
## keep in mind that the output object is a list that contains vectors or matrices of posterior estimates for each of the named parameters defined in the model statement above
## lapply (list-apply) a function to all of the objects in the list
lapply(output, mean)
lapply(output, sd)

## tabulate the simulated binary dependent variable; it should be very close to the mean value of the predicted y
table(y)

## create a matrix using some of the named slots in the list
model_parameters <- as.matrix(fit, pars = c("alpha", "beta"))
model_predictions <- as.matrix(fit, pars = "y_predict")

## check the dimensions (they should be the same)
dim(model_predictions)
dim(output$y_predict)

## plot the simulated y variable and the estimated posterior means
plot(apply(model_predictions,2,mean), y)

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2022/Day05_Measurement_RSTAN/RSTAN_mean_simulation.R:
--------------------------------------------------------------------------------
## RSTAN_mean_simulation.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: A Crash Course in Measurement and Latent Variable Modeling with RSTAN
-------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day05_Measurement_RSTAN/RSTAN_mean_simulation.R: -------------------------------------------------------------------------------- 1 | ## RSTAN_mean_simulation.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: A Crash Course in Measurement and Latent Variable Modeling with RSTAN 5 | ## 6 | ## Please e-mail me if you find any errors or have any suggestions 7 | ## e-mail: cjf0006@gmail.com 8 | ## 9 | ########################################################################## 10 | ## 11 | ## Introduction to tutorial: 12 | ## 13 | ## For this R tutorial we will use the normal distribution in STAN to estimate the mean value for an observed variable y. The parameter for the mean will be selected based on the likelihood function that links it to the data contained in the y variable. 14 | ## 15 | ########################################################################## 16 | 17 | ## load library 18 | library(rstan) # load rstan library 19 | library(MASS) # load library with truehist function 20 | 21 | ## -------------------------------------------------- # 22 | ## define STAN model as a character 23 | ## -------------------------------------------------- # 24 | model <- " 25 | data { 26 | // declare the data in memory 27 | int n; 28 | vector[n] y; 29 | } 30 | parameters { 31 | // declare the parameters in memory 32 | real mu; 33 | real sigma; 34 | } 35 | model { 36 | // there is no prior statement for sigma; 37 | // by default the prior on a parameter is flat unless we provide more information (mu gets a standard normal prior below; see the other examples) 38 | // likelihood (link data to some combination of parameters and more data) 39 | 40 | mu ~ normal(0,1); 41 | 42 | for(i in 1:n){ 43 | y[i] ~ normal(mu, sigma); 44 | } 45 | } 46 | generated quantities { 47 | // posterior predictions 48 | vector[n] y_predict; 49 | 50 | // the loop is necessary within the generated quantities block 51 | for(i in 1:n){ 52 | y_predict[i] = normal_rng(mu, sigma); 53 | } 54 | } 55 | " 56 | ## -------------------------------------------------- # 57 | 58 | 59 | ## set data for simulation 60 | #y <- 1:5 61 | y <- rep(1:5,200) 62 | 63 | n <- length(y) 64 | y 65 | n 66 | 67 | ## create data list 68 | data_list <- list(y = y, n=n) 69 | data_list 70 | 71 | ## set time start variable 72 | time1 <- Sys.time() 73 | 74 | # fit stan model 75 | fit <- stan(model_code = model, data = data_list, iter = 1000, chains = 4) 76 | 77 | ## calculate the duration of the program file up to this point 78 | print(Sys.time() - time1) 79 | 80 | ## extract draws from stan model object (creates a list object) 81 | output <- extract(fit, permuted = TRUE) 82 | 83 | ## print names of each element/slot in the list 84 | names(output) 85 | 86 | ## print model fit object 87 | fit 88 | 89 | ## lapply (list-apply) a function to all of the objects in the list 90 | lapply(output, mean) 91 | lapply(output, sd) 92 | 93 | truehist(output$mu) 94 |
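## For example, a quick check (using the objects above): the posterior mean of mu should sit near the sample mean of y
c(posterior_mean = mean(output$mu), sample_mean = mean(y))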
-------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day05_Measurement_RSTAN/RSTAN_normal_distribution_simulation.R: -------------------------------------------------------------------------------- 1 | ## RSTAN_normal_distribution_simulation.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: A Crash Course in Measurement and Latent Variable Modeling with RSTAN 5 | ## 6 | ## Please e-mail me if you find any errors or have any suggestions 7 | ## e-mail: cjf0006@gmail.com 8 | ## 9 | ########################################################################## 10 | ## 11 | ## Introduction to tutorial: 12 | ## 13 | ## For this R tutorial we will draw random samples from the normal distribution using the STAN program. 14 | ## This program is equivalent to using the rnorm function in R. 15 | ## 16 | ########################################################################## 17 | 18 | 19 | ## load library 20 | library(rstan) # load rstan library 21 | library(MASS) # load library with truehist function 22 | 23 | 24 | ## -------------------------------------------------- # 25 | ## define STAN model 26 | ## -------------------------------------------------- # 27 | model <- " 28 | 29 | parameters { 30 | real mu; 31 | } 32 | 33 | model { 34 | mu ~ normal(0,1); 35 | } 36 | " 37 | ## -------------------------------------------------- # 38 | 39 | 40 | ## set time start variable 41 | time1 <- Sys.time() 42 | 43 | ## fit stan model 44 | fit <- stan(model_code = model, iter = 1000, chains = 4) 45 | 46 | ## calculate the duration of the program file up to this point 47 | print(Sys.time() - time1) 48 | 49 | ## extract draws from stan model object (creates a list object) 50 | output <- extract(fit, permuted = TRUE) 51 | 52 | ## print names of each element/slot in the list 53 | names(output) 54 | 55 | ## print model fit object 56 | fit 57 | 58 | ## there are a number of methods to subset and summarize parameters 59 | ## keep in mind that the output object is a list that contains vectors or matrices of posterior estimates for each of the named parameters defined in the model statement above 60 | ## lapply (list-apply) a function to all of the objects in the list 61 | lapply(output, mean) 62 | lapply(output, sd) 63 | 64 | ## create a matrix using some of the named slots in the list 65 | model_parameters <- as.matrix(fit, pars = c("mu")) 66 | 67 | ## check the length of the vector of posterior draws 68 | length(output$mu) 69 | 70 | ## make a nice plot 71 | #par(mfrow=c(2,2)) 72 | truehist(output$mu) 73 | 74 |
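## For example, a quick visual check (using output$mu from above): the stan draws should line up with draws from rnorm
qqplot(output$mu, rnorm(length(output$mu)), main="stan draws vs. rnorm draws")
abline(0, 1)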
-------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day06_Text_as_data_programs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/Archive/R-Programs-lessons-2022/Day06_Text_as_data_programs/.DS_Store -------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day06_Text_as_data_programs/R_Demo_Ternary_Diriclet.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_Ternary_Diriclet.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 5 | ## University of Essex Summer School 2022 6 | ## 7 | ## Date: 2022-08-28 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | 15 | library(Ternary) 16 | library(MCMCpack) 17 | 18 | value1 <- value2 <- value3 <- value4 <- value5 <- value6 <- matrix(NA, nrow=1000, ncol=3) 19 | 20 | ## generate 6 different sets of 3D coordinates that exist within a simplex such that the coordinates for any point are proportions that always sum to 1 21 | value1[,1:3] <- rdirichlet(1000, alpha=c(1,1,5)) 22 | value2[,1:3] <- rdirichlet(1000, alpha=c(1,5,1)) 23 | value3[,1:3] <- rdirichlet(1000, alpha=c(10,10,1)) 24 | value4[,1:3] <- rdirichlet(1000, alpha=c(1,1,1)) 25 | value5[,1:3] <- rdirichlet(1000, alpha=c(10,10,10)) 26 | value6[,1:3] <- rdirichlet(1000, alpha=c(100,100,100)) 27 | 28 | head(value1[,1:3]) 29 | 30 | 31 | rdirichlet(10, alpha=c(1,1,1,1)) 32 | 33 | ## show that the coordinates within the simplex sum to 1 34 | table(apply(value1[,1:3],1,sum)) 35 | table(apply(value2[,1:3],1,sum)) 36 | table(apply(value3[,1:3],1,sum)) 37 | table(apply(value4[,1:3],1,sum)) 38 | table(apply(value5[,1:3],1,sum)) 39 | table(apply(value6[,1:3],1,sum)) 40 | 41 | ## test that the coordinates within the simplex sum to 1 42 | summary(apply(value1[,1:3],1,sum)) 43 | table(apply(value1[,1:3],1,sum)==1) 44 | 45 | ## unicode for right arrow: "\u2192" 46 | ## unicode for left arrow: "\u2190" 47 | 48 | par(mfrow=c(3,2), mar=c(1,1,1,1)) 49 | TernaryPlot(alab="Redder \u2192", blab="\u2190 Greener", clab="Bluer\u2192", 50 | point='right', lab.cex=0.8, grid.minor.lines = 0, 51 | grid.lty='solid', col=rgb(0.9, 0.9, 0.9), grid.col='white', 52 | axis.col=rgb(0.6, 0.6, 0.6), ticks.col=rgb(0.6, 0.6, 0.6), 53 | padding=0.08, main="simplex") 54 | TernaryPoints(value1, col=grey(.75)) 55 | 56 | TernaryPlot(alab="Redder \u2192", blab="\u2190 Greener", clab="Bluer\u2192", 57 | point='right', lab.cex=0.8, grid.minor.lines = 0, 58 | grid.lty='solid', col=rgb(0.9, 0.9, 0.9), grid.col='white', 59 | axis.col=rgb(0.6, 0.6, 0.6), ticks.col=rgb(0.6, 0.6, 0.6), 60 | padding=0.08, main="simplex") 61 | TernaryPoints(value2, col=grey(.75)) 62 | 63 | TernaryPlot(alab="Redder \u2192", blab="\u2190 Greener", clab="Bluer\u2192", 64 | point='right', lab.cex=0.8, grid.minor.lines = 0, 65 | grid.lty='solid', col=rgb(0.9, 0.9, 0.9), grid.col='white', 66 | axis.col=rgb(0.6, 0.6, 0.6), ticks.col=rgb(0.6, 0.6, 0.6), 67 | padding=0.08, main="simplex") 68 | TernaryPoints(value3, col=grey(.75)) 69 | 70 | TernaryPlot(alab="Redder \u2192", blab="\u2190 Greener", clab="Bluer\u2192", 71 | point='right', lab.cex=0.8, grid.minor.lines = 0, 72 | grid.lty='solid', col=rgb(0.9, 0.9, 0.9), grid.col='white', 73 | axis.col=rgb(0.6, 0.6, 0.6), ticks.col=rgb(0.6, 0.6, 0.6), 74 | padding=0.08, main="simplex") 75 | TernaryPoints(value4, col=grey(.75)) 76 | 77 | TernaryPlot(alab="Redder \u2192", blab="\u2190 Greener", clab="Bluer\u2192", 78 | point='right', lab.cex=0.8, grid.minor.lines = 0, 79 | grid.lty='solid', col=rgb(0.9, 0.9, 0.9), grid.col='white', 80 | axis.col=rgb(0.6, 0.6, 0.6), ticks.col=rgb(0.6, 0.6, 0.6), 81 | padding=0.08, main="simplex") 82 | TernaryPoints(value5, col=grey(.75)) 83 | 84 | TernaryPlot(alab="Redder \u2192", blab="\u2190 Greener", clab="Bluer\u2192", 85 | point='right', lab.cex=0.8, grid.minor.lines = 0, 86 | grid.lty='solid', col=rgb(0.9, 0.9, 0.9), grid.col='white', 87 | axis.col=rgb(0.6, 0.6, 0.6), ticks.col=rgb(0.6, 0.6, 0.6), 88 | padding=0.08, main="simplex") 89 | TernaryPoints(value6, col=grey(.75)) 90 | 91 |
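## For example, a Dirichlet draw is equivalent to normalizing a vector of independent gamma draws (a minimal sketch in base R; no MCMCpack needed):
g <- rgamma(3, shape=c(1,1,5))
g/sum(g) ## one draw with the same distribution as rdirichlet(1, alpha=c(1,1,5))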
-------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day06_Text_as_data_programs/R_Demo_text_as_data_DTM_stm_package.R: -------------------------------------------------------------------------------- 1 | #### R_Demo_text_as_data_DTM_stm_package.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 5 | ## University of Essex Summer School 2022 6 | ## 7 | ## Date: 2022-08-08 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## 14 | ## Introduction to tutorial: 15 | ## 16 | ## This tutorial replicates the processes demonstrated in the R_Demo_text_as_data_DTM.R file. 17 | ## 18 | ## This code uses functions available in the stm package (which builds on the tm package). 19 | ## 20 | ## The code is much faster at processing the text data though some of the steps are difficult to learn from these functions. 21 | ## 22 | ########################################################################## 23 | 24 | 25 | ## load libraries 26 | library(stm) 27 | library(tm) 28 | library(SnowballC) 29 | 30 | ## load data 31 | data <- read.csv("SIMpoliticalTweets.txt", header=FALSE) 32 | data 33 | names(data) <- "text" 34 | data 35 | 36 | #trumptweets <- fromJSON("trump_json_files_20190707.txt") 37 | #data <- trumptweets 38 | 39 | ## 40 | #data <- data[1:1000,] 41 | 42 | 43 | ## preprocess the documents 44 | ## This function uses functions from the tm package (see the tm Demo for more details) 45 | ## stem words and remove stop words 46 | prep <- textProcessor(documents=data$text, meta=data) 47 | 48 | ## list attributes 49 | attributes(prep) 50 | 51 | ## inspect 52 | head(prep$documents) 53 | head(prep$vocab) 54 | head(prep$meta) 55 | prep$docs.removed 56 | 57 | 58 | ## prepDocuments 59 | ## additional processing (removes some documents because of word frequencies greater than .99 or less than .01) 60 | out <- prepDocuments(prep$documents, prep$vocab, prep$meta) 61 | out 62 | 63 | ## inspect 64 | head(out$documents) 65 | head(out$vocab) 66 | head(out$meta) 67 | 68 | ## fit a structural topic model 69 | fit <- stm(documents=out$documents, vocab=out$vocab, data=out$meta, K=10) 70 | 71 | ## inspect attributes 72 | attributes(fit) 73 | 74 | dim(fit$theta) 75 | 76 | ## display topic probabilities 77 | fit$theta 78 | 79 | apply(fit$theta, 1, sum) 80 | 81 | out$meta 82 |
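## For example, a quick look at the most probable words in each of the 10 topics (using the fit object above):
labelTopics(fit, n=5)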
-------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day06_Text_as_data_programs/R_Demo_text_as_data_DTM_tm_package.R: -------------------------------------------------------------------------------- 1 | #### R_Demo_text_as_data_DTM_tm_package.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 5 | ## University of Essex Summer School 2022 6 | ## 7 | ## Date: 2022-08-08 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## 14 | ## Introduction to tutorial: 15 | ## 16 | ## This tutorial replicates the processes demonstrated in the R_Demo_text_as_data_DTM.R file. 17 | ## 18 | ## This code uses functions available in the tm package. 19 | ## 20 | ## The code is much faster at processing the text data though some of the steps are difficult to learn from these functions. 21 | ## 22 | ########################################################################## 23 | 24 | ## load libraries 25 | library(stm) 26 | library(tm) 27 | library(SnowballC) 28 | 29 | ## load data 30 | data <- read.csv("SIMpoliticalTweets.txt", header=FALSE) 31 | data 32 | names(data) <- "text" 33 | data 34 | 35 | #trumptweets <- fromJSON("trump_json_files_20190707.txt") 36 | #data <- trumptweets 37 | 38 | ## 39 | ##data <- data[1:1000,] 40 | 41 | ## create character vector for processing 42 | newtext <- as.character(data$text) 43 | length(newtext) 44 | 45 | 46 | ## use gsub to remove special characters that usually cause errors if left for later 47 | newtext <- gsub("[^0-9A-Za-z///' ]", "", newtext) 48 | newtext <- gsub("[^[:alnum:]///' ]", "", newtext) 49 | newtext <- gsub("[^\x20-\x7F\x0D\x0A]", "", newtext) # remove all non-ascii characters 50 | newtext <- gsub("http.*", "", newtext) # remove all of the urls 51 | newtext <- gsub("www.*", "", newtext) # remove the www addresses 52 | 53 | ## data$newtext 54 | data$newtext <- newtext 55 | 56 | ## convert to corpus object using additional functions from the tm package 57 | ## the tm_map function takes the corpus as its first argument and a transformation function as its second argument 58 | corpus <- Corpus(VectorSource(newtext)) 59 | corpus <- tm_map(corpus, removePunctuation) 60 | corpus <- tm_map(corpus, removeNumbers) 61 | corpus <- tm_map(corpus, stripWhitespace) 62 | corpus <- tm_map(corpus, tolower) 63 | corpus <- tm_map(corpus, removeWords, stopwords("english")) 64 | corpus <- tm_map(corpus, stemDocument) 65 | 66 | ## print to screen 67 | inspect(corpus[1:10]) 68 | 69 | 70 | ## make document by term matrix 71 | DTM <- DocumentTermMatrix(corpus) 72 | DTM 73 | 74 | 75 | ## print DTM to screen 76 | inspect(DTM) 77 | inspect(DTM[1:10,1:25]) 78 |
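## For example, a minimal sketch of word frequencies computed from the DTM above:
word_counts <- sort(colSums(as.matrix(DTM)), decreasing=TRUE)
head(word_counts) ## most frequent terms
tail(word_counts) ## least frequent terms
barplot(head(word_counts, 10), las=2)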
-------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day06_Text_as_data_programs/R_Demo_text_as_data_twitteR_get_twitter_users.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_text_as_data_twitteR_get_twitter_users.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 5 | ## University of Essex Summer School 2022 6 | ## 7 | ## Date: 2022-08-08 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | ## 15 | ## setup to access the twitter API using R 16 | ## 17 | ## Create a Twitter application at http://dev.twitter.com. Make sure to give the app read, write and direct message authority. 18 | ## Take note of the following values from the Twitter app page: "API key", "API secret", "Access token", and "Access token secret". 19 | 20 | ########################################################################## 21 | 22 | #install.packages("twitteR") 23 | library(twitteR) 24 | 25 | 26 | ## (1a) Go to twitter's developer page: https://developer.twitter.com/ 27 | ## (1b) Click to create an App 28 | ## (1c) For the description, you can say something like this "learn to use the twitteR API for a college data science course at the University of Michigan." 29 | ## 30 | ## (2) Once you have successfully set up your app, you will be able to get the four strings that you will fill in below 31 | ## 32 | ## Take note of the following values from the Twitter app page: "API key", "API secret", "Access token", and "Access token secret". 33 | ## 34 | ## set keys and tokens to access the twitter API 35 | consumer_key <- "your_consumer_key" 36 | consumer_secret <- "your_consumer_secret" 37 | access_token <- "your_access_token" 38 | access_secret <- "your_access_secret" 39 | 40 | setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret) 41 | 42 | ## (3) Run the following code to see if you successfully created your account 43 | amnestyusa_tweets <- userTimeline('amnestyusa', n=100) 44 | ## 45 | ## (4) Pick a Twitter account from your country of interest (or you can use the amnestyusa account from above) 46 | ## 47 | ## (5) Make a DTM (Document-by-Term matrix) for the first 100 tweets from the account you have selected or the amnestyusa account 48 | ## 49 | ## (6a) Which words are most commonly used in the corpus of 100 tweets? 50 | ## (6b) Which words are the least commonly used in the corpus of 100 tweets? 51 | ## (6c) Create a barplot of the most frequent words 52 | 53 | 54 | ########################################################################## 55 | ## additional examples: 56 | ## 57 | ## grab the most recent 100 tweets from Barack Obama 58 | Obama_ut <- userTimeline('barackobama', n=100) 59 | 60 | ## grab the max number of tweets for the President of Senegal Macky Sall 61 | Macky_Sall_ut <- userTimeline('Macky_Sall', n=3200) 62 |
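## For example, a minimal sketch for steps (5)-(6c), assuming the API calls above succeed (twListToDF is from the twitteR package; the DTM recipe follows the tm demo above):
library(tm)
tweet_text <- twListToDF(amnestyusa_tweets)$text
tweet_DTM <- DocumentTermMatrix(Corpus(VectorSource(tweet_text)), control=list(tolower=TRUE, removePunctuation=TRUE, stopwords=TRUE))
tweet_counts <- sort(colSums(as.matrix(tweet_DTM)), decreasing=TRUE)
head(tweet_counts) ## (6a) most common words
tail(tweet_counts) ## (6b) least common words
barplot(head(tweet_counts, 10), las=2) ## (6c) barplot of the most frequent words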
-------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day06_Text_as_data_programs/R_Demo_text_as_data_twitteR_get_twitter_users_Oauth_v2.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_text_as_data_twitteR_get_twitter_users_Oauth_v2.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 5 | ## University of Essex Summer School 2022 6 | ## 7 | ## Date: 2022-08-08 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | ## 15 | ## setup to access the twitter API using R 16 | ## 17 | ## Create a Twitter application at http://dev.twitter.com. Make sure to give the app read, write and direct message authority. 18 | ## Take note of the following values from the Twitter app page: "API key", "API secret", "Access token", and "Access token secret". 19 | 20 | ########################################################################## 21 | 22 | #install.packages("twitteR") 23 | library(twitteR) 24 | 25 | 26 | ## (1a) Go to twitter's developer page: https://developer.twitter.com/ 27 | ## SEE THIS LINK: https://developer.twitter.com/en/docs/tutorials/getting-started-with-r-and-v2-of-the-twitter-api 28 | ## (1b) Click to create an App 29 | ## (1c) For the description, you can say something like this "learn to use the twitteR API for a college data science course at the University of Michigan." 30 | ## 31 | ## (2) Once you have successfully set up your app, you will be able to get the four strings that you will fill in below 32 | ## 33 | ## Take note of the following values from the Twitter app page: "API key", "API secret", "Access token", and "Access token secret". 34 | ## 35 | ## set keys and tokens to access the twitter API 36 | consumer_key <- "your_consumer_key" 37 | consumer_secret <- "your_consumer_secret" 38 | access_token <- "your_access_token" 39 | access_secret <- "your_access_secret" 40 | 41 | setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret) 42 | 43 | ## (3) Run the following code to see if you successfully created your account 44 | amnestyusa_tweets <- userTimeline('amnestyusa', n=100) 45 | ## 46 | ## (4) Pick a Twitter account from your country of interest (or you can use the amnestyusa account from above) 47 | ## 48 | ## (5) Make a DTM (Document-by-Term matrix) for the first 100 tweets from the account you have selected or the amnestyusa account 49 | ## 50 | ## (6a) Which words are most commonly used in the corpus of 100 tweets? 51 | ## (6b) Which words are the least commonly used in the corpus of 100 tweets? 52 | ## (6c) Create a barplot of the most frequent words 53 | 54 | 55 | ########################################################################## 56 | ## additional examples: 57 | ## 58 | ## grab the most recent 100 tweets from Barack Obama 59 | Obama_ut <- userTimeline('barackobama', n=100) 60 | 61 | ## grab the max number of tweets for the President of Senegal Macky Sall 62 | Macky_Sall_ut <- userTimeline('Macky_Sall', n=3200) 63 |
-------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day06_Text_as_data_programs/R_Demo_text_as_wikip_stm.R: -------------------------------------------------------------------------------- 1 | ########################################################################## 2 | ## INSTRUCTOR: Christopher Fariss 3 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 4 | ## University of Essex Summer School 2021 5 | ## 6 | ## Date: 2022-08-08 7 | ## 8 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 9 | ## e-mail: cjf0006@gmail.com 10 | ## e-mail: cjfariss@umich.edu 11 | ########################################################################## 12 | ## Introduction to tutorial: 13 | ## 14 | ## This tutorial demonstrates how the Structural Topic Model functions for a real set of documents. 15 | ## 16 | ## Each unit in the dataset represents a randomly selected sentence from Wikipedia. 17 | ## 18 | ########################################################################## 19 | 20 | library(stm) 21 | 22 | wikip_word_dat <- readLines("one_meelyun_sentences.txt") 23 | head(wikip_word_dat) 24 | 25 | n <- length(wikip_word_dat) 26 | n 27 | 28 | wikip_word_dat <- data.frame(id=1:n, text=wikip_word_dat) 29 | head(wikip_word_dat) 30 | 31 | 32 | wikip_word_dat$human_rights_test <- as.numeric(grepl("human rights", wikip_word_dat$text)) 33 | table(wikip_word_dat$human_rights_test) 34 | 35 | wikip_word_dat$civil_rights_test <- as.numeric(grepl("civil rights", wikip_word_dat$text)) 36 | table(wikip_word_dat$civil_rights_test) 37 | 38 | ## preprocess the documents 39 | ## This function uses functions from the tm package (see the tm Demo for more details) 40 | prep <- textProcessor(documents=wikip_word_dat$text, meta=wikip_word_dat) 41 | 42 | save(prep, file="wikip_word_dat_stm_prep.Rdata") ## save the textProcessor output (prep is the only object created at this point) 43 | 44 | ## inspect 45 | head(prep$documents) 46 | head(prep$vocab) 47 | head(prep$meta) 48 | 49 | 50 | ## prepDocuments 51 | ## additional processing of the stemmed, stop-word-removed documents created by textProcessor above 52 | out <- prepDocuments(prep$documents, prep$vocab, prep$meta) 53 | 54 | save(out, file="wikip_word_dat_stm_out.Rdata") 55 | 56 | ## inspect 57 | head(out$documents) 58 | head(out$vocab) 59 | head(out$meta) 60 | 61 | ## fit a structural topic model 62 | #fit <- stm(documents=out$documents, vocab=out$vocab, data=out$meta, K=20) 63 | #fit <- stm(documents=out$documents, vocab=out$vocab, data=out$meta, K=40) 64 | fit <- stm(documents=out$documents, vocab=out$vocab, data=out$meta, K=100) 65 | 66 | #fit_2 <- fit 67 | #fit_3 <- fit 68 | #fit_10 <- fit 69 | #fit_20 <- fit 70 | fit_100 <- fit 71 | 72 | ## display topic probabilities 73 | head(fit$theta) 74 | 75 | dim(fit$theta) 76 | summary(head(fit$theta, 1000)) 77 | 78 | #save(fit, file="wikip_word_dat_stm20.Rdata") 79 | #save(fit, file="wikip_word_dat_stm40.Rdata") 80 | save(fit, file="wikip_word_dat_stm100.Rdata") 81 | 82 | 83 | -------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day06_Text_as_data_programs/google_trends_stuff/R_Demo_google_trends_Texas.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_google_trends_Texas.R 2 | ######################################################################### 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 5 | ## University of Essex Summer School 2022 6 | ## 7 | ## Date: 2022-08-28 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | rm(list=ls()) 15 | 16 | ## load libraries 17 | library(gtrendsR) 18 | library(countrycode) 19 | library(stm) 20 | library(tm) 21 | library(MASS) 22 | library(RColorBrewer) ## assumption: the intended CRAN package is RColorBrewer 23 | library(bcp) 24 | 25 | 26 | COLORS <- c("#fdae61", "#a6cee3", "#1f78b4", "#b2df8a", "#33a02c") 27 | 28 | 29 | #pdf("Texas_google_trends.pdf", height=5, width=6) 30 | 31 | #plot.new() 32 | 33 | TERMS <- list(c("%2Fg%2F11f06hfpld", "%2Fg%2F11nycqk4xh"), 34 | c("%2Fg%2F11f06hfpld", "%2Fm%2F03m1n"), 35 | c("%2Fm%2F03m1n", "%2Fg%2F11nycqk4xh"), 36 | c("%2Fg%2F11nycqk4xh", "%2Fm%2F01rbhn"), 37 | c("%2Fg%2F11f06hfpld", "%2Fm%2F01rbhn"), 38 | c("%2Fm%2F03m1n", "%2Fm%2F01rbhn") 39 | ) 40 | length(TERMS) 41 | 42 | TERMS_names <- list(c("Hurricane Harvey",
"Winter Storm"), 43 | c("Hurricane Harvey", "Astros"), 44 | c("Astros", "Winter Storm"), 45 | c("Winter Storm", "Power Outage"), 46 | c("Hurricane Harvey", "Power Outage"), 47 | c("Astros", "Power Outage") 48 | ) 49 | 50 | #par(mfrow=c(3,3), mar=c(2,2.5,1,.5)) 51 | par(mfrow=c(1,1), mar=c(2,2.5,1,.5)) 52 | 53 | for(i in 1:6){ 54 | 55 | world <- gtrends(TERMS[[i]], geo="US-TX", time="2017-01-01 2021-12-31", low_search_volume=T)$interest_over_time 56 | 57 | world$hits[world$hits=="<1"] <- .5 58 | world$hits <- as.numeric(world$hits) 59 | 60 | plot(world$hits[world$keyword==TERMS[[i]][1]], main=paste("Texas:", TERMS_names[[i]][1], "vs.", TERMS_names[[i]][2]), lwd=1, col=grey(.75), ylim=c(0, 100), xaxt="n", yaxt="n", type="n") 61 | #lines(world$hits[world$keyword==TERMS[[i]][2]], type="l", ylim=c(0,100), col=2) 62 | #plot.window(xlim=c(1,length(world$hits[world$keyword==TERMS[[i]][1]])), ylim=c(0, 100)) 63 | #mtext(side=3, text=paste("Global:", TERMS[[i]][1], "vs.", TERMS[[i]][2]), line=1) 64 | 65 | id2017 <- which(as.Date(world$date[world$keyword==TERMS[[i]][1]]) > as.Date("2016-12-31") & as.Date(world$date[world$keyword==TERMS[[i]][1]]) <= as.Date("2017-12-31")) 66 | id2018 <- which(as.Date(world$date[world$keyword==TERMS[[i]][1]]) > as.Date("2017-12-31") & as.Date(world$date[world$keyword==TERMS[[i]][1]]) <= as.Date("2018-12-31")) 67 | id2019 <- which(as.Date(world$date[world$keyword==TERMS[[i]][1]]) > as.Date("2018-12-31") & as.Date(world$date[world$keyword==TERMS[[i]][1]]) <= as.Date("2019-12-31")) 68 | id2020 <- which(as.Date(world$date[world$keyword==TERMS[[i]][1]]) > as.Date("2019-12-31") & as.Date(world$date[world$keyword==TERMS[[i]][1]]) <= as.Date("2020-12-31")) 69 | id2021 <- which(as.Date(world$date[world$keyword==TERMS[[i]][1]]) > as.Date("2020-12-31") & as.Date(world$date[world$keyword==TERMS[[i]][1]]) <= as.Date("2021-12-31")) 70 | 71 | polygon(x=c(min(id2017), min(id2017), max(id2017), max(id2017)), y=c(-10,110,110,-10), col=grey(.95), border=NA) 72 | polygon(x=c(min(id2019), min(id2019), max(id2019), max(id2019)), y=c(-10,110,110,-10), col=grey(.95), border=NA) 73 | polygon(x=c(min(id2021), min(id2021), max(id2021), max(id2021)), y=c(-10,110,110,-10), col=grey(.95), border=NA) 74 | box() 75 | 76 | lines(world$hits[world$keyword==TERMS[[i]][1]], lwd=1, col="#c2a5cf") 77 | lines(world$hits[world$keyword==TERMS[[i]][2]], lwd=1, col="#a6dba0") 78 | axis(side=2, at=c(0,25,50,75,100), las=2) 79 | axis(side=1, at=c(median(id2017), median(id2018), median(id2019), median(id2020), median(id2021)), labels=c(2017:2021), las=1) 80 | model <- bcp(y=world$hits[world$keyword==TERMS[[i]][1]]) 81 | lines(model$posterior.mean, lwd=.75, col="#7b3294") 82 | model <- bcp(y=world$hits[world$keyword==TERMS[[i]][2]]) 83 | lines(model$posterior.mean, lwd=.75, col="#008837") 84 | 85 | legend("topleft", legend=c(TERMS_names[[i]][1]), text.col="#7b3294" ,bty="n") 86 | legend("bottomleft", legend=c(TERMS_names[[i]][2]), text.col="#008837", bty="n") 87 | 88 | } 89 | 90 | #dev.off() 91 | -------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day06_Text_as_data_programs/google_trends_stuff/R_Demo_google_trends_trump_brexit.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_google_trends_trump_brexit.R 2 | ######################################################################### 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 
5 | ## University of Essex Summer School 2022 6 | ## 7 | ## Date: 2022-08-28 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | 15 | ## clean up workspace 16 | rm(list = ls(all.names = TRUE)) 17 | gc() 18 | 19 | ## load libraries 20 | library(gtrendsR) 21 | library(countrycode) 22 | library(stm) 23 | library(tm) 24 | library(MASS) 25 | library(RColorBrewer) ## assumption: the intended CRAN package is RColorBrewer 26 | 27 | ## country codes 28 | data("countries") 29 | ISO <- as.character(unique(countries$country_code)) 30 | COUNTRY <- countrycode(ISO, origin="iso2c", destination="country.name") 31 | 32 | 33 | COLORS <- c("#fdae61", "#a6cee3", "#1f78b4", "#b2df8a", "#33a02c") 34 | 35 | 36 | ## 37 | TERMS <- c("Donald Trump", "Brexit") 38 | 39 | 40 | 41 | world <- gtrends(TERMS[c(1,2)]) 42 | plot(world) 43 | 44 | 45 | -------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day06_Text_as_data_programs/google_trends_stuff/groundhog_library_func.R: -------------------------------------------------------------------------------- 1 | 2 | groundhog_library_func <- function(groundhog=FALSE, regular_install=FALSE){ 3 | ## Do this (set to TRUE) to load libraries using the version from when the scripts were originally run 4 | if(groundhog){ 5 | ## load an older version of the libraries 6 | remotes::install_github('CredibilityLab/groundhog') 7 | library(groundhog) 8 | pkgs <- c("gtrendsR", "countrycode", "stm", "tm", "MASS", "bcp", "ngramr", "rvest", "plm", "lmtest", "WDI", "boot", "forecast", "acled.api", "ggplot2", "stargazer", "httr", "lubridate", "xtable") 9 | groundhog.library(pkgs,'2022-05-23') 10 | } else if(regular_install==TRUE){ 11 | ## or install and load the more recent version of the libraries 12 | install.packages(c("gtrendsR", "countrycode", "stm", "tm", "MASS", "bcp", "ngramr", "rvest", "plm", "lmtest", "WDI", "boot", "forecast", "acled.api", "ggplot2", "stargazer", "httr", "lubridate", "xtable")) ## install.packages() takes the package names as a single character vector 13 | library(gtrendsR) 14 | library(countrycode) 15 | library(stm) 16 | library(tm) 17 | library(MASS) 18 | library(bcp) 19 | library(ngramr) 20 | library(rvest) 21 | library(plm) 22 | library(lmtest) 23 | library(WDI) 24 | library(boot) 25 | library(forecast) 26 | library(acled.api) 27 | library(ggplot2) 28 | library(stargazer) 29 | library(httr) 30 | library(lubridate) 31 | library(xtable) 32 | } else{ 33 | ## or just load the more recent version of the libraries 34 | library(gtrendsR) 35 | library(countrycode) 36 | library(stm) 37 | library(tm) 38 | library(MASS) 39 | library(bcp) 40 | library(ngramr) 41 | library(rvest) 42 | library(plm) 43 | library(lmtest) 44 | library(WDI) 45 | library(boot) 46 | library(forecast) 47 | library(acled.api) 48 | library(ggplot2) 49 | library(stargazer) 50 | library(httr) 51 | library(lubridate) 52 | library(xtable) 53 | } 54 | } 55 | 56 |
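## For example, a minimal usage sketch (an assumption about the intended call pattern, shown commented out because each call installs or loads many packages):
## groundhog_library_func(groundhog=TRUE)         ## load package versions pinned to 2022-05-23
## groundhog_library_func(regular_install=TRUE)   ## install and then load current versions
## groundhog_library_func()                       ## just load current versions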
-------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day07_Applied_Machine_Learning_AML_Intro/R_Demo_AML_neuralnet_gradiant_decent_glm.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_AML_neuralnet_gradiant_decent_glm.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 5 | ## University of Essex Summer School 2022 6 | ## 7 | ## Date: 2022-08-08 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | ## 15 | ## Overview of Gradient Descent for a generalized linear model example. 16 | ## 17 | ## Gradient descent is an iterative process used to find the best parameter value(s) in any model and especially a neural network. 18 | ## 19 | ########################################################################## 20 | 21 | ## load libraries; then set the learning rate, which varies on the unit interval (0 to 1] 22 | library(MASS) 23 | library(boot) 24 | library(gtools) 25 | 26 | lr <- .2 27 | n <- 100 28 | x <- rnorm(n) 29 | 30 | alpha <- -1 31 | beta <- -2 32 | 33 | y <- rbinom(1:n, 1, prob=inv.logit(alpha + beta*x)) 34 | #y <- rbinom(1:n, 1, prob=inv.logit(alpha + beta*x + rnorm(n))) 35 | 36 | ## Start with a random guess 37 | X_mat <- cbind(1,x) 38 | 39 | alpha_hat <- 4 40 | beta_hat <- 4 41 | iterations <- 1000 42 | 43 | loss <- variance <- NA 44 | 45 | y_hat <- matrix(NA, nrow=n, ncol=iterations) 46 | y_error <- matrix(NA, nrow=n, ncol=iterations) 47 | delta <- matrix(NA, nrow=2, ncol=iterations) 48 | 49 | ## sequential iterations to evaluate the loss function 50 | for (j in 1:iterations){ 51 | 52 | ## calculate the predicted y_hat based on the observed x variable and the best guess of alpha and beta 53 | y_hat[,j] <- inv.logit(alpha_hat[j] + beta_hat[j] * x) 54 | 55 | ## this works too but not as well as above 56 | #y_hat[,j] <- rbinom(1:n,1,prob=inv.logit(alpha_hat[j] + beta_hat[j] * x)) 57 | 58 | ## difference between the predicted y (y_hat) and y is the error for y 59 | y_error[,j] <- y_hat[,j] - y 60 | 61 | ## the estimated error is used to calculate the unexplained variance between y and y_hat, which is the sum of squared errors 62 | loss[j] <- sum(y_error[,j]^2) 63 | 64 | ## calculate the gradient at that point (this works because the errors are independent of the column vectors in X) 65 | ##delta[1:2,j] <- (t(X_mat) %*% y_error[,j]) * (1/n) 66 | delta[1,j] <- sum(X_mat[,1] * y_error[,j])/n 67 | delta[2,j] <- sum(X_mat[,2] * y_error[,j])/n 68 | 69 | ## shift estimates along the gradient (+ function if y-y_hat; - function if y_hat-y) 70 | alpha_hat[j+1] <- alpha_hat[j] - (lr*delta[1,j]) 71 | beta_hat[j+1] <- beta_hat[j] - (lr*delta[2,j]) 72 | } 73 | 74 | ## estimate glm model 75 | fit <- glm(y~x, family=binomial("logit")) 76 | fit 77 | 78 | ## print the last value of the sequence of parameter estimates 79 | alpha_hat[length(alpha_hat)] 80 | beta_hat[length(beta_hat)] 81 | 82 | 83 | ## graph the values as a function of the loss statistic 84 | ALPHA_seq <- seq(from=-4,4,.05) 85 | BETA_seq <- seq(from=-4,4,.05) 86 | 87 | LOSS <- matrix(NA, nrow=length(ALPHA_seq), ncol=length(BETA_seq)) 88 | 89 | ## grid search for graphing 90 | for(i in 1:length(ALPHA_seq)){ 91 | for(j in 1:length(BETA_seq)){ 92 | Y_hat <- inv.logit(ALPHA_seq[i] + BETA_seq[j] * X_mat[,2]) 93 | LOSS[i,j] <- sum((y-Y_hat)^2) 94 | 95 | } 96 | } 97 | 98 | ## plot the loss surface 99 | contour(ALPHA_seq, BETA_seq, LOSS, xlab=expression(hat(alpha)), ylab=expression(hat(beta)), cex.lab=1.5) 100 | lines(alpha_hat, beta_hat, col=2, lwd=4) 101 | 102 |
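## For example, the loss should fall toward its minimum along the gradient path (using the loss vector computed above):
plot(loss, type="l", xlab="iteration", ylab="sum of squared errors")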
-------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day07_Applied_Machine_Learning_AML_Intro/R_Demo_AML_neuralnet_gradiant_decent_lm.R: -------------------------------------------------------------------------------- 1 | #### R_Demo_AML_neuralnet_gradiant_decent_lm.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 5 | ## University of Essex Summer School 2022 6 | ## 7 | ## Date: 2022-08-08 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | ## 15 | ## Overview of Gradient Descent for a linear model with two parameters (intercept and slope). 16 | ## 17 | ## Gradient descent is an iterative process used to find the best parameter value(s) in any model and especially a neural network. 18 | ## 19 | ########################################################################## 20 | 21 | 22 | ## load libraries 23 | library(MASS) 24 | 25 | ## set the learning rate; it varies on the unit interval (0 to 1] 26 | lr <- .2 27 | 28 | n <- 100 29 | x <- rnorm(n) 30 | 31 | alpha <- -1 32 | beta <- -2 33 | 34 | y <- alpha + beta*x + rnorm(n) 35 | plot(x=x, y=y) 36 | fit <- lm(y ~ x) 37 | summary(fit) 38 | 39 | 40 | X_mat <- cbind(1,x) 41 | head(X_mat) 42 | 43 | # Start with a random guess 44 | alpha_hat <- 1 45 | beta_hat <- 4 46 | 47 | 48 | iterations <- 100 49 | 50 | loss <- NA 51 | variance <- NA 52 | 53 | y_hat <- matrix(NA, nrow=n, ncol=iterations) 54 | y_error <- matrix(NA, nrow=n, ncol=iterations) 55 | delta <- matrix(NA, nrow=2, ncol=iterations) 56 | 57 | ## sequential iterations to evaluate the loss function 58 | for (j in 1:iterations){ 59 | 60 | ## calculate the predicted y_hat based on the observed x variable and the best guess of alpha and beta 61 | y_hat[,j] <- alpha_hat[j] + beta_hat[j] * x 62 | 63 | ## difference between the predicted y (y_hat) and y is the error for y 64 | y_error[,j] <- y_hat[,j] - y 65 | 66 | ## the estimated error is used to calculate the unexplained variance between y and y_hat, which is the sum of squared errors 67 | loss[j] <- sum(y_error[,j]^2) 68 | 69 | ## calculate the gradient at that point (this works because the errors are independent of the column vectors in X) 70 | ##delta[1:2,j] <- (t(X_mat) %*% y_error[,j]) * (1/n) 71 | 72 | ## 73 | delta[1,j] <- sum(X_mat[,1] * y_error[,j])/n 74 | delta[2,j] <- sum(X_mat[,2] * y_error[,j])/n 75 | 76 | ## shift estimates along the gradient (+ function if y-y_hat; - function if y_hat-y) 77 | alpha_hat[j+1] <- alpha_hat[j] - (lr*delta[1,j]) 78 | beta_hat[j+1] <- beta_hat[j] - (lr*delta[2,j]) 79 | } 80 | 81 | ## print lm summary from above 82 | summary(fit) 83 | 84 | ## print the last value of the sequence of parameter estimates 85 | alpha_hat[length(alpha_hat)] 86 | beta_hat[length(beta_hat)] 87 | 88 | 89 | ## the least squares solution 90 | solve(t(X_mat) %*% X_mat) %*% t(X_mat) %*% y 91 | 92 | ## note that these give distinct regression models because the covariance between alpha and beta is not included (so they are different from the one above) 93 | solve(t(X_mat[,2]) %*% X_mat[,2]) %*% t(X_mat[,2]) %*% y 94 | solve(t(X_mat[,1]) %*% X_mat[,1]) %*% t(X_mat[,1]) %*% y 95 | 96 |
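## For example, a quick check (using the objects above): lm(), the closed-form least squares solution, and gradient descent should all agree
cbind(lm = coef(fit), gradient_descent = c(alpha_hat[length(alpha_hat)], beta_hat[length(beta_hat)]))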
97 | ## graph the values as a function of the loss statistic 98 | ALPHA_seq <- seq(from=-4,4,.05) 99 | BETA_seq <- seq(from=-4,4,.05) 100 | 101 | LOSS <- matrix(NA, nrow=length(ALPHA_seq), ncol=length(BETA_seq)) 102 | 103 | ## grid search for graphing 104 | for(i in 1:length(ALPHA_seq)){ 105 | for(j in 1:length(BETA_seq)){ 106 | Y_hat <- ALPHA_seq[i] + BETA_seq[j] * X_mat[,2] 107 | LOSS[i,j] <- sum((y-Y_hat)^2) 108 | } 109 | } 110 | 111 | ## plot the log loss 112 | par(mar=c(5,5,1,1)) 113 | contour(ALPHA_seq, BETA_seq, log(LOSS), xlab=expression(hat(alpha)), ylab=expression(hat(beta)), cex.lab=1.5) 114 | lines(alpha_hat, beta_hat, col=2, lwd=4) 115 | 116 | 117 | -------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day07_Applied_Machine_Learning_AML_Intro/R_Demo_AML_neuralnet_gradiant_decent_mu.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_AML_neuralnet_gradiant_decent_mu.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 5 | ## University of Essex Summer School 2022 6 | ## 7 | ## Date: 2022-08-08 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | ## 15 | ## Overview of Gradient Descent for a model with one parameter for a mean. 16 | ## 17 | ## Gradient descent is an iterative process used to find the best parameter value(s) in any model and especially a neural network. 18 | ## 19 | ########################################################################## 20 | 21 | 22 | 23 | 24 | 25 | ## load libraries 26 | library(MASS) 27 | 28 | ## set the learning rate; it varies on the unit interval (0 to 1] 29 | lr <- .2 30 | #lr <- .01 31 | #lr <- 1 32 | 33 | y <- c(1,2,3,4,5) 34 | ##y <- rnorm(1000, pi, 1) 35 | n <- length(y) 36 | 37 | y 38 | n 39 | 40 | 41 | iterations <- 100 42 | 43 | loss <- NA 44 | 45 | mu_hat <- -1.5 46 | 47 | y_hat <- matrix(NA, nrow=n, ncol=iterations) 48 | y_error <- matrix(NA, nrow=n, ncol=iterations) 49 | 50 | delta_mu_hat <- NA 51 | 52 | ## sequential iterations to evaluate the loss function 53 | for (j in 1:iterations){ 54 | 55 | ## y_hat: calculate the predicted y_hat based on the best guess of mu 56 | 57 | y_hat[,j] <- mu_hat[j] 58 | 59 | ## residual: the difference between the predicted y (y_hat) and y is the error for y 60 | y_error[,j] <- y_hat[,j] - y 61 | 62 | ## loss: the estimated error is used to calculate the unexplained variance between y and y_hat, which is the sum of squared errors 63 | loss[j] <- sum(y_error[,j]^2) 64 | 65 | ## the average error is the gradient of the loss at the current estimate (up to a constant factor) 66 | delta_mu_hat[j] <- sum(y_error[,j])/n 67 | 68 | ## shift estimates along the gradient (+ function if y-y_hat; - function if y_hat-y) 69 | mu_hat[j+1] <- mu_hat[j] - (lr*delta_mu_hat[j]) 70 | } 71 | 72 | mu_hat[length(mu_hat)] 73 | 74 | plot(mu_hat) 75 | 76 | plot(mu_hat[-1], delta_mu_hat) 77 |
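## For example, gradient descent should converge to the sample mean, which is the closed-form minimizer of the squared error loss:
c(gradient_descent = mu_hat[length(mu_hat)], sample_mean = mean(y))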
-------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day08_Applied_Machine_Learning_AML_HuggingFace/R_Demo_huggingface_intro.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_huggingface_intro.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (3B or 2F) 5 | ## University of Essex Summer School 2022 6 | ## 7 | ## Date: 2022-08-17 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## 14 | ## Introduction to tutorial: 15 | ## 16 | ## We are going to try out a number of pretrained Large Language Models --- LLMs --- that are publicly available at 🤗 17 | ## 18 | ## 🤗 huggingface models: https://huggingface.co/models 19 | ## 20 | ## 🤗 huggingface datasets: https://huggingface.co/datasets 21 | ## 22 | ## Bert LLM (and many others) 23 | ## https://huggingface.co/docs/transformers/v4.20.1/en/model_doc/bert 24 | ## 25 | ## BloomLLM 26 | ## BigScience Large Open-science Open-access Multilingual Language Model 27 | ## https://huggingface.co/bigscience/bloom 28 | ## 29 | ## more examples with R 30 | ## https://rpubs.com/eR_ic/transfoRmers 31 | ## 32 | ## for loading Python libraries and setting up pipelines (functions) 33 | ## https://rstudio.github.io/reticulate/reference/py_install.html 34 | ## for details see https://rstudio.github.io/reticulate/ 35 | ## 36 | ########################################################################## 37 | ## 38 | ## install.packages("reticulate") 39 | 40 | ## load R libraries 41 | library(reticulate) 42 | library(keras) 43 | library(tensorflow) 44 | library(dplyr) 45 | library(tfdatasets) 46 | library(torch) 47 | 48 | ## load Python libraries using the reticulate package 49 | reticulate::py_install("transformers", pip = TRUE) 50 | 51 | ## also try these 52 | reticulate::py_install("torch", pip = TRUE) ## assumption: the pip package name for PyTorch is "torch" 53 | reticulate::py_install("tensorflow", pip = TRUE) 54 | 55 | ## or try this 56 | #tensorflow::install_tensorflow() 57 | 58 | # Importing 🤗 transformers module into R session using the Python-to-R reticulate package 59 | transformers <- reticulate::import("transformers") 60 | 61 | # get a pretrained Tokenizer 62 | transformers$RobertaTokenizer$from_pretrained('roberta-base', do_lower_case=TRUE) 63 | 64 | # Instantiate a Python pipeline (this is now a function that we can pass inputs into) 65 | ## more information on pipelines here: https://huggingface.co/docs/transformers/main_classes/pipelines 66 | ## the text-classification pipeline is for "sentiment-analysis" 67 | classifier <- transformers$pipeline(task = "text-classification") 68 | 69 | ## load the simple fake text data 70 | data <- read.csv("SIMpoliticalTweets.txt", header=FALSE) 71 | names(data) <- "text" 72 | head(data) 73 | 74 | # Generate predictions using the Python pipeline classifier() function 75 | outputs <- classifier(data$text) 76 | outputs 77 | 78 | outputs[[1]]$score 79 | 80 | data.frame(outputs) 81 | 82 | 83 | 84 | # get another pretrained Tokenizer 85 | transformers$RobertaTokenizer$from_pretrained('roberta-base', do_lower_case=TRUE) 86 | 87 | classifier <- transformers$pipeline(task = "text-classification") 88 | 89 | outputs <- classifier(data$text) 90 | outputs 91 | 92 | # get Model with weights 93 | transformers$TFRobertaModel$from_pretrained('roberta-base') 94 | 95 | 96 |
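## For example, a minimal sketch collecting the pipeline output into a tidy data.frame (using the outputs object above):
sentiment <- data.frame(text = data$text,
                        label = sapply(outputs, function(o) o$label),
                        score = sapply(outputs, function(o) o$score))
head(sentiment)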
97 | ## try some other pre-trained tasks with the zero-shot-classification pipeline 98 | ## https://huggingface.co/docs/transformers/v4.21.1/en/main_classes/pipelines#transformers.ZeroShotClassificationPipeline 99 | classifier <- transformers$pipeline(task = "zero-shot-classification") 100 | 101 | outputs <- classifier(data$text, c("love", "obama")) 102 | outputs 103 | 104 | 105 | ## another classifier example, this time translating an English term into French 106 | 107 | 108 | transformers$TFAutoModel$from_pretrained("t5-base") ## assumption: t5-base is not a RoBERTa model, so the generic TFAutoModel class is used here 109 | en_fr_translator <- transformers$pipeline(task = "translation_en_to_fr") 110 | en_fr_translator("human rights") 111 | 112 | en_de_translator <- transformers$pipeline(task = "translation_en_to_de") 113 | en_de_translator("human rights") 114 | 115 | 116 | ## another classifier example 117 | token_classifier <- transformers$pipeline(task = "token-classification") 118 | 119 | token_classifier("Christopher Fariss teaches at the University of Michigan") ## token classification (e.g., named entity recognition) needs an input string 120 | -------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day09_Model_Evaluation_Review/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/Archive/R-Programs-lessons-2022/Day09_Model_Evaluation_Review/.DS_Store -------------------------------------------------------------------------------- /Datasets/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/Datasets/.DS_Store -------------------------------------------------------------------------------- /Datasets/SIMpoliticalTweets.txt: -------------------------------------------------------------------------------- 1 | these are fake
tweets about obama 2 | i love obama 3 | i love obama 4 | i hate obama 5 | i hate pea soup 6 | i love oatmeal 7 | obama is the president 8 | obama has a cool job 9 | love and hate 10 | blah blah blah 11 | blah 12 | -------------------------------------------------------------------------------- /Datasets/ny_stop_frisk.csv: -------------------------------------------------------------------------------- 1 | year,race,total 2012,black,284229 2012,latino,165140 2012,white,50366 -------------------------------------------------------------------------------- /Datasets/ny_stop_frisk_black.csv: -------------------------------------------------------------------------------- 1 | year, total 2 | 2003, 77704 3 | 2004,155033 4 | 2005,196570 5 | 2006,267468 6 | 2007,243766 7 | 2008,275588 8 | 2009,310611 9 | 2010,315611 10 | 2011,350743 11 | 2012,284229 -------------------------------------------------------------------------------- /Datasets/tweet_data.Rdata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/Datasets/tweet_data.Rdata -------------------------------------------------------------------------------- /Datasets/tweet_data.csv: -------------------------------------------------------------------------------- 1 | "","obama","love","hate" 2 | "1",1,0,0 3 | "2",1,1,0 4 | "3",1,1,0 5 | "4",1,0,1 6 | "5",0,0,1 7 | "6",0,1,0 8 | "7",1,0,0 9 | "8",1,0,0 10 | "9",0,1,1 11 | "10",0,0,0 12 | "11",0,0,0 13 | -------------------------------------------------------------------------------- /Datasets/users-by-social-media-platform.csv: -------------------------------------------------------------------------------- 1 | Entity,Year,monthly_active_users 2 | Facebook,2008,100000000 3 | Facebook,2009,276000000 4 | Facebook,2010,517750000 5 | Facebook,2011,766000000 6 | Facebook,2012,979750000 7 | Facebook,2013,1170500000 8 | Facebook,2014,1334000000 9 | Facebook,2015,1516750000 10 | Facebook,2016,1753500000 11 | Facebook,2017,2035750000 12 | Facebook,2018,2255250000 13 | Facebook,2019,2375000000 14 | Flickr,2004,3675135 15 | Flickr,2005,7399354 16 | Flickr,2006,14949270 17 | Flickr,2007,29299875 18 | Flickr,2008,30000000 19 | Flickr,2009,41834525 20 | Flickr,2010,54708063 21 | Flickr,2011,66954600 22 | Flickr,2012,79664888 23 | Flickr,2013,80000000 24 | Friendster,2002,3000000 25 | Friendster,2003,4470000 26 | Friendster,2004,5970054 27 | Friendster,2005,7459742 28 | Friendster,2006,8989854 29 | Friendster,2007,24253200 30 | Friendster,2008,51008911 31 | Friendster,2009,28804331 32 | Google Buzz,2010,166029650 33 | Google Buzz,2011,170000000 34 | Google Buzz,2012,170000000 35 | Google Buzz,2013,170000000 36 | Google Buzz,2014,170000000 37 | Google Buzz,2015,170000000 38 | Google+,2012,107319100 39 | Google+,2013,205654700 40 | Google+,2014,254859015 41 | Google+,2015,298950015 42 | Google+,2016,398648000 43 | Google+,2017,495657000 44 | Google+,2018,430000000 45 | Hi5,2005,9731610 46 | Hi5,2006,19932360 47 | Hi5,2007,29533250 48 | Hi5,2008,55045618 49 | Hi5,2009,57893524 50 | Hi5,2010,59953290 51 | Hi5,2011,46610848 52 | Instagram,2013,117500000 53 | Instagram,2014,250000000 54 | Instagram,2015,400000000 55 | Instagram,2016,550000000 56 | Instagram,2017,750000000 57 | Instagram,2018,1000000000 58 | MySpace,2004,980036 59 | MySpace,2005,19490059 60 | MySpace,2006,54763260 61 | MySpace,2007,69299875 62 | MySpace,2008,72408233 63 | 
MySpace,2009,70133095 64 | MySpace,2010,68046710 65 | MySpace,2011,46003536 66 | Orkut,2004,4900180 67 | Orkut,2005,9865805 68 | Orkut,2006,14966180 69 | Orkut,2007,26916562 70 | Orkut,2008,44357628 71 | Orkut,2009,47366905 72 | Orkut,2010,49941613 73 | Orkut,2011,47609080 74 | Orkut,2012,45067022 75 | Pinterest,2016,143250000 76 | Pinterest,2017,195000000 77 | Pinterest,2018,246500000 78 | Pinterest,2019,291000000 79 | Reddit,2006,248309 80 | Reddit,2007,488331 81 | Reddit,2008,1944940 82 | Reddit,2009,3893524 83 | Reddit,2014,135786956 84 | Reddit,2015,163346676 85 | Reddit,2016,238972480 86 | Reddit,2017,297394200 87 | Reddit,2018,355000000 88 | Snapchat,2016,238648000 89 | TikTok,2017,239142500 90 | TikTok,2018,500000000 91 | Tumblr,2012,146890156 92 | Tumblr,2013,293482050 93 | Tumblr,2014,388721163 94 | Tumblr,2015,475923363 95 | Tumblr,2016,565796720 96 | Tumblr,2017,593783960 97 | Tumblr,2018,624000000 98 | Twitter,2010,43250000 99 | Twitter,2011,92750000 100 | Twitter,2012,160250000 101 | Twitter,2013,223675000 102 | Twitter,2014,223675000 103 | Twitter,2015,304500000 104 | Twitter,2016,314500000 105 | Twitter,2017,328250000 106 | Twitter,2018,329500000 107 | Twitter,2019,330000000 108 | WeChat,2011,47818400 109 | WeChat,2012,118123370 110 | WeChat,2013,196523760 111 | WeChat,2014,444232415 112 | WeChat,2015,660843407 113 | WeChat,2016,847512320 114 | WeChat,2017,921742750 115 | WeChat,2018,1000000000 116 | Weibo,2010,19532900 117 | Weibo,2011,48691040 118 | Weibo,2012,79195730 119 | Weibo,2013,118261880 120 | Weibo,2014,154890345 121 | Weibo,2015,208716685 122 | Weibo,2016,281026560 123 | Weibo,2017,357569030 124 | Weibo,2018,431000000 125 | Whatsapp,2013,300000000 126 | Whatsapp,2014,498750000 127 | Whatsapp,2015,800000000 128 | Whatsapp,2016,1000000000 129 | Whatsapp,2017,1333333333 130 | YouTube,2005,1946322 131 | YouTube,2006,19878248 132 | YouTube,2007,143932250 133 | YouTube,2008,294493950 134 | YouTube,2009,413611440 135 | YouTube,2010,480551990 136 | YouTube,2011,642669824 137 | YouTube,2012,844638200 138 | YouTube,2013,1065223075 139 | YouTube,2014,1249451725 140 | YouTube,2015,1328133360 141 | YouTube,2016,1399053600 142 | YouTube,2017,1495657000 143 | YouTube,2018,1900000000 -------------------------------------------------------------------------------- /R-Program-Lessons/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/R-Program-Lessons/.DS_Store -------------------------------------------------------------------------------- /R-Program-Lessons/Day01_Intro_programs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/R-Program-Lessons/Day01_Intro_programs/.DS_Store -------------------------------------------------------------------------------- /R-Program-Lessons/Day01_Intro_programs/R_Demo_Intro_Program_Challenge_two_sum.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_Intro_Program_Challenge_two_sum.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (3B) 5 | ## University of Essex Summer School 2023 
6 | ## 7 | ## Date: 2023-08-07 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Instructions: 14 | ## 15 | ## These challenges are meant to be just that, challenging. 16 | ## They should also be fun. I encourage you to think creatively and collaboratively. 17 | ## Getting stuck or not finishing all the steps is expected and encouraged. 18 | ## This is how learning works! 19 | ## Learn to program, program to learn. 20 | ## 21 | ## Always start with step (1) and then continue to each step as time permits. 22 | ## Don't worry about completing each step. Document your code for each step. 23 | ## You may wish to come back to some of the harder steps as you progress through the course. 24 | ## Note that some of the steps may ask you to use skills we have not yet covered in the course. 25 | ## Don't worry about these steps now but definitely think through the programming logic if you are stuck and make plans to come back to try them once you feel ready. 26 | ## 27 | ########################################################################## 28 | ## 29 | ## Steps for the Challenge 30 | ## 31 | ## (1) create a vector of integers and a scalar integer target 32 | ## (2) write a program in R that determines (returns) two numbers from the vector that add up to the target scalar 33 | ## (3) how many combinations of numbers in the vector of integers add up to the target scalar? 34 | ## (4) write a function to complete steps 1-3 (hint: wrap the program from (3) within a function) 35 | ## (5) write a simulation that explores the relationship between (a) the vector of integers and (b) the scalar integer target sum 36 | ## (6) re-write the program or function so that it takes fewer steps to calculate the number of numeric combinations that add up to the scalar target 37 | ## 38 | ########################################################################## 39 | 40 | ## (1) create a vector of integers and a scalar integer target 41 | x <- c(1,2,3,4,5) 42 | y <- 5 43 | 44 | ## (2) write a program in R that determines (returns) two numbers from the vector that add up to the target scalar 45 | test <- x + x[1] 46 | test == y 47 | 48 | x[test==y] 49 | 50 | value <- list() 51 | for(i in 1:length(x)){ 52 | test <- x + x[i] 53 | partner <- x[test==y & seq_along(x) > i] ## keep only matches after position i, which avoids self-pairs and duplicate pairs 54 | if(length(partner) > 0){ value[[length(value)+1]] <- c(x[i], partner); print(c(x[i], partner)) } 55 | } 56 | value ## each list element is a pair of numbers that adds up to the target 57 | 58 | 59 | add <- function(a,b){a+b} 60 | add(1,2) -------------------------------------------------------------------------------- /R-Program-Lessons/Day02_Data_Analysis_Managment/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/R-Program-Lessons/Day02_Data_Analysis_Managment/.DS_Store -------------------------------------------------------------------------------- /R-Program-Lessons/Day02_Data_Analysis_Managment/R_Demo_Intro_forloop_vs_lapply.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_Intro_forloop_vs_lapply.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (3B) 5 | ## University of Essex Summer School 2023 6 | ## 7 |
## Date: 2023-08-08 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | ## 15 | ## in-class coding of loops and lapply 16 | 17 | 1:10 18 | 19 | length(1:10) 20 | 21 | 22 | ## for loop version 23 | output <- c() 24 | for(i in 1:10){ 25 | 26 | local_var <- i*i ## the simple but core calculation of the iterative process 27 | output[i] <- local_var 28 | 29 | } 30 | output 31 | 32 | 33 | ## lapply version 34 | output <- lapply(1:10, function(i){ 35 | 36 | local_var <- i*i ## the simple but core calculation of the iterative process 37 | return(local_var) 38 | }) 39 | unlist(output) 40 | 41 | 42 | 43 | ## nested for loop version 44 | mat <- matrix(NA, nrow=10, ncol=5) 45 | mat 46 | 47 | for(i in 1:10){ 48 | for(j in 1:5){ 49 | local_var <- i*j 50 | mat[i,j] <- local_var 51 | } 52 | } 53 | mat 54 | 55 | ## lapply version with an inner for loop (replicates the nested for loop structure above) 56 | mat_list <- lapply(1:10, function(i){ 57 | 58 | local_vec <- c() 59 | for(j in 1:5){ 60 | local_var <- i*j 61 | local_vec[j] <- local_var 62 | } 63 | 64 | return(local_vec) 65 | }) 66 | 67 | unlist(mat_list) 68 | mat <- matrix(unlist(mat_list), nrow=10, ncol=5, byrow=T) 69 | mat 70 | 71 | 72 | ## nested lapply version (replicates both versions above) 73 | mat_list <- lapply(1:10, function(i){ 74 | local_out <- lapply(1:5, function(j){ 75 | temp <- i*j 76 | return(temp) 77 | }) 78 | return(local_out) 79 | }) 80 | 81 | mat_list 82 | unlist(mat_list) 83 | mat <- matrix(unlist(mat_list), nrow=10, ncol=5, byrow=T) 84 | mat 85 | 86 | -------------------------------------------------------------------------------- /R-Program-Lessons/Day03_Simulation_Learning/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/R-Program-Lessons/Day03_Simulation_Learning/.DS_Store -------------------------------------------------------------------------------- /R-Program-Lessons/Day03_Simulation_Learning/R_Demo_Simulation_Learning_Dice_Rolls.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_Simulation_Learning_Dice_Rolls.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (3B) 5 | ## University of Essex Summer School 2023 6 | ## 7 | ## Date: 2023-08-08 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ## 13 | ########################################################################## 14 | ## For this R tutorial, we will learn how to: 15 | ## 16 | ## (1) Simulate the roll of a D6 die 17 | ## (2) Repeatedly simulate the roll of a D6 die and see how close the average value is to the expected value. 18 | ## (3) Learn about the central limit theorem from the simulation.
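## A quick base-R illustration of (3) (a hedged aside, not part of the original file):
## the histogram of 2,000 sample means of 10 rolls each is already roughly bell-shaped,
## even though individual rolls are uniform on 1:6.
## hist(replicate(2000, mean(sample(1:6, size=10, replace=TRUE))))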
19 | ## 20 | ## Notes: The Central Limit Theorem (CLT) establishes that when independently generated variables (iid: independent and identically distributed random variables) are added together, the sums or averages of these variables (when normalized) converge towards a normal distribution. 21 | ## 22 | ## This property emerges even if the original variables are not individually normally distributed, as with the roll of a die. 23 | ## 24 | ## The probability of any value from a single roll of a die is equivalent to the probability of any other value for the same-sided die in the limit (when the number of rolls approaches infinity). 25 | ## 26 | ########################################################################## 27 | 28 | 29 | library(MASS) 30 | 31 | 32 | ## simulate 20 randomly generated rolls from a D6 (6-sided die) 33 | sample(1:6, size=20, replace=TRUE) 34 | 35 | ## true mean is 3.5 36 | (1 + 2 + 3 + 4 + 5 + 6) / 6 37 | 38 | ## or 39 | mean(1:6) 40 | 41 | ## true variance is approximately 2.916667 or exactly 70/24 42 | (1 - 3.5)^2 * (1/6) + (2 - 3.5)^2 * (1/6) + (3 - 3.5)^2 * (1/6) + (4 - 3.5)^2 * (1/6) + (5 - 3.5)^2 * (1/6) + (6 - 3.5)^2 * (1/6) 43 | 44 | ## or 45 | sum((1:6 - mean(1:6))^2 * (1/6)) 46 | 47 | ## repeat the simulation n_sims times (2,000 below) and calculate the average 48 | n_sims <- 2000 49 | 50 | ## number of samples to roll each iteration 51 | n_samples <- 10 52 | 53 | ## create two objects to hold the calculated mean and variance from each simulated sample 54 | sim_mean_values <- c() 55 | sim_var_values <- c() 56 | 57 | ## iterate/repeat the simulation n_sims times 58 | for(i in 1:n_sims){ 59 | 60 | ## create output 61 | sample_output <- sample(1:6, size=n_samples, replace=TRUE) 62 | 63 | ## save the output in the i_th position of the objects 64 | sim_mean_values[i] <- mean(sample_output) 65 | sim_var_values[i] <- var(sample_output) 66 | 67 | } 68 | 69 | ## calculate the mean and variance of the n_sims sample means 70 | mean(sim_mean_values) 71 | mean(sim_var_values) 72 | 73 | ## set graphical parameters 74 | par(mfrow=c(1,2), mar=c(4,3,1,1)) 75 | 76 | ## plot histograms 77 | truehist(sim_mean_values, main="Mean Estimate") 78 | truehist(sim_var_values, main="Variance Estimate") 79 | 80 | 81 | ## calculate and plot the converging average using increasing sample sizes starting at 1 and ending at all the samples 82 | ## set graphical parameters 83 | par(mfrow=c(1,2)) 84 | 85 | ## 86 | plot(0,0, ylim=c(2.5,4), xlim=c(0,n_sims), type="n", main="Mean Estimate") 87 | value <- c() 88 | for(i in 1:n_sims){ 89 | value[i] <- mean(sim_mean_values[1:i]) 90 | } 91 | lines(value) 92 | abline(h=3.5, col="orange", lwd=2, lty=2) 93 | 94 | ## 95 | plot(0,0, ylim=c(2.5,4), xlim=c(0,n_sims), type="n", main="Variance Estimate") 96 | value <- c() 97 | for(i in 1:n_sims){ 98 | value[i] <- mean(sim_var_values[1:i]) 99 | } 100 | lines(value) 101 | abline(h=2.916667, col="orange", lwd=2, lty=2) 102 | 103 | 104 | sqrt(2.916667) ## the true standard deviation 105 | 106 | ## the values converge towards normality 107 | summary((sim_mean_values - 3.5)) 108 | mean((sim_mean_values - 3.5)) 109 | var((sim_mean_values - 3.5)) 110 | 111 | 112 | 113 | var(sim_mean_values) 114 | var(sim_var_values) 115 | 116 | 117 | 118 | -------------------------------------------------------------------------------- /R-Program-Lessons/Day03_Simulation_Learning/R_Demo_Simulation_Learning_ProbCalc_BayesRule.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_Simulation_Learning_ProbCalc_BayesRule.R 2 |
########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (3B) 5 | ## University of Essex Summer School 2023 6 | ## 7 | ## Date: 2023-08-08 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | ## 15 | ## Overview of Bayes rule and the calculation of conditional probability. Introduces the naive Bayes classifier function from the e1071 library. 16 | ## 17 | ## P(outcome | evidence) = P(outcome) * P(evidence | outcome) / P(evidence) 18 | ## 19 | ## Below is Bayes’ Theorem: 20 | ## P(A | B) = P(A) * P(B | A) / P(B) 21 | ## 22 | ## Which can be derived from the general multiplication formula for AND events: 23 | ## P(A and B) = P(A) * P(B | A) 24 | ## P(B | A) = P(A and B) / P(A) 25 | ## P(B | A) = P(B) * P(A | B) / P(A) 26 | ## P(y|x) = P(x|y) * P(y) / P(x) 27 | ## P(x|y) = P(x AND y) / P(y) 28 | ## 29 | ## Pr(A[1]) = Pr(y==0) 30 | ## Pr(A[2]) = Pr(y==1) 31 | ## Pr(B | A[1]) = Pr(Data | y==0) 32 | ## Pr(B | A[2]) = Pr(Data | y==1) 33 | ## 34 | ########################################################################## 35 | 36 | 37 | A <- c(0,1,0,1) 38 | B <- c(0,0,0,1) 39 | 40 | 41 | ProbCalc <- function(A,B){ 42 | 43 | len <- length(A) 44 | data <- as.data.frame(cbind(A,B, A*B, A+B-A*B)) 45 | names(data) <- c("A", "B", "A and B", "A or B") 46 | 47 | pr.A <- sum(A)/len 48 | pr.B <- sum(B)/len 49 | pr.NotA <- 1 - pr.A 50 | pr.NotB <- 1 - pr.B 51 | pr.A.B <- sum(A*B)/len 52 | pr.NotA.B <- sum(abs(A-1)*B)/len 53 | pr.A.NotB <- sum(A*abs(B-1))/len 54 | pr.NotA.NotB <- sum(abs(A-1)*abs(B-1))/len 55 | 56 | pr.A.or.B <- pr.A + pr.B - pr.A.B 57 | 58 | pr.A.condB <- pr.A.B / pr.B 59 | pr.B.condA <- pr.A.B / pr.A 60 | 61 | out <- list(data=data, pr.A=pr.A, pr.B=pr.B, pr.NotA=pr.NotA, pr.NotB=pr.NotB, pr.A.B=pr.A.B, pr.NotA.B=pr.NotA.B, pr.A.NotB=pr.A.NotB, pr.NotA.NotB=pr.NotA.NotB, pr.A.or.B= pr.A.or.B, pr.A.condB=pr.A.condB, pr.B.condA=pr.B.condA) 62 | return(out) 63 | 64 | } 65 | 66 | ProbCalc(A,B) 67 | 68 | ProbTabCalc <- function(A,B){ 69 | 70 | tab <- table(A,B)/length(A) 71 | 72 | ## index the table by its dimnames ("0"/"1"), not by the data vectors, so the function works for any input length 73 | pr.A <- sum(tab["1",]) 74 | pr.B <- sum(tab[,"1"]) 75 | pr.NotA <- sum(tab["0",]) 76 | pr.NotB <- sum(tab[,"0"]) 77 | pr.A.B <- tab["1","1"] 78 | pr.NotA.B <- tab["0","1"] 79 | pr.A.NotB <- tab["1","0"] 80 | pr.NotA.NotB <- tab["0","0"] 81 | 82 | pr.A.or.B <- pr.A + pr.B - pr.A.B 83 | 84 | pr.A.condB <- pr.A.B / pr.B 85 | pr.B.condA <- pr.A.B / pr.A 86 | 87 | out <- list(table=tab, pr.A=pr.A, pr.B=pr.B, pr.NotA=pr.NotA, pr.NotB=pr.NotB, pr.A.B=pr.A.B, pr.NotA.B=pr.NotA.B, pr.A.NotB=pr.A.NotB, pr.NotA.NotB=pr.NotA.NotB, pr.A.or.B= pr.A.or.B, pr.A.condB=pr.A.condB, pr.B.condA=pr.B.condA) 88 | return(out) 89 | 90 | } 91 | 92 | ProbTabCalc(A,B) 93 | 94 | -------------------------------------------------------------------------------- /R-Program-Lessons/Day04_Simulation_Inference/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/R-Program-Lessons/Day04_Simulation_Inference/.DS_Store
-------------------------------------------------------------------------------- /R-Program-Lessons/Day04_Simulation_Inference/R_Demo_Simulation_Inference_2_Sample_Hold_Out.R: -------------------------------------------------------------------------------- 1 | #### R_Demo_Simulation_Inference_2_Sample_Hold_Out.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (3B) 5 | ## University of Essex Summer School 2023 6 | ## 7 | ## Date: 2023-08-09 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## 14 | ## Goal: Improve the predictive power or predictive validity of a model 15 | ## 16 | ########################################################################## 17 | ## Introduction to tutorial: 18 | ## 19 | ## (1) Begin building towards and learning about cross-validation 20 | ## (NOTE: There is no "crossing" yet) 21 | ## 22 | ## For this R tutorial we will simulate a dataset and then randomly divide it into two subsets. 23 | ## 24 | ## We will fit a model using the observations from one of the subsets of data (training data). 25 | ## 26 | ## We will then use the model estimates to predict the value of the dependent variable for the remaining out-of-sample data subset (testing data). 27 | ## 28 | ########################################################################## 29 | 30 | 31 | #set.seed(940) 32 | 33 | 34 | ## set number of observations for simulation 35 | n <- 100 36 | 37 | ## simulation of variables (this model is based on one of the relationships from Anscombe's quartet) 38 | x <- sample(4:14,n,replace=TRUE) 39 | table(x) 40 | y <- -5.996 + 2.781*x -0.127*x^2 + rnorm(n,0,1) 41 | #y <- -5.996 + 2.781*x -0.127*x^2 + rnorm(n,0,2) 42 | 43 | ## plot the simulated relationship 44 | par(mfrow=c(1,1)) 45 | plot(x=x, y=y) 46 | 47 | ## create a subject/unit ID variable with one value for each unit 48 | ## here the indicator variable takes on 2-fold values {1,2} 49 | folds <- sample(rep(1:2, n/2), size=n, replace=FALSE) 50 | folds 51 | table(folds) 52 | 53 | ## the version below doesn't always yield a 50/50 ratio of 1s and 2s 54 | #folds <- sample(1:2, size=n, replace=TRUE) 55 | #folds 56 | #table(folds) 57 | 58 | ## create a data frame with the dependent variable, independent variable, and randomly created ID 59 | dat <- data.frame(y, x, folds) 60 | 61 | summary(dat) 62 | 63 | head(dat) 64 | 65 | ## fit a linear model to the full dataset 66 | model <- lm(y ~ x, data=dat) 67 | summary(model) 68 | 69 | 70 | ## subset the full dataset into two subsets based on the ID variable 71 | train <- subset(dat, folds==1) 72 | test <- subset(dat, folds==2) 73 | ## the same subsets using bracket indexing 74 | train <- dat[dat$folds==1,] 75 | test <- dat[dat$folds==2,] 76 | 77 | nrow(train) 78 | nrow(test) 79 | 80 | 81 | ## Model 0: fit an intercept-only model 82 | fit <- lm(y ~ 1, data=train) 83 | in_sample_rmse <- sqrt(mean((as.numeric(predict(fit))-train$y)^2)) 84 | in_sample_rmse 85 | 86 | pred <- predict(fit, newdata=test) 87 | rmse <- sqrt(mean((as.numeric(pred)-test$y)^2)) 88 | rmse 89 | 90 | ## Model 1: fit a linear model 91 | fit <- lm(y ~ x, data=train) 92 | in_sample_rmse <- sqrt(mean((as.numeric(predict(fit))-train$y)^2)) 93 | in_sample_rmse 94 | 95 | pred <- predict(fit, newdata=test) 96 | rmse <- sqrt(mean((as.numeric(pred)-test$y)^2)) 97 | rmse 98 |
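## Aside (a hedged sketch, not part of the original lesson): the fit/predict/RMSE
## pattern above is repeated for every model below, so it can be wrapped in a small
## helper function; only lm() models and the test$y outcome column are assumed here.
holdout_rmse <- function(formula, train, test){
    fit <- lm(formula, data=train)
    pred <- predict(fit, newdata=test)
    sqrt(mean((as.numeric(pred) - test$y)^2))
}
## example usage: holdout_rmse(y ~ x + I(x^2), train, test)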
99 | ## this is what the predict() function is doing under the hood for Model 1 100 | y_hat <- fit$coefficients[1] + fit$coefficients[2] * test$x 101 | 102 | ## Model 2: fit a linear model with a squared term 103 | fit <- lm(y ~ x + I(x^2), data=train) 104 | in_sample_rmse <- sqrt(mean((as.numeric(predict(fit))-train$y)^2)) 105 | in_sample_rmse 106 | 107 | pred <- predict(fit, newdata=test) 108 | rmse <- sqrt(mean((as.numeric(pred)-test$y)^2)) 109 | rmse 110 | 111 | 112 | ## Model 3: fit a linear model with a squared term and a cubic term 113 | fit <- lm(y ~ x + I(x^2) + I(x^3), data=train) 114 | in_sample_rmse <- sqrt(mean((as.numeric(predict(fit))-train$y)^2)) 115 | in_sample_rmse 116 | 117 | pred <- predict(fit, newdata=test) 118 | rmse <- sqrt(mean((as.numeric(pred)-test$y)^2)) 119 | rmse 120 | 121 | 122 | ## Model 4: fit a linear model with a squared term and a cubic term and a 4th order term 123 | fit <- lm(y ~ x + I(x^2) + I(x^3) + I(x^4), data=train) 124 | in_sample_rmse <- sqrt(mean((as.numeric(predict(fit))-train$y)^2)) 125 | in_sample_rmse 126 | 127 | pred <- predict(fit, newdata=test) 128 | rmse <- sqrt(mean((as.numeric(pred)-test$y)^2)) 129 | rmse 130 | 131 | 132 | 133 | 134 | -------------------------------------------------------------------------------- /R-Program-Lessons/Day04_Simulation_Inference/R_Demo_Simulation_Inference_naive_Bayes.R: -------------------------------------------------------------------------------- 1 | #### R_Demo_Simulation_Inference_naive_Bayes.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 5 | ## University of Essex Summer School 2023 6 | ## 7 | ## Date: 2023-08-09 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | ## 15 | ## Overview of Bayes rule and the calculation of conditional probability. Introduces the naive Bayes classifier function from the e1071 library.
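## A quick numeric check of the rule stated below (a hedged illustration, not part of
## the original file): if P(A) = 0.25, P(B|A) = 0.8, and P(B) = 0.4, then
## P(A|B) = 0.25 * 0.8 / 0.4 = 0.5
0.25 * 0.8 / 0.4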
16 | ## 17 | ## P(outcome | evidence) = P(outcome) * P(evidence | outcome) / P(evidence) 18 | ## 19 | ## Below is Bayes’ Theorem: 20 | ## P(A | B) = P(A) * P(B | A) / P(B) 21 | ## 22 | ## Which can be derived from the general multiplication formula for AND events: 23 | ## P(A and B) = P(A) * P(B | A) 24 | ## P(B | A) = P(A and B) / P(A) 25 | ## P(B | A) = P(B) * P(A | B) / P(A) 26 | ## P(y|x) = P(x|y) * P(y) / P(x) 27 | ## P(x|y) = P(x AND y) / P(y) 28 | ## 29 | ## Pr(A[1]) = Pr(y==0) 30 | ## Pr(A[2]) = Pr(y==1) 31 | ## Pr(B | A[1]) = Pr(Data | y==0) 32 | ## Pr(B | A[2]) = Pr(Data | y==1) 33 | ## 34 | ########################################################################## 35 | 36 | 37 | ## load libraries 38 | library(e1071) 39 | library(LaplacesDemon) 40 | 41 | 42 | ## example code from BayesTheorem() function 43 | PrA <- c(0.75,0.25) 44 | PrBA <- c(6/9, 5/7) 45 | BayesTheorem(PrA, PrBA) 46 | 47 | 48 | ## create fake data 49 | n <- 10 50 | x <- c(rep(0,n/2), rep(1,n/2)) 51 | y <- c(0,0,0,1,1,0,0,1,1,1) 52 | 53 | ## inspect data 54 | cbind(y,x) 55 | 56 | ## inspect tabulation of data 57 | table(y,x) 58 | 59 | 60 | ## calculate the probability of the evidence/data (the denominator is the same marginal probability Pr(x==1) for both outcome classes) 61 | PrX <- NA 62 | PrX[1] <- sum(as.numeric(x==1)) / n 63 | PrX[2] <- sum(as.numeric(x==1)) / n 64 | 65 | ## calculate the probability of the outcome 66 | PrY <- NA 67 | PrY[1] <- sum(as.numeric(y==0))/n 68 | PrY[2] <- sum(as.numeric(y==1))/n 69 | PrY 70 | 71 | ## calculate the probability of the data conditional on the value of y (the likelihood) 72 | PrXY <- NA 73 | PrXY[1] <- sum(x[y==0])/length(as.numeric(x[y==0])) 74 | PrXY[2] <- sum(x[y==1])/length(as.numeric(x[y==1])) 75 | PrXY 76 | 77 | ## apply Bayes Rule 78 | PrXY * PrY / PrX 79 | 80 | ## apply Bayes Rule with BayesTheorem() function 81 | BayesTheorem(PrA=PrY, PrBA=PrXY) 82 | 83 | 84 | ## apply Bayes Rule with naiveBayes() function 85 | fit <- naiveBayes(y~x, data=data.frame(y,x)) 86 | fit 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /R-Program-Lessons/Day04_Simulation_Inference/R_Demo_Simulation_Inference_naive_Bayes_10_Fold_Cross_Validation.R: -------------------------------------------------------------------------------- 1 | #### R_Demo_Simulation_Inference_naive_Bayes_10_Fold_Cross_Validation.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 5 | ## University of Essex Summer School 2023 6 | ## 7 | ## Date: 2023-08-09 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | ## 15 | ## (1) The program simulates count data that are then used to predict a binary outcome variable. 16 | ## 17 | ## (2) Three models are evaluated using the count data to predict the outcome: 18 | ## (2a) linear model 19 | ## (2b) generalized linear model with a logit link function 20 | ## (2c) naive Bayes classifier.
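## Aside (a hedged sketch, not part of the original file): the balanced fold labels
## used for the 10-fold cross-validation below can be generated and checked with:
## table(sample(rep(1:10, length.out=100)))  ## exactly 10 observations per fold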
21 | ## 22 | ########################################################################## 23 | 24 | 25 | ## load library 26 | library(e1071) 27 | 28 | 29 | ## simulate x1 and set the "true" population values alpha and beta 30 | n <- 100 31 | 32 | ## unobserved 33 | x <- runif(n,0,1) 34 | 35 | ## observed counts 36 | x1 <- rpois(n, lambda=x) 37 | x2 <- rpois(n, lambda=x) 38 | x3 <- rpois(n, lambda=2*x) 39 | x4 <- rpois(n, lambda=2*x) 40 | x5 <- rpois(n, lambda=4*x) 41 | 42 | ## systematic component of the model based on observed counts 43 | xb <- -2 + x1 + x2 + x3 + x4 + x5 44 | 45 | ## transform the linear term xb using 46 | ## the inverse logit function 47 | ## so that pi is bounded between 0 and 1 48 | pi <- 1 / (1 + exp(-xb)) 49 | 50 | ## generate the dependent variable y with probability pi and measurement error from a Bernoulli trial 51 | y <- rbinom(n, size=1, prob=pi) 52 | 53 | 54 | ## make data frame 55 | dat <- data.frame(y, x1, x2, x3, x4, x5) 56 | 57 | 58 | ## summarize fit using linear model 59 | summary(lm(y ~ x1 + x2 + x3 + x4 + x5, data=dat)) 60 | 61 | 62 | ## summarize fit using glm using the logit link function 63 | summary(glm(y ~ x1 + x2 + x3 + x4 + x5, family=binomial(link="logit"), data=dat)) 64 | 65 | 66 | ## summarize fit using naiveBayes model 67 | naiveBayes(as.factor(y) ~ x1 + x2 + x3 + x4 + x5, data=dat) 68 | 69 | 70 | ## create vectors for storing predictions 71 | dat$y.hat1 <- NA 72 | dat$y.hat2 <- NA 73 | dat$y.hat3 <- NA 74 | 75 | ## select number of folds 76 | k <- 10 77 | 78 | ## create vector of folds for cross validation 79 | dat$folds <- sample(rep(1:k, k), n, replace=FALSE) 80 | 81 | ## loop over the k folds to generate out-of-sample predictions from each model 82 | for(i in 1:k){ 83 | 84 | ## fit a linear model 85 | fit1 <- lm(y ~ x1 + x2 + x3 + x4 + x5, data=subset(dat, folds!=i)) 86 | pred1 <- predict(fit1, newdata=subset(dat, folds==i)) 87 | y.hat1 <- as.numeric(pred1) 88 | 89 | dat$y.hat1[dat$folds==i] <- y.hat1 90 | 91 | 92 | ## fit a glm model 93 | fit2 <- glm(y ~ x1 + x2 + x3 + x4 + x5, binomial(link="logit"), data=subset(dat, folds!=i)) 94 | pred2 <- predict(fit2, newdata=subset(dat, folds==i), type="response") ## predicted probabilities rather than log-odds 95 | y.hat2 <- as.numeric(pred2) 96 | 97 | dat$y.hat2[dat$folds==i] <- y.hat2 98 | 99 | 100 | ## fit a naiveBayes classifier model 101 | fit3 <- naiveBayes(as.factor(y) ~ x1 + x2 + x3 + x4 + x5, data=subset(dat, folds!=i)) 102 | pred3 <- predict(fit3, newdata=subset(dat, folds==i)) 103 | y.hat3 <- as.numeric(as.character(pred3)) ## convert the factor prediction back to a numeric 0/1 104 | 105 | dat$y.hat3[dat$folds==i] <- y.hat3 106 | 107 | #print(summary(dat)) 108 | } 109 | 110 | rmse.fit1 <- sqrt(mean((dat$y.hat1-dat$y)^2)) 111 | rmse.fit1 112 | 113 | cor.fit1 <- cor(dat$y.hat1, dat$y, method="spearman") 114 | 115 | rmse.fit2 <- sqrt(mean((dat$y.hat2-dat$y)^2)) 116 | rmse.fit2 117 | 118 | cor.fit2 <- cor(dat$y.hat2, dat$y, method="spearman") 119 | 120 | rmse.fit3 <- sqrt(mean((dat$y.hat3-dat$y)^2)) 121 | rmse.fit3 122 | 123 | cor.fit3 <- cor(dat$y.hat3, dat$y, method="spearman") 124 | 125 | c(cor.fit1, cor.fit2, cor.fit3) 126 | -------------------------------------------------------------------------------- /R-Program-Lessons/Day05_Measurement_RSTAN/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/R-Program-Lessons/Day05_Measurement_RSTAN/.DS_Store --------------------------------------------------------------------------------
/R-Program-Lessons/Day05_Measurement_RSTAN/RSTAN_logistic_regression_simulation.R: -------------------------------------------------------------------------------- 1 | ## RSTAN_logistic_regression_simulation.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (3B) 5 | ## University of Essex Summer School 2023 6 | ## 7 | ## Date: 2023-08-11 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions 10 | ## e-mail: cjf0006@gmail.com 11 | ## 12 | ########################################################################## 13 | ## 14 | ## Introduction to tutorial: 15 | ## 16 | ## For this R tutorial we will simulate a binary dependent variable and then estimate the parameters that generate the variable. These parameters will be estimated based on the likelihood function that links them to the data contained in the y and x variables that are simulated observed data. The model produces the slope and intercept from a standard logistic regression model, which is also estimated using glm() in R. 17 | ## 18 | ########################################################################## 19 | 20 | ## load library 21 | library(rstan) # load rstan library 22 | library(MASS) # load library with truehist function 23 | 24 | ## -------------------------------------------------- ## 25 | ## define STAN model 26 | ## -------------------------------------------------- ## 27 | model <- " 28 | data { 29 | // declared the data in memory 30 | int n; 31 | int y[n]; 32 | vector[n] x; 33 | } 34 | // declared the parameters in memory 35 | parameters { 36 | real alpha; 37 | real beta; 38 | } 39 | model { 40 | // priors (note: the second argument to normal() in Stan is a standard deviation, not a variance or a precision) 41 | //alpha ~ normal(0,10); 42 | //beta ~ normal(0,10); 43 | 44 | // likelihood (link data to some combination of parameters and more data) 45 | y ~ bernoulli_logit(alpha + beta * x); 46 | } 47 | generated quantities { 48 | // posterior predictions 49 | vector[n] y_predict; 50 | 51 | // the loop is necessary within the generated quantities block 52 | for(i in 1:n){ 53 | y_predict[i] = bernoulli_logit_rng(alpha + beta * x[i]); 54 | } 55 | } 56 | 57 | " 58 | ## -------------------------------------------------- ## 59 | 60 | 61 | ## simulate x1 and set the "true" population values alpha and beta 62 | n <- 100 63 | x <- rnorm(n,0,1) 64 | alpha <- 1.25 65 | beta <- 2.50 66 | 67 | ## systematic component of the model 68 | xb <- alpha + beta * x 69 | 70 | ## transform the linear term xb using 71 | ## the inverse logit function 72 | ## so that eta is bounded between 0 and 1 73 | eta <- 1 / (1 + exp(-xb)) 74 | 75 | ## generate the dependent variable y with probability eta and measurement error from a Bernoulli trial 76 | y <- rbinom(n, size=1, prob=eta) 77 | 78 | table(y) 79 | 80 | ## create data list 81 | data_list <- list(y = y, x=x, n=n) 82 | 83 | ## fit logistic regression model with glm() for comparison 84 | summary(glm(y~x, family=binomial(link="logit"))) 85 | 86 | ## fit stan model 87 | fit <- stan(model_code = model, data = data_list, iter = 1000, chains = 4) 88 | 89 | ## extract draws from stan model object 90 | output <- extract(fit, permuted = TRUE) 91 | 92 | ## print names 93 | names(output) 94 | 95 | ## there are a number of methods to subset and summarize parameters 96 | ## keep in mind that the output object is a list that contains vectors or matrices of posterior estimates for each of the named parameters defined in the model statement
above 97 | ## lapply (list-apply) a function to all of the objects in the list 98 | lapply(output, mean) 99 | lapply(output, sd) 100 | 101 | ## tabulate the simulated binary dependent variable; its mean should be very close to the mean value of the predicted y 102 | table(y) 103 | 104 | ## create a matrix using some of the named slots in the list 105 | model_parameters <- as.matrix(fit, pars = c("alpha", "beta")) 106 | model_predictions <- as.matrix(fit, pars = "y_predict") 107 | 108 | ## check the dimensions (they should be the same) 109 | dim(model_predictions) 110 | dim(output$y_predict) 111 | 112 | ## plot the simulated y variable and the estimated posterior means 113 | plot(apply(model_predictions,2,mean), y) 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /R-Program-Lessons/Day05_Measurement_RSTAN/RSTAN_mean_simulation.R: -------------------------------------------------------------------------------- 1 | ## RSTAN_mean_simulation.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (3B) 5 | ## University of Essex Summer School 2023 6 | ## 7 | ## Date: 2023-08-11 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions 10 | ## e-mail: cjf0006@gmail.com 11 | ## 12 | ########################################################################## 13 | ## 14 | ## Introduction to tutorial: 15 | ## 16 | ## For this R tutorial we will use the normal distribution in STAN to estimate the mean value for an observed variable y. The parameter for the mean will be selected based on the likelihood function that links it to the data contained in the y variable.
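## Aside (hedged, not part of the original file): the closed-form benchmarks for the
## data used below are mean(rep(1:5, 200)) = 3 and sd(rep(1:5, 200)) ~= 1.41; note
## that the informative normal(0, 0.1) prior placed on mu in the model block pulls
## the posterior mean of mu part of the way toward 0 (roughly 2.5 rather than 3),
## which is worth comparing against these benchmarks.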
17 | ## 18 | ########################################################################## 19 | 20 | ## load library 21 | library(rstan) # load rstan library 22 | library(MASS) # load library with truehist function 23 | 24 | ## -------------------------------------------------- # 25 | ## define STAN model as a character 26 | ## -------------------------------------------------- # 27 | model <- " 28 | data { 29 | // declared the data in memory 30 | int n; 31 | vector[n] y; 32 | } 33 | parameters { 34 | // declared the parameters in memory 35 | real mu; 36 | real sigma; 37 | } 38 | model { 39 | // sigma has no prior statement; by default its prior is flat unless we provide more information (see the other examples) 40 | // mu is given an informative normal prior below, which pulls its estimate toward 0 41 | // likelihood (link data to some combination of parameters and more data) 42 | 43 | mu ~ normal(0,.1); 44 | 45 | for(i in 1:n){ 46 | y[i] ~ normal(mu, sigma); 47 | } 48 | } 49 | generated quantities { 50 | // posterior predictions 51 | vector[n] y_predict; 52 | 53 | // the loop is necessary within the generated quantities block 54 | for(i in 1:n){ 55 | y_predict[i] = normal_rng(mu, sigma); 56 | } 57 | } 58 | " 59 | ## -------------------------------------------------- # 60 | 61 | 62 | ## set data for simulation 63 | #y <- 1:5 64 | y <- rep(1:5,200) 65 | 66 | n <- length(y) 67 | y 68 | n 69 | 70 | ## create data list 71 | data_list <- list(y = y, n=n) 72 | data_list 73 | 74 | ## set time start variable 75 | time1 <- Sys.time() 76 | 77 | # fit stan model 78 | fit <- stan(model_code = model, data = data_list, iter = 1000, chains = 4) 79 | 80 | ## calculate the duration of the program file up to this point 81 | print(Sys.time() - time1) 82 | 83 | ## extract draws from stan model object (creates a list object) 84 | output <- extract(fit, permuted = TRUE) 85 | 86 | ## print names of each element/slot in the list 87 | names(output) 88 | 89 | ## print model fit object 90 | fit 91 | 92 | ## lapply (list-apply) a function to all of the objects in the list 93 | lapply(output, mean) 94 | lapply(output, sd) 95 | 96 | truehist(output$mu) 97 | 98 | -------------------------------------------------------------------------------- /R-Program-Lessons/Day05_Measurement_RSTAN/RSTAN_normal_distribution_simulation.R: -------------------------------------------------------------------------------- 1 | ## RSTAN_normal_distribution_simulation.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (3B) 5 | ## University of Essex Summer School 2023 6 | ## 7 | ## Date: 2023-08-11 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions 10 | ## e-mail: cjf0006@gmail.com 11 | ## 12 | ########################################################################## 13 | ## 14 | ## Introduction to tutorial: 15 | ## 16 | ## For this R tutorial we will draw random samples from the normal distribution using the STAN program. 17 | ## This program is equivalent to using the rnorm function in R.
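## Aside (hedged, not part of the original file): the base-R analogue for comparison:
## truehist(rnorm(2000, mean=0, sd=1)) should look like the histogram of output$mu
## drawn at the end of this script (4 chains x 500 post-warmup draws = 2000 draws).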
18 | ## 19 | ########################################################################## 20 | 21 | 22 | ## load library 23 | library(rstan) # load rstan library 24 | library(MASS) # load library with truehist function 25 | 26 | 27 | ## -------------------------------------------------- # 28 | ## define STAN model 29 | ## -------------------------------------------------- # 30 | model <- " 31 | 32 | parameters { 33 | real mu; 34 | } 35 | 36 | model { 37 | mu ~ normal(0,1); 38 | } 39 | " 40 | ## -------------------------------------------------- # 41 | 42 | 43 | ## set time start variable 44 | time1 <- Sys.time() 45 | 46 | ## fit stan model 47 | fit <- stan(model_code = model, iter = 1000, chains = 4) 48 | 49 | ## calculate the duration of the program file up to this point 50 | print(Sys.time() - time1) 51 | 52 | ## extract draws from stan model object (creates a list object) 53 | output <- extract(fit, permuted = TRUE) 54 | 55 | ## print names of each element/slot in the list 56 | names(output) 57 | 58 | ## print model fit object 59 | fit 60 | 61 | ## there are a number of methods to subset and summarize parameters 62 | ## keep in mind that the output object is a list that contains vectors or matrices of posterior estimates for each of the named parameters defined in the model statement above 63 | ## lapply (list-apply) a function to all of the objects in the list 64 | lapply(output, mean) 65 | lapply(output, sd) 66 | 67 | length(output$mu) 68 | 69 | ## create a matrix using some of the named slots in the list 70 | model_parameters <- as.matrix(fit, pars = c("mu")) 71 | 72 | dim(model_parameters) 73 | 74 | ## check the dimensions (they should be the same) 75 | length(output$mu) 76 | 77 | ## make a nice plot 78 | par(mfrow=c(1,1)) 79 | truehist(output$mu) 80 | 81 | #plot(fit[[1]]$sim$permutation[[3]]) 82 | -------------------------------------------------------------------------------- /R-Program-Lessons/Day06_Text_as_data_programs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/R-Program-Lessons/Day06_Text_as_data_programs/.DS_Store -------------------------------------------------------------------------------- /R-Program-Lessons/Day06_Text_as_data_programs/R_Demo_text_as_data_DTM_stm_package.R: -------------------------------------------------------------------------------- 1 | #### R_Demo_text_as_data_DTM_stm_package.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (3B) 5 | ## University of Essex Summer School 2023 6 | ## 7 | ## Date: 2023-08-13 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## 14 | ## Introduction to tutorial: 15 | ## 16 | ## This tutorial replicates the processes demonstrated in the R_Demo_text_as_data_DTM.R file. 17 | ## 18 | ## This code uses functions available in the stm package, which builds on the tm package. 19 | ## 20 | ## The code is much faster at processing the text data, though some of the steps are difficult to learn from these functions.
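## Aside (hedged, not part of the original file): the key output below is fit$theta,
## an n_documents x K matrix of topic proportions; each row sums to 1, which is what
## apply(fit$theta, 1, sum) confirms near the end of the script.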
21 | ## 22 | ########################################################################## 23 | 24 | 25 | ## load libraries 26 | library(stm) 27 | library(tm) 28 | library(SnowballC) 29 | 30 | ## load data 31 | data <- read.csv("Datasets/SIMpoliticalTweets.txt", header=FALSE) 32 | data 33 | names(data) <- "text" 34 | data 35 | 36 | #trumptweets <- fromJSON("trump_json_files_20190707.txt") 37 | #data <- trumptweets 38 | 39 | ## 40 | #data <- data[1:1000,] 41 | 42 | 43 | ## preprocess the documents 44 | ## This function uses functions from the tm package (see the tm Demo for more details) 45 | ## stem words and remove stop words 46 | prep <- textProcessor(documents=data$text, meta=data) 47 | 48 | ## list attributes 49 | attributes(prep) 50 | 51 | ## inspect 52 | head(prep$documents) 53 | head(prep$vocab) 54 | head(prep$meta) 55 | prep$docs.removed 56 | 57 | 58 | ## prepDocuments 59 | ## additional processing (removes some terms and documents based on word-frequency thresholds, e.g., words appearing in more than 99% or fewer than 1% of documents) 60 | out <- prepDocuments(prep$documents, prep$vocab, prep$meta) 61 | out 62 | 63 | ## inspect 64 | head(out$documents) 65 | head(out$vocab) 66 | head(out$meta) 67 | 68 | ## fit a structural topic model 69 | fit <- stm(documents=out$documents, vocab=out$vocab, data=out$meta, K=3) 70 | 71 | ## inspect attributes 72 | attributes(fit) 73 | 74 | dim(fit$theta) 75 | 76 | ## display topic probabilities 77 | fit$theta 78 | 79 | apply(fit$theta, 1, sum) 80 | 81 | out$meta 82 | 83 | data.frame(text = out$meta$text, topic1=fit$theta[,1], topic2=fit$theta[,2], topic3=fit$theta[,3]) 84 | -------------------------------------------------------------------------------- /R-Program-Lessons/Day06_Text_as_data_programs/R_Demo_text_as_data_DTM_tm_package.R: -------------------------------------------------------------------------------- 1 | #### R_Demo_text_as_data_DTM_tm_package.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (3B) 5 | ## University of Essex Summer School 2023 6 | ## 7 | ## Date: 2023-08-13 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## 14 | ## Introduction to tutorial: 15 | ## 16 | ## This tutorial replicates the processes demonstrated in the R_Demo_text_as_data_DTM.R file. 17 | ## 18 | ## This code uses functions available in the tm package. 19 | ## 20 | ## The code is much faster at processing the text data, though some of the steps are difficult to learn from these functions.
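## Aside (a hedged sketch, not part of the original file): the pipeline below is
## Corpus() -> a sequence of tm_map() cleaning steps -> DocumentTermMatrix(), where
## each tm_map() call applies one transformation to every document, e.g.:
## inspect(tm_map(Corpus(VectorSource(c("Some TEXT!", "More text."))), removePunctuation))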
21 | ## 22 | ########################################################################## 23 | 24 | ## load libraries 25 | library(stm) 26 | library(tm) 27 | library(SnowballC) 28 | 29 | ## load data 30 | data <- read.csv("Datasets/SIMpoliticalTweets.txt", header=FALSE) 31 | data 32 | names(data) <- "text" 33 | data 34 | 35 | #trumptweets <- fromJSON("trump_json_files_20190707.txt") 36 | #data <- trumptweets 37 | 38 | ## 39 | ##data <- data[1:1000,] 40 | 41 | ## create character vector for processing 42 | newtext <- as.character(data$text) 43 | length(newtext) 44 | 45 | 46 | ## use gsub to remove special characters that usually cause errors if left for later 47 | newtext <- gsub("[^0-9A-Za-z///' ]", "", newtext) 48 | newtext <- gsub("[^[:alnum:]///' ]", "", newtext) 49 | newtext <- gsub("[^\x20-\x7F\x0D\x0A]", "", newtext) # remove all non-ascii characters 50 | newtext <- gsub("http.*", "", newtext) # replace all of the urls 51 | newtext <- gsub("www.*", "", newtext) # 52 | 53 | ## data$newtext 54 | data$newtext <- newtext 55 | 56 | ## convert to corpus object using additional functions from the tm package 57 | ## the tm_map function takes as its first argument the vector of text and a function as its second argument 58 | corpus <- Corpus(VectorSource(newtext)) 59 | corpus <- tm_map(corpus, removePunctuation) 60 | corpus <- tm_map(corpus, removeNumbers) 61 | corpus <- tm_map(corpus, stripWhitespace) 62 | corpus <- tm_map(corpus, content_transformer(tolower)) ## content_transformer() keeps the corpus structure intact 63 | corpus <- tm_map(corpus, removeWords, stopwords("english")) 64 | corpus <- tm_map(corpus, stemDocument) 65 | 66 | ## print to screen 67 | inspect(corpus[1:11]) 68 | 69 | 70 | ## make document by term matrix 71 | DTM <- DocumentTermMatrix(corpus) 72 | DTM 73 | 74 | 75 | ## print DTM to screen 76 | inspect(DTM) 77 | inspect(DTM[1:11,1:12]) 78 | -------------------------------------------------------------------------------- /R-Program-Lessons/Day06_Text_as_data_programs/R_Demo_text_as_wikip_stm.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_text_as_wikip_stm.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (3B) 5 | ## University of Essex Summer School 2023 6 | ## 7 | ## Date: 2023-08-13 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | ## 15 | ## This tutorial demonstrates how the Structural Topic Model functions for a real set of documents. 16 | ## 17 | ## Each unit in the dataset represents a randomly selected sentence from wikipedia.
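## Aside (a hedged sketch, not part of the original file): the corpus read below is
## large (apparently one million sentences, per the file name), so for a quick test
## run you can read just the first chunk:
## wikip_word_dat <- readLines("one_meelyun_sentences.txt", n=10000)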
18 | ## 19 | ########################################################################## 20 | 21 | library(stm) 22 | 23 | wikip_word_dat <- readLines("one_meelyun_sentences.txt") 24 | head(wikip_word_dat) 25 | 26 | n <- length(wikip_word_dat) 27 | n 28 | 29 | wikip_word_dat <- data.frame(id=1:n, text=wikip_word_dat) 30 | head(wikip_word_dat) 31 | 32 | 33 | wikip_word_dat$human_rights_test <- as.numeric(grepl("human rights", wikip_word_dat$text)) 34 | table(wikip_word_dat$human_rights_test) 35 | 36 | wikip_word_dat$civil_rights_test <- as.numeric(grepl("civil rights", wikip_word_dat$text)) 37 | table(wikip_word_dat$civil_rights_test) 38 | 39 | ## preprocess the documents 40 | ## This function uses functions from the tm package (see the tm Demo for more details) 41 | prep <- textProcessor(documents=wikip_word_dat$text, meta=wikip_word_dat) 42 | 43 | save(prep, file="wikip_word_dat_stm_prep.Rdata") ## save the processed prep object (the out object is not created until below) 44 | 45 | ## inspect 46 | head(prep$documents) 47 | head(prep$vocab) 48 | head(prep$meta) 49 | 50 | 51 | ## prepDocuments 52 | ## filter the vocabulary (stemming and stop-word removal already happened inside textProcessor above) 53 | out <- prepDocuments(prep$documents, prep$vocab, prep$meta) 54 | 55 | save(out, file="wikip_word_dat_stm_out.Rdata") 56 | 57 | ## inspect 58 | head(out$documents) 59 | head(out$vocab) 60 | head(out$meta) 61 | 62 | ## fit a structural topic model 63 | #fit <- stm(documents=out$documents, vocab=out$vocab, data=out$meta, K=20) 64 | #fit <- stm(documents=out$documents, vocab=out$vocab, data=out$meta, K=40) 65 | fit <- stm(documents=out$documents, vocab=out$vocab, data=out$meta, K=100) 66 | 67 | #fit_2 <- fit 68 | #fit_3 <- fit 69 | #fit_10 <- fit 70 | #fit_20 <- fit 71 | fit_100 <- fit 72 | 73 | ## display topic probabilities 74 | head(fit$theta) 75 | 76 | dim(fit$theta) 77 | summary(head(fit$theta, 1000)) 78 | 79 | #save(fit, file="wikip_word_dat_stm20.Rdata") 80 | #save(fit, file="wikip_word_dat_stm40.Rdata") 81 | save(fit, file="wikip_word_dat_stm100.Rdata") 82 | 83 | 84 | -------------------------------------------------------------------------------- /R-Program-Lessons/Day06_Text_as_data_programs/R_twitter_examples/R_Demo_text_as_data_twitteR_get_twitter_users.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_text_as_data_twitteR_get_twitter_users.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 5 | ## University of Essex Summer School 2022 6 | ## 7 | ## Date: 2022-08-08 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | ## 15 | ## setup to access the twitter API using R 16 | ## 17 | ## Create a Twitter application at http://dev.twitter.com. Make sure to give the app read, write and direct message authority. 18 | ## Take note of the following values from the Twitter app page: "API key", "API secret", "Access token", and "Access token secret".
19 | 20 | ########################################################################## 21 | 22 | #install.packages("twitteR") 23 | library(twitteR) 24 | 25 | 26 | ## (1a) Go to twitter's developer page: https://developer.twitter.com/ 27 | ## (1b) Click to create an App 28 | ## (1c) For the description, you can say something like this "learn to use twitteR API for a college data science course at the University of Michigan." 29 | ## 30 | ## (2) Once you have successfully set up your app, you will be able to get the four strings that you will fill in below 31 | ## 32 | ## Take note of the following values from the Twitter app page: "API key", "API secret", "Access token", and "Access token secret". 33 | ## 34 | ## set keys and tokens to access the twitter API 35 | consumer_key <- "your_consumer_key" 36 | consumer_secret <- "your_consumer_secret" 37 | access_token <- "your_access_token" 38 | access_secret <- "your_access_secret" 39 | 40 | setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret) 41 | 42 | ## (3) Run the following code to see if you successfully set up API access 43 | amnestyusa_tweets <- userTimeline('amnestyusa', n=100) 44 | ## 45 | ## (4) Pick a Twitter account from your country of interest (or you can use the amnestyusa account from above) 46 | ## 47 | ## (5) Make a DTM (Document-by-Term matrix) for the first 100 tweets from the account you have selected or the amnestyusa account 48 | ## 49 | ## (6a) Which words are most commonly used in the corpus of 100 tweets? 50 | ## (6b) Which words are the least commonly used in the corpus of 100 tweets? 51 | ## (6c) Create a barplot of the most frequent words 52 | 53 | 54 | ########################################################################## 55 | ## additional examples: 56 | ## 57 | ## grab the most recent 100 tweets from Barack Obama 58 | Obama_ut <- userTimeline('barackobama', n=100) 59 | 60 | ## grab the max number of tweets for the President of Senegal Macky Sall 61 | Macky_Sall_ut <- userTimeline('Macky_Sall', n=3200) 62 | -------------------------------------------------------------------------------- /R-Program-Lessons/Day06_Text_as_data_programs/R_twitter_examples/R_Demo_text_as_data_twitteR_get_twitter_users_Oauth_v2.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_text_as_data_twitteR_get_twitter_users_Oauth_v2.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 5 | ## University of Essex Summer School 2022 6 | ## 7 | ## Date: 2022-08-08 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | ## 15 | ## setup to access the twitter API using R 16 | ## 17 | ## Create a Twitter application at http://dev.twitter.com. Make sure to give the app read, write and direct message authority. 18 | ## Take note of the following values from the Twitter app page: "API key", "API secret", "Access token", and "Access token secret".
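## Aside (a hedged sketch, not part of the original exercise solutions, for steps
## (5)-(6) below): once a document-term matrix DTM exists, term frequencies and a
## barplot of the most common words can be computed with the usual tm idiom:
## freq <- sort(colSums(as.matrix(DTM)), decreasing=TRUE)
## barplot(head(freq, 10), las=2)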
19 | 20 | ########################################################################## 21 | 22 | #install.packages("twitteR") 23 | library(twitteR) 24 | 25 | 26 | ## (1a) Go to twitter's developer page: https://developer.twitter.com/ 27 | ## SEE THIS LINK: https://developer.twitter.com/en/docs/tutorials/getting-started-with-r-and-v2-of-the-twitter-api 28 | ## (1b) Click to create an App 29 | ## (1c) For the description, you can say something like this "learn to use twitteR API for a college data science course at the University of Michigan." 30 | ## 31 | ## (2) Once you have successfully set up your app, you will be able to get the four strings that you will fill in below 32 | ## 33 | ## Take note of the following values from the Twitter app page: "API key", "API secret", "Access token", and "Access token secret". 34 | ## 35 | ## set keys and tokens to access the twitter API 36 | consumer_key <- "your_consumer_key" 37 | consumer_secret <- "your_consumer_secret" 38 | access_token <- "your_access_token" 39 | access_secret <- "your_access_secret" 40 | 41 | setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret) 42 | 43 | ## (3) Run the following code to see if you successfully set up API access 44 | amnestyusa_tweets <- userTimeline('amnestyusa', n=100) 45 | ## 46 | ## (4) Pick a Twitter account from your country of interest (or you can use the amnestyusa account from above) 47 | ## 48 | ## (5) Make a DTM (Document-by-Term matrix) for the first 100 tweets from the account you have selected or the amnestyusa account 49 | ## 50 | ## (6a) Which words are most commonly used in the corpus of 100 tweets? 51 | ## (6b) Which words are the least commonly used in the corpus of 100 tweets? 52 | ## (6c) Create a barplot of the most frequent words 53 | 54 | 55 | ########################################################################## 56 | ## additional examples: 57 | ## 58 | ## grab the most recent 100 tweets from Barack Obama 59 | Obama_ut <- userTimeline('barackobama', n=100) 60 | 61 | ## grab the max number of tweets for the President of Senegal Macky Sall 62 | Macky_Sall_ut <- userTimeline('Macky_Sall', n=3200) 63 | -------------------------------------------------------------------------------- /R-Program-Lessons/Day06_Text_as_data_programs/groundhog_library_func.R: -------------------------------------------------------------------------------- 1 | 2 | groundhog_library_func <- function(groundhog=FALSE, regular_install=FALSE){ 3 | ## Do this (set to TRUE) to load libraries using the version from when the scripts were originally run 4 | if(groundhog){ 5 | ## load an older version of the libraries 6 | remotes::install_github('CredibilityLab/groundhog') 7 | library(groundhog) 8 | pkgs <- c("gtrendsR", "countrycode", "stm", "tm", "MASS", "bcp", "ngramr", "rvest", "plm", "lmtest", "WDI", "boot", "forecast", "acled.api", "ggplot2", "stargazer", "httr", "lubridate", "xtable") 9 | groundhog.library(pkgs,'2022-05-23') 10 | } else if(regular_install==TRUE){ 11 | ## or install and load the more recent version of the libraries 12 | install.packages(c("gtrendsR", "countrycode", "stm", "tm", "MASS", "bcp", "ngramr", "rvest", "plm", "lmtest", "WDI", "boot", "forecast", "acled.api", "ggplot2", "stargazer", "httr", "lubridate", "xtable")) ## install.packages() expects a single character vector of package names 13 | library(gtrendsR) 14 | library(countrycode) 15 | library(stm) 16 | library(tm) 17 | library(MASS) 18 | library(bcp) 19 | library(ngramr) 20 | library(rvest) 21 | library(plm) 22 | library(lmtest) 23 | library(WDI) 24 | library(boot) 25 | library(forecast) 26
| library(acled.api) 27 | library(ggplot2) 28 | library(stargazer) 29 | library(httr) 30 | library(lubridate) 31 | library(xtable) 32 | } else{ 33 | ## or just load the more recent version of the libraries 34 | library(gtrendsR) 35 | library(countrycode) 36 | library(stm) 37 | library(tm) 38 | library(MASS) 39 | library(bcp) 40 | library(ngramr) 41 | library(rvest) 42 | library(plm) 43 | library(lmtest) 44 | library(WDI) 45 | library(boot) 46 | library(forecast) 47 | library(acled.api) 48 | library(ggplot2) 49 | library(stargazer) 50 | library(httr) 51 | library(lubridate) 52 | library(xtable) 53 | } 54 | } 55 | 56 | -------------------------------------------------------------------------------- /R-Program-Lessons/Day07_Applied_Machine_Learning_AML_Intro/R_Demo_AML_MNIST_example.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_AML_MNIST_example.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (3B) 5 | ## University of Essex Summer School 2023 6 | ## 7 | ## Date: 2023-08-13 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ## 13 | ########################################################################## 14 | ## Introduction to tutorial: 15 | ## 16 | ## Fit and evaluate a neural network model to the MNIST handwritten digit dataset. 17 | ## 18 | ## https://keras.rstudio.com/index.html 19 | ## https://keras.rstudio.com/articles/faq.html 20 | ## https://keras.io/losses/ 21 | ## https://cran.r-project.org/web/packages/keras/keras.pdf 22 | ## https://cran.rstudio.com/web/packages/keras/vignettes/sequential_model.html 23 | ## 24 | ########################################################################## 25 | 26 | ## load package 27 | #install.packages("keras") ## uncomment and run once if the keras package is not yet installed 28 | library(keras) 29 | #mnist <- dataset_mnist() 30 | 31 | ## OR 32 | 33 | ## load MNIST data from dslabs package 34 | library(dslabs) 35 | mnist <- read_mnist() 36 | 37 | ## or just load it 38 | ##load("mnist.Rdata") 39 | 40 | ## describe the structure of the data 41 | names(mnist) 42 | names(mnist$train) 43 | names(mnist$test) 44 | 45 | length(mnist) 46 | length(mnist$train) 47 | length(mnist$test) 48 | 49 | is.list(mnist) 50 | is.list(mnist$train) 51 | is.list(mnist$test) 52 | 53 | dim(mnist$train$images) 54 | dim(mnist$test$images) 55 | 56 | head(mnist$test$labels) 57 | head(mnist$train$labels) 58 | 59 | ## transform the data 60 | x_train <- mnist$train$images/255 61 | x_test <- mnist$test$images/255 62 | y_test <- mnist$test$labels 63 | y_train <- mnist$train$labels 64 | 65 | matrix(x_train[1,], nrow=28, ncol=28) 66 | 67 | image(x=1:28, y=1:28, matrix(x_train[1,], nrow=28, ncol=28)[1:28,28:1]) 68 | 69 | par(mfrow=c(3,3), mar=c(.5,.5,.5,.5)) 70 | for(i in 1:9){ 71 | image(x=1:28, y=1:28, matrix(x_train[i,], nrow=28, ncol=28)[1:28,28:1], yaxt="n", xaxt="n", xlab="", ylab="", col=gray.colors(100, start = 0.99, end = 0.01)) 72 | grid(nx=28, ny=28, col="black") 73 | } 74 | 75 | par(mfrow=c(10,10), mar=c(.5,.5,.5,.5)) 76 | for(i in 1:100) image(x=1:28, y=1:28, matrix(x_train[i,], nrow=28, ncol=28)[1:28,28:1], yaxt="n", xaxt="n", xlab="", ylab="", col=gray.colors(100, start = 0.99, end = 0.01)) 77 | 78 | ## stretch out the matrix into one visual row 79 | par(mfrow=c(1,1), mar=c(.5,.5,.5,.5)) 80 | image(x=1:784, y=1, 
z=matrix(x_train[1,], nrow=784, ncol=1), yaxt="n", xaxt="n", xlab="", ylab="", col=gray.colors(100, start = 0.99, end = 0.01), ylim=c(-5,5)) 81 | 82 | ## stretch out the matrix into nine visual rows 83 | par(mfrow=c(1,1), mar=c(.5,.5,.5,.5)) 84 | image(x=1:784, y=1:9, z=t(x_train[9:1,1:784]), yaxt="n", xaxt="n", xlab="", ylab="", col=gray.colors(100, start = 0.99, end = 0.01)) 85 | 86 | ## stretch out the matrix into 100 visual rows 87 | par(mfrow=c(1,1), mar=c(.5,.5,.5,.5)) 88 | image(x=1:784, y=1:100, z=t(x_train[100:1,1:784]), yaxt="n", xaxt="n", xlab="", ylab="", col=gray.colors(100, start = 0.99, end = 0.01)) 89 | 90 | 91 | ## calculate the row sums (total ink per digit image) for a boxplot 92 | x_train_sum <- apply(x_train, 1, sum) 93 | 94 | #y_train_value <- apply(y_train, 1, which.max) - 1 95 | 96 | ## simple prediction: the total ink alone already separates some digits 97 | par(mar=c(5,4,1,1)) 98 | boxplot(x_train_sum ~ y_train) 99 | 100 |
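101 | ## The introduction promises a fitted neural network; what follows is only a minimal sketch of one way to do that, assuming the keras package with a working TensorFlow backend (see install_keras()) and the scaled x_train/y_train/x_test/y_test objects created above. 102 | model <- keras_model_sequential() 103 | model %>% layer_dense(units = 128, activation = "relu", input_shape = c(784)) %>% layer_dense(units = 10, activation = "softmax") 104 | 105 | ## the integer labels (0-9) pair with the sparse categorical cross-entropy loss 106 | model %>% compile(loss = "sparse_categorical_crossentropy", optimizer = "adam", metrics = "accuracy") 107 | 108 | ## fit on the training digits, then evaluate loss and accuracy on the held-out test digits 109 | history <- model %>% fit(x_train, y_train, epochs = 5, batch_size = 128, validation_split = 0.2) 110 | model %>% evaluate(x_test, y_test)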
-------------------------------------------------------------------------------- /R-Program-Lessons/Day07_Applied_Machine_Learning_AML_Intro/R_Demo_AML_neuralnet_gradiant_decent_glm.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_AML_neuralnet_gradiant_decent_glm.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (3B) 5 | ## University of Essex Summer School 2023 6 | ## 7 | ## Date: 2023-08-13 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | ## 15 | ## Overview of Gradient Descent for a generalized linear model example. 16 | ## 17 | ## Gradient descent is an iterative process used to find the best parameter value(s) in any model and especially a neural network. 18 | ## 19 | ########################################################################## 20 | 21 | ## load libraries 22 | library(MASS) 23 | library(boot) 24 | library(gtools) 25 | 26 | lr <- .5 ## set the learning rate, which varies on the unit interval (0 to 1] 27 | n <- 100 28 | x <- rnorm(n) 29 | 30 | alpha <- -1 31 | beta <- -2 32 | 33 | y <- rbinom(n, 1, prob=inv.logit(alpha + beta*x)) 34 | #y <- rbinom(n, 1, prob=inv.logit(alpha + beta*x + rnorm(n))) 35 | 36 | ## create the design matrix, then start with an arbitrary initial guess for the parameters 37 | X_mat <- cbind(1,x) 38 | 39 | alpha_hat <- 4 40 | beta_hat <- 4 41 | iterations <- 1000 42 | 43 | loss <- variance <- NA 44 | 45 | y_hat <- matrix(NA, nrow=n, ncol=iterations) 46 | y_error <- matrix(NA, nrow=n, ncol=iterations) 47 | delta <- matrix(NA, nrow=2, ncol=iterations) 48 | 49 | ## sequential iterations to evaluate the loss function 50 | for (j in 1:iterations){ 51 | 52 | ## calculate the predicted y_hat based on the observed x variable and the current best guess of alpha and beta 53 | y_hat[,j] <- inv.logit(alpha_hat[j] + beta_hat[j] * x) 54 | 55 | ## this works too but not as well as above 56 | #y_hat[,j] <- rbinom(n,1,prob=inv.logit(alpha_hat[j] + beta_hat[j] * x)) 57 | 58 | ## the difference between the predicted y (y_hat) and y is the error for y 59 | y_error[,j] <- y_hat[,j] - y 60 | 61 | ## the estimated error is used to calculate the unexplained variance between y and y_hat, which is the sum of squared errors 62 | loss[j] <- sum(y_error[,j]^2) 63 | 64 | ## calculate the gradient at this point (this works because the errors are independent of the column vectors in X) 65 | ##delta[1:2,j] <- (t(X_mat) %*% y_error[,j]) * (1/n) 66 | delta[1,j] <- sum(X_mat[,1] * y_error[,j])/n 67 | delta[2,j] <- sum(X_mat[,2] * y_error[,j])/n 68 | 69 | ## shift the estimates along the gradient (subtract when the error is y_hat - y; add when it is y - y_hat) 70 | alpha_hat[j+1] <- alpha_hat[j] - (lr*delta[1,j]) 71 | beta_hat[j+1] <- beta_hat[j] - (lr*delta[2,j]) 72 | } 73 | 74 | ## estimate the glm model for comparison 75 | fit <- glm(y~x, family=binomial("logit")) 76 | summary(fit) 77 | 78 | ## print the last value of the sequence of parameter estimates 79 | alpha_hat[length(alpha_hat)] 80 | beta_hat[length(beta_hat)] 81 | 82 | 83 | ## graph the values as a function of the loss statistic 84 | ALPHA_seq <- seq(from=-4,4,.05) 85 | BETA_seq <- seq(from=-4,4,.05) 86 | 87 | LOSS <- matrix(NA, nrow=length(ALPHA_seq), ncol=length(BETA_seq)) 88 | 89 | ## grid search for graphing 90 | for(i in 1:length(ALPHA_seq)){ 91 | for(j in 1:length(BETA_seq)){ 92 | Y_hat <- inv.logit(ALPHA_seq[i] + BETA_seq[j] * X_mat[,2]) 93 | LOSS[i,j] <- sum((y-Y_hat)^2) 94 | 95 | } 96 | } 97 | 98 | ## plot the loss surface 99 | contour(ALPHA_seq, BETA_seq, LOSS, xlab=expression(hat(alpha)), ylab=expression(hat(beta)), cex.lab=1.5) 100 | lines(alpha_hat, beta_hat, col=2, lwd=4) 101 | points(alpha_hat, beta_hat, col="navy", pch=.5) 102 |
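103 | ## A minimal follow-up sketch, assuming the x, y, X_mat, n, and inv.logit objects from above are still in memory: wrap the update loop in a function to compare how the learning rate changes the loss path. 104 | gd_logit <- function(lr, iterations=1000, a0=4, b0=4){ 105 | a <- a0 106 | b <- b0 107 | loss <- NA 108 | for(j in 1:iterations){ 109 | p <- inv.logit(a[j] + b[j]*x) ## predicted probabilities at the current guess 110 | err <- p - y ## prediction errors 111 | loss[j] <- sum(err^2) 112 | a[j+1] <- a[j] - lr*sum(X_mat[,1]*err)/n 113 | b[j+1] <- b[j] - lr*sum(X_mat[,2]*err)/n 114 | } 115 | loss 116 | } 117 | 118 | ## a larger learning rate drops faster at first; a smaller one descends more slowly but more smoothly 119 | plot(gd_logit(lr=.5), type="l", xlab="iteration", ylab="loss") 120 | lines(gd_logit(lr=.05), col=2)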
-------------------------------------------------------------------------------- /R-Program-Lessons/Day07_Applied_Machine_Learning_AML_Intro/R_Demo_AML_neuralnet_gradiant_decent_lm.R: -------------------------------------------------------------------------------- 1 | #### R_Demo_AML_neuralnet_gradiant_decent_lm.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (3B) 5 | ## University of Essex Summer School 2023 6 | ## 7 | ## Date: 2023-08-13 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | ## 15 | ## Overview of Gradient Descent for a linear model with two parameters (intercept and slope). 16 | ## 17 | ## Gradient descent is an iterative process used to find the best parameter value(s) in any model and especially a neural network. 18 | ## 19 | ########################################################################## 20 | 21 | 22 | ## load libraries 23 | library(MASS) 24 | 25 | ## set the learning rate, which varies on the unit interval (0 to 1] 26 | lr <- .2 27 | 28 | n <- 100 29 | x <- rnorm(n) 30 | 31 | alpha <- -1 32 | beta <- -2 33 | 34 | y <- alpha + beta*x + rnorm(n) 35 | plot(x=x, y=y) 36 | fit <- lm(y ~ x) 37 | summary(fit) 38 | abline(reg=fit, col=2) 39 | 40 | X_mat <- cbind(1,x) 41 | head(X_mat) 42 | 43 | ## start with an arbitrary initial guess 44 | alpha_hat <- 1 45 | beta_hat <- 4 46 | 47 | 48 | iterations <- 100 49 | 50 | loss <- NA 51 | variance <- NA 52 | 53 | y_hat <- matrix(NA, nrow=n, ncol=iterations) 54 | y_error <- matrix(NA, nrow=n, ncol=iterations) 55 | delta <- matrix(NA, nrow=2, ncol=iterations) 56 | 57 | ## sequential iterations to evaluate the loss function 58 | for (j in 1:iterations){ 59 | 60 | ## calculate the predicted y_hat based on the observed x variable and the current best guess of alpha and beta 61 | y_hat[,j] <- alpha_hat[j] + beta_hat[j] * x 62 | 63 | ## the difference between the predicted y (y_hat) and y is the error for y 64 | y_error[,j] <- y_hat[,j] - y 65 | 66 | ## the estimated error is used to calculate the unexplained variance between y and y_hat, which is the sum of squared errors 67 | loss[j] <- sum(y_error[,j]^2) 68 | 69 | ## calculate the gradient at this point (this works because the errors are independent of the column vectors in X) 70 | ##delta[1:2,j] <- (t(X_mat) %*% y_error[,j]) * (1/n) 71 | 72 | ## 73 | delta[1,j] <- sum(X_mat[,1] * y_error[,j])/n 74 | delta[2,j] <- sum(X_mat[,2] * y_error[,j])/n 75 | 76 | ## shift the estimates along the gradient (subtract when the error is y_hat - y; add when it is y - y_hat) 77 | alpha_hat[j+1] <- alpha_hat[j] - (lr*delta[1,j]) 78 | beta_hat[j+1] <- beta_hat[j] - (lr*delta[2,j]) 79 | } 80 | 81 | ## print the lm summary from above 82 | summary(fit) 83 | 84 | ## print the last value of the sequence of parameter estimates 85 | alpha_hat[length(alpha_hat)] 86 | beta_hat[length(beta_hat)] 87 | 88 | ## check the lm fit 89 | summary(lm(y~x)) 90 | 91 | ## the least squares solution 92 | solve(t(X_mat) %*% X_mat) %*% t(X_mat) %*% y 93 | 94 | ## note: each of these one-column solutions fits a distinct regression model because the covariance between alpha and beta is not accounted for (so the estimates differ from the joint solution above) 95 | solve(t(X_mat[,2]) %*% X_mat[,2]) %*% t(X_mat[,2]) %*% y 96 | solve(t(X_mat[,1]) %*% X_mat[,1]) %*% t(X_mat[,1]) %*% y 97 | 98 | 99 | ## graph the values as a function of the loss statistic 100 | ALPHA_seq <- seq(from=-4,4,.05) 101 | BETA_seq <- seq(from=-4,4,.05) 102 | 103 | LOSS <- matrix(NA, nrow=length(ALPHA_seq), ncol=length(BETA_seq)) 104 | 105 | ## grid search for graphing 106 | for(i in 1:length(ALPHA_seq)){ 107 | for(j in 1:length(BETA_seq)){ 108 | Y_hat <- ALPHA_seq[i] + BETA_seq[j] * X_mat[,2] 109 | LOSS[i,j] <- sum((y-Y_hat)^2) 110 | } 111 | } 112 | 113 | ## plot the log loss 114 | par(mar=c(5,5,1,1)) 115 | contour(ALPHA_seq, BETA_seq, log(LOSS), xlab=expression(hat(alpha)), ylab=expression(hat(beta)), cex.lab=1.5) 116 | lines(alpha_hat, beta_hat, col=2, lwd=4) 117 | points(alpha_hat, beta_hat, col="navy", pch=.5) 118 | 119 | 120 |
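121 | ## A short hedged sketch (assuming the objects above are still in memory): the gradient-descent estimates, the lm() coefficients, and the closed-form least squares solution should agree closely. 122 | round(cbind(gradient_descent = c(alpha_hat[length(alpha_hat)], beta_hat[length(beta_hat)]), 123 | lm_fit = coef(fit), 124 | least_squares = as.vector(solve(t(X_mat) %*% X_mat) %*% t(X_mat) %*% y)), digits=4)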
-------------------------------------------------------------------------------- /R-Program-Lessons/Day07_Applied_Machine_Learning_AML_Intro/R_Demo_AML_neuralnet_gradiant_decent_mu.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_AML_neuralnet_gradiant_decent_mu.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (3B) 5 | ## University of Essex Summer School 2023 6 | ## 7 | ## Date: 2023-08-13 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | ## 15 | ## Overview of Gradient Descent for a model with one parameter for a mean. 16 | ## 17 | ## Gradient descent is an iterative process used to find the best parameter value(s) in any model and especially a neural network. 18 | ## 19 | ########################################################################## 20 | 21 | 22 | 23 | 24 | 25 | ## load libraries 26 | library(MASS) 27 | 28 | ## set the learning rate, which varies on the unit interval (0 to 1]; three values to try (the last assignment is the one used) 29 | lr <- .2 30 | lr <- .05 31 | lr <- .9 32 | 33 | y <- c(1,2,3,4,5) 34 | ##y <- rnorm(1000, pi, 1) 35 | n <- length(y) 36 | 37 | y 38 | n 39 | 40 | 41 | iterations <- 100 42 | 43 | loss <- NA 44 | 45 | mu_hat <- -1.5 46 | 47 | y_hat <- matrix(NA, nrow=n, ncol=iterations) 48 | y_error <- matrix(NA, nrow=n, ncol=iterations) 49 | 50 | delta_mu_hat <- NA 51 | 52 | ## sequential iterations to evaluate the loss function 53 | for (j in 1:iterations){ 54 | 55 | ## y_hat: calculate the predicted y_hat based on the current best guess of mu 56 | 57 | y_hat[,j] <- mu_hat[j] 58 | 59 | ## residual: the difference between the predicted y (y_hat) and y is the error for y 60 | y_error[,j] <- y_hat[,j] - y 61 | 62 | ## loss: the estimated error is used to calculate the unexplained variance between y and y_hat, which is the sum of squared errors 63 | loss[j] <- sum(y_error[,j]^2) 64 | 65 | ## the average of the unexplained differences, which is the gradient at the current estimate 66 | delta_mu_hat[j] <- sum(y_error[,j])/n 67 | 68 | ## shift the estimate along the gradient (subtract when the error is y_hat - y; add when it is y - y_hat) 69 | mu_hat[j+1] <- mu_hat[j] - (lr*delta_mu_hat[j]) 70 | } 71 | 72 | mu_hat[length(mu_hat)] 73 | 74 | plot(mu_hat) 75 | 76 | plot(mu_hat[-1], delta_mu_hat) 77 |
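78 | ## A final hedged check, assuming the y and mu_hat objects from above: minimizing the sum of squared errors with a single parameter has the sample mean as its closed-form solution, so gradient descent should land on mean(y). 79 | mean(y) 80 | mu_hat[length(mu_hat)] - mean(y) ## approximately 0 after enough iterations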
-------------------------------------------------------------------------------- /R-Program-Lessons/Day10_Additional_Programs/dynamic_IRT_practice_v2.R: -------------------------------------------------------------------------------- 1 | ## dynamic_IRT_practice_v2.R: simulate a latent random walk, generate four binary indicators from it, and recover the latent trait with a dynamic IRT model in rstan 2 | library(boot) 3 | library(rstan) 4 | 5 | time_index <- 1:30 6 | time_index 7 | 8 | # simulation for checking the distribution of correlations 9 | #value <- NA 10 | #for(j in 1:1000){ 11 | x <- rnorm(1, mean=0, sd=1) 12 | x 13 | 14 | for(i in 2:length(time_index)){ 15 | x[i] <- rnorm(1, mean=x[i-1], sd=1) 16 | } 17 | x 18 | 19 | #plot(x, type="l") 20 | #MASS::truehist(x) 21 | 22 | #cbind(x[2:length(time_index)], x[1:(length(time_index)-1)]) 23 | 24 | #value[j] <- cor(x[2:length(time_index)], x[1:(length(time_index)-1)]) 25 | #} 26 | 27 | #MASS::truehist(value) ## only meaningful after running the commented simulation loop above 28 | 29 | alpha1 <- -1 30 | beta1 <- 2 31 | prob_y1 <- inv.logit(alpha1 + beta1*x + rnorm(length(time_index))) 32 | y1 <- rbinom(length(time_index), 1, prob=prob_y1) 33 | #y1 <- alpha + beta*x + 34 | 35 | alpha2 <- 0 36 | beta2 <- 2 37 | prob_y2 <- inv.logit(alpha2 + beta2*x + rnorm(length(time_index))) 38 | y2 <- rbinom(length(time_index), 1, prob=prob_y2) 39 | 40 | alpha3 <- 1 41 | beta3 <- 2 42 | prob_y3 <- inv.logit(alpha3 + beta3*x + rnorm(length(time_index))) 43 | y3 <- rbinom(length(time_index), 1, prob=prob_y3) 44 | 45 | alpha4 <- 1 46 | beta4 <- 2 47 | prob_y4 <- inv.logit(alpha4 + beta4*x + rnorm(length(time_index))) 48 | y4 <- rbinom(length(time_index), 1, prob=prob_y4) 49 | 50 | additive_scale <- y1 + y2 + y3 + y4 51 | 52 | plot(additive_scale ~ x) 53 | 54 | par(mfrow=c(1,2), mar=c(4,4,1,1)) 55 | plot(additive_scale, type="l") 56 | plot(x, type="l") 57 | 58 | cor(additive_scale,x) 59 | 60 | table(additive_scale) 61 | 62 | model <- " 63 | 64 | data{ 65 | int n; 66 | int k; 67 | int y1[n]; 68 | int y2[n]; 69 | int y3[n]; 70 | int y4[n]; 71 | } 72 | 73 | parameters{ 74 | real theta[n]; 75 | real alpha[k]; 76 | real beta[k]; 77 | real sigma; // note: declared and sampled below but not used in the likelihood 78 | } 79 | 80 | model{ 81 | // priors: a random-walk prior links each theta to the previous period 82 | theta[1] ~ normal(0, 1); 83 | 84 | for(i in 2:n){ 85 | theta[i] ~ normal(theta[i-1], 1); 86 | } 87 | 88 | alpha ~ normal(0,1); 89 | beta ~ normal(0,1); 90 | sigma ~ normal(0,1); 91 | 92 | // likelihood 93 | for(i in 1:n){ 94 | y1[i] ~ bernoulli_logit(alpha[1] + beta[1] * theta[i]); 95 | y2[i] ~ bernoulli_logit(alpha[2] + beta[2] * theta[i]); 96 | y3[i] ~ bernoulli_logit(alpha[3] + beta[3] * theta[i]); 97 | y4[i] ~ bernoulli_logit(alpha[4] + beta[4] * theta[i]); 98 | } 99 | } 100 | 101 | " 102 | 103 | data_list <- list(n=length(time_index), k=4, y1=y1, y2=y2, y3=y3, y4=y4) 104 | data_list 105 | 106 | fit <- stan(model_code=model, data=data_list, iter=1000, chains=4) 107 | fit 108 | 109 | output <- extract(fit) 110 | length(output) 111 | names(output) 112 | 113 | dim(output$theta) 114 | 115 | 116 | theta_hat <- apply(output$theta, MARGIN=2, FUN=mean) 117 | theta_hat 118 | 119 | par(mfrow=c(1,1)) 120 | plot(x=x, y=theta_hat, xlab="true x", ylab="estimated theta of x") 121 | abline(reg=lm(theta_hat~x),col=2) 122 | cor(x, theta_hat) 123 | cor(additive_scale, theta_hat) 124 | 125 | apply(output$alpha, MARGIN=2, FUN=mean) 126 | c(alpha1, alpha2, alpha3, alpha4) 127 | 128 | 129 | apply(output$beta, MARGIN=2, FUN=mean) 130 | c(beta1, beta2, beta3, beta4) 131 | 132 | alpha_hat <- apply(output$alpha, MARGIN=2, FUN=mean) 133 | beta_hat <- apply(output$beta, MARGIN=2, FUN=mean) 134 | 135 | ## inflection points in the latent space 136 | inflection_points <- - alpha_hat / beta_hat 137 |
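138 | ## a brief hedged check using the estimates above: each item's fitted response curve crosses probability 0.5 exactly where alpha + beta * theta = 0, i.e., at theta = -alpha/beta, which is why -alpha_hat/beta_hat gives the inflection points 139 | inv.logit(alpha_hat + beta_hat * inflection_points) ## each entry should equal 0.5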
140 | x_seq <- seq(-4,4,.1) 141 | prob_y1_hat <- inv.logit(alpha_hat[1] + beta_hat[1] * x_seq) 142 | prob_y2_hat <- inv.logit(alpha_hat[2] + beta_hat[2] * x_seq) 143 | prob_y3_hat <- inv.logit(alpha_hat[3] + beta_hat[3] * x_seq) 144 | prob_y4_hat <- inv.logit(alpha_hat[4] + beta_hat[4] * x_seq) 145 | 146 | ## plot each item's estimated response curve with its inflection point 147 | par(mfrow=c(2,2)) 148 | plot(x=x_seq, y=prob_y1_hat, type="l") 149 | abline(v=inflection_points[1], col=2); abline(h=.5, lty=2) 150 | 151 | plot(x=x_seq, y=prob_y2_hat, type="l") 152 | abline(v=inflection_points[2], col=2); abline(h=.5, lty=2) 153 | 154 | plot(x=x_seq, y=prob_y3_hat, type="l") 155 | abline(v=inflection_points[3], col=2); abline(h=.5, lty=2) 156 | 157 | plot(x=x_seq, y=prob_y4_hat, type="l") 158 | abline(v=inflection_points[4], col=2); abline(h=.5, lty=2) 159 | 160 | 161 | #apply(output$sigma, MARGIN=2, FUN=mean) 162 | mean(output$sigma) 163 | 164 | -------------------------------------------------------------------------------- /Rplots/Google_search_term_pairs_longlist.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/Rplots/Google_search_term_pairs_longlist.pdf --------------------------------------------------------------------------------