├── .DS_Store
├── .Rhistory
├── .gitignore
├── Archive
│   ├── .DS_Store
│   ├── R-Programs-lessons-2021
│   │   ├── .DS_Store
│   │   ├── Day01_Intro_programs
│   │   │   ├── R_Demo_Intro_Program_Challenge_two_sum.R
│   │   │   └── R_Demo_Intro_start_here.R
│   │   ├── Day02_Data_Analysis_Managment
│   │   │   ├── R_Demo_Into_linear_model_predict.R
│   │   │   ├── R_Demo_Intro_data_visulization.R
│   │   │   ├── R_Demo_Intro_forloop_vs_lapply.R
│   │   │   ├── R_Demo_Intro_functions.R
│   │   │   ├── R_Demo_Intro_lists_dataframes_tables.R
│   │   │   ├── R_Demo_Intro_loops_logic_lists_dataframes_tables.R
│   │   │   └── R_Demo_Intro_vectors_matrices.R
│   │   ├── Day03_Simulation_Learning
│   │   │   ├── R_Demo_Intro_probability_review.R
│   │   │   ├── R_Demo_Simulation_Learning_Estimate_Mean.R
│   │   │   ├── R_Demo_Simulation_Learning_Estimate_linear_model.R
│   │   │   ├── R_Demo_Simulation_Learning_Potential_Outcomes_Framework.R
│   │   │   └── R_Demo_Simulation_Learning_Probability_Distributions.R
│   │   ├── Day04_Simulation_Inference
│   │   │   ├── R_Demo_Simulation_Inference_10_Fold_Cross_Validation.R
│   │   │   ├── R_Demo_Simulation_Inference_2_Sample_Cross_Validation.R
│   │   │   ├── R_Demo_Simulation_Inference_2_Sample_Hold_Out.R
│   │   │   ├── R_Demo_Simulation_Inference_Bootstrap.R
│   │   │   ├── R_Demo_Simulation_Inference_MCMC_Estimate_Mean.R
│   │   │   ├── R_Demo_Simulation_Inference_naive_Bayes.R
│   │   │   └── R_Demo_Simulation_Inference_naive_Bayes_10_Fold_Cross_Validation.R
│   │   ├── Day05_Measurement
│   │   │   ├── .Rhistory
│   │   │   ├── R_Demo_Measurement_SVD.R
│   │   │   ├── R_Demo_Measurement_factor_analysis.R
│   │   │   └── R_Demo_logit_transformation_latent_variable.R
│   │   ├── Day05_Measurement_RSTAN
│   │   │   ├── RSTAN_binary_IRT_NAs_simulation.R
│   │   │   ├── RSTAN_binary_IRT_simulation.R
│   │   │   ├── RSTAN_dyanamic_binary_IRT_NAs_simulation.R
│   │   │   ├── RSTAN_dynamic_binary_IRT_simulation.R
│   │   │   ├── RSTAN_linear_model_simulation.R
│   │   │   ├── RSTAN_logistic_regression_simulation.R
│   │   │   ├── RSTAN_mean_simulation.R
│   │   │   ├── RSTAN_normal_distribution_simulation.R
│   │   │   ├── RSTAN_ordered_IRT_simulation.R
│   │   │   └── StarTrek20210730.R
│   │   ├── Day06_Text_as_data_programs
│   │   │   ├── R_Demo_Ternary_Diriclet.R
│   │   │   ├── R_Demo_text_as_data_DTM.R
│   │   │   ├── R_Demo_text_as_data_DTM_stm_package.R
│   │   │   ├── R_Demo_text_as_data_DTM_tm_package.R
│   │   │   ├── R_Demo_text_as_data_NYT_Text_Process_v03.R
│   │   │   ├── R_Demo_text_as_data_package_description.R
│   │   │   ├── R_Demo_text_as_data_regular_expressions.R
│   │   │   ├── R_Demo_text_as_data_twitteR_get_twitter_users.R
│   │   │   ├── R_Demo_text_as_data_twitteR_setup.R
│   │   │   ├── R_Demo_text_as_data_word_probabilities_and_STM.R
│   │   │   └── R_Demo_text_as_wikip_stm.R
│   │   ├── Day07_Applied_Machine_Learning_AML
│   │   │   ├── R_Demo_AML_keras_example.R
│   │   │   ├── R_Demo_AML_neuralnet_gradiant_decent_glm.R
│   │   │   ├── R_Demo_AML_neuralnet_gradiant_decent_lm.R
│   │   │   ├── R_Demo_AML_neuralnet_gradiant_decent_mu.R
│   │   │   ├── R_Demo_AML_neuralnet_interaction_example.R
│   │   │   └── R_Demo_AML_neuralnet_squared_term_example.R
│   │   └── Day08_Model_Evaluation
│   │       ├── .DS_Store
│   │       ├── R_Demo_Model_Evaluation_false_discovery_rate.R
│   │       └── R_Demo_Model_Evaluation_precision_recall_accuracy.R
│   └── R-Programs-lessons-2022
│       ├── .DS_Store
│       ├── Day01_Intro_programs
│       │   ├── .Rhistory
│       │   ├── R_Demo_Intro_Program_Challenge_two_sum.R
│       │   └── R_Demo_Intro_start_here.R
│       ├── Day02_Data_Analysis_Managment
│       │   ├── .DS_Store
│       │   ├── .Rhistory
│       │   ├── R_Demo_Into_linear_model_predict.R
│       │   ├── R_Demo_Intro_data_visulization.R
│       │   ├── R_Demo_Intro_forloop_vs_lapply.R
│       │   ├── R_Demo_Intro_functions.R
│       │   ├── R_Demo_Intro_lists_dataframes_tables.R
│       │   ├── R_Demo_Intro_loops_logic_lists_dataframes_tables.R
│       │   ├── R_Demo_Intro_mean.R
│       │   ├── R_Demo_Intro_variance.R
│       │   └── R_Demo_Intro_vectors_matrices.R
│       ├── Day03_Simulation_Learning
│       │   ├── .DS_Store
│       │   ├── R_Demo_Intro_probability_review.R
│       │   ├── R_Demo_Simulation_Learning_Dice_Rolls.R
│       │   ├── R_Demo_Simulation_Learning_Estimate_Mean.R
│       │   ├── R_Demo_Simulation_Learning_Estimate_linear_model.R
│       │   ├── R_Demo_Simulation_Learning_Potential_Outcomes_Framework.R
│       │   ├── R_Demo_Simulation_Learning_ProbCalc_BayesRule.R
│       │   ├── R_Demo_Simulation_Learning_Probability_Continuous_Distributions.R
│       │   └── R_Demo_Simulation_Learning_Probability_Discrete_Distributions.R
│       ├── Day04_Simulation_Inference
│       │   ├── .DS_Store
│       │   ├── R_Demo_Simulation_Inference_10_Fold_Cross_Validation.R
│       │   ├── R_Demo_Simulation_Inference_2_Sample_Cross_Validation.R
│       │   ├── R_Demo_Simulation_Inference_2_Sample_Hold_Out.R
│       │   ├── R_Demo_Simulation_Inference_Bootstrap.R
│       │   ├── R_Demo_Simulation_Inference_MCMC_Estimate_Mean.R
│       │   ├── R_Demo_Simulation_Inference_naive_Bayes.R
│       │   └── R_Demo_Simulation_Inference_naive_Bayes_10_Fold_Cross_Validation.R
│       ├── Day05_Measurement_Intro
│       │   ├── .Rhistory
│       │   ├── R_Demo_Measurement_SVD.R
│       │   ├── R_Demo_Measurement_factor_analysis.R
│       │   └── R_Demo_logit_transformation_latent_variable.R
│       ├── Day05_Measurement_RSTAN
│       │   ├── RSTAN_binary_IRT_NAs_simulation.R
│       │   ├── RSTAN_binary_IRT_simulation.R
│       │   ├── RSTAN_dyanamic_binary_IRT_NAs_simulation.R
│       │   ├── RSTAN_dynamic_binary_IRT_simulation.R
│       │   ├── RSTAN_linear_model_simulation.R
│       │   ├── RSTAN_logistic_regression_simulation.R
│       │   ├── RSTAN_mean_simulation.R
│       │   ├── RSTAN_normal_distribution_simulation.R
│       │   ├── RSTAN_ordered_IRT_simulation.R
│       │   ├── StarTrek20210730.R
│       │   └── StarTrekQuiz20220829.csv
│       ├── Day06_Text_as_data_programs
│       │   ├── .DS_Store
│       │   ├── R_Demo_Ternary_Diriclet.R
│       │   ├── R_Demo_google_trends_Human_Rights.R
│       │   ├── R_Demo_text_as_data_DTM.R
│       │   ├── R_Demo_text_as_data_DTM_stm_package.R
│       │   ├── R_Demo_text_as_data_DTM_tm_package.R
│       │   ├── R_Demo_text_as_data_NYT_Text_Process_v03.R
│       │   ├── R_Demo_text_as_data_package_description.R
│       │   ├── R_Demo_text_as_data_regular_expressions.R
│       │   ├── R_Demo_text_as_data_twitteR_get_twitter_users.R
│       │   ├── R_Demo_text_as_data_twitteR_get_twitter_users_Oauth_v2.R
│       │   ├── R_Demo_text_as_data_word_probabilities_and_STM.R
│       │   ├── R_Demo_text_as_wikip_stm.R
│       │   └── google_trends_stuff
│       │       ├── R_Demo_google_trends_Texas.R
│       │       ├── R_Demo_google_trends_trump_brexit.R
│       │       ├── google_search_trends_paired_comparisons.R
│       │       └── groundhog_library_func.R
│       ├── Day07_Applied_Machine_Learning_AML_Intro
│       │   ├── R_Demo_AML_keras_example.R
│       │   ├── R_Demo_AML_neuralnet_gradiant_decent_glm.R
│       │   ├── R_Demo_AML_neuralnet_gradiant_decent_lm.R
│       │   ├── R_Demo_AML_neuralnet_gradiant_decent_mu.R
│       │   ├── R_Demo_AML_neuralnet_interaction_example.R
│       │   └── R_Demo_AML_neuralnet_squared_term_example.R
│       ├── Day08_Applied_Machine_Learning_AML_HuggingFace
│       │   └── R_Demo_huggingface_intro.R
│       └── Day09_Model_Evaluation_Review
│           ├── .DS_Store
│           ├── R_Demo_Model_Evaluation_false_discovery_rate.R
│           ├── R_Demo_Model_Evaluation_precision_recall_accuracy.R
│           └── R_Demo_Model_Evaluation_standard_errors_tstats_pvalues_CIs.R
├── Datasets
│   ├── .DS_Store
│   ├── Anscombes_quartet.csv
│   ├── Myobject.Rdata
│   ├── Myworkspace.Rdata
│   ├── NYT_Text_Articles
│   │   ├── .DS_Store
│   │   ├── metadata.csv
│   │   └── unProcessed_Files
│   │       ├── NYT_1981
│   │       ├── NYT_1982.txt
│   │       ├── NYT_1983
│   │       └── NYT_1984
│   ├── SIMpoliticalTweets.txt
│   ├── macro.csv
│   ├── ny_stop_frisk.csv
│   ├── ny_stop_frisk_black.csv
│   ├── stopwords_twitter.txt
│   ├── tweet_data.Rdata
│   ├── tweet_data.csv
│   └── users-by-social-media-platform.csv
├── R-Program-Lessons
│   ├── .DS_Store
│   ├── Day01_Intro_programs
│   │   ├── .DS_Store
│   │   ├── R_Demo_Intro_Program_Challenge_two_sum.R
│   │   └── R_Demo_Intro_start_here.R
│   ├── Day02_Data_Analysis_Managment
│   │   ├── .DS_Store
│   │   ├── R_Demo_Into_linear_model_predict.R
│   │   ├── R_Demo_Intro_data_visulization.R
│   │   ├── R_Demo_Intro_forloop_vs_lapply.R
│   │   ├── R_Demo_Intro_functions_part1.R
│   │   ├── R_Demo_Intro_functions_part2.R
│   │   ├── R_Demo_Intro_lists_dataframes_tables.R
│   │   ├── R_Demo_Intro_loops_logic_lists_dataframes_tables.R
│   │   ├── R_Demo_Intro_mean.R
│   │   ├── R_Demo_Intro_variance.R
│   │   └── R_Demo_Intro_vectors_matrices.R
│   ├── Day03_Simulation_Learning
│   │   ├── .DS_Store
│   │   ├── R_Demo_Intro_probability_review.R
│   │   ├── R_Demo_Simulation_Learning_Dice_Rolls.R
│   │   ├── R_Demo_Simulation_Learning_Estimate_Mean.R
│   │   ├── R_Demo_Simulation_Learning_Estimate_linear_model.R
│   │   ├── R_Demo_Simulation_Learning_Potential_Outcomes_Framework.R
│   │   ├── R_Demo_Simulation_Learning_ProbCalc_BayesRule.R
│   │   ├── R_Demo_Simulation_Learning_Probability_Continuous_Distributions.R
│   │   └── R_Demo_Simulation_Learning_Probability_Discrete_Distributions.R
│   ├── Day04_Simulation_Inference
│   │   ├── .DS_Store
│   │   ├── R_Demo_Simulation_Inference_10_Fold_Cross_Validation.R
│   │   ├── R_Demo_Simulation_Inference_2_Sample_Cross_Validation.R
│   │   ├── R_Demo_Simulation_Inference_2_Sample_Hold_Out.R
│   │   ├── R_Demo_Simulation_Inference_Bootstrap.R
│   │   ├── R_Demo_Simulation_Inference_MCMC_Estimate_Mean.R
│   │   ├── R_Demo_Simulation_Inference_naive_Bayes.R
│   │   └── R_Demo_Simulation_Inference_naive_Bayes_10_Fold_Cross_Validation.R
│   ├── Day05_Measurement_Intro
│   │   ├── .Rhistory
│   │   ├── R_Demo_Measurement_SVD.R
│   │   ├── R_Demo_Measurement_factor_analysis.R
│   │   └── R_Demo_logit_transformation_latent_variable.R
│   ├── Day05_Measurement_RSTAN
│   │   ├── .DS_Store
│   │   ├── RSTAN_binary_IRT_NAs_simulation.R
│   │   ├── RSTAN_binary_IRT_simulation.R
│   │   ├── RSTAN_dyanamic_binary_IRT_NAs_simulation.R
│   │   ├── RSTAN_dynamic_binary_IRT_simulation.R
│   │   ├── RSTAN_linear_model_simulation.R
│   │   ├── RSTAN_logistic_regression_simulation.R
│   │   ├── RSTAN_mean_simulation.R
│   │   ├── RSTAN_normal_distribution_simulation.R
│   │   ├── RSTAN_ordered_IRT_simulation.R
│   │   ├── StarTrek20210730.R
│   │   └── StarTrekQuiz20220829.csv
│   ├── Day06_Text_as_data_programs
│   │   ├── .DS_Store
│   │   ├── R_Demo_Ternary_Diriclet_visulaization.R
│   │   ├── R_Demo_google_search_trends_paired_comparisons.R
│   │   ├── R_Demo_google_trends_Human_Rights.R
│   │   ├── R_Demo_google_trends_Texas.R
│   │   ├── R_Demo_text_as_data_DTM.R
│   │   ├── R_Demo_text_as_data_DTM_stm_package.R
│   │   ├── R_Demo_text_as_data_DTM_tm_package.R
│   │   ├── R_Demo_text_as_data_NYT_Text_Process_v03.R
│   │   ├── R_Demo_text_as_data_package_description.R
│   │   ├── R_Demo_text_as_data_regular_expressions.R
│   │   ├── R_Demo_text_as_data_word_probabilities_and_STM.R
│   │   ├── R_Demo_text_as_wikip_stm.R
│   │   ├── R_twitter_examples
│   │   │   ├── R_Demo_text_as_data_twitteR_get_twitter_users.R
│   │   │   └── R_Demo_text_as_data_twitteR_get_twitter_users_Oauth_v2.R
│   │   └── groundhog_library_func.R
│   ├── Day07_Applied_Machine_Learning_AML_Intro
│   │   ├── R_Demo_AML_MNIST_example.R
│   │   ├── R_Demo_AML_keras_example.R
│   │   ├── R_Demo_AML_neuralnet_gradiant_decent_glm.R
│   │   ├── R_Demo_AML_neuralnet_gradiant_decent_lm.R
│   │   ├── R_Demo_AML_neuralnet_gradiant_decent_mu.R
│   │   ├── R_Demo_AML_neuralnet_interaction_example.R
│   │   └── R_Demo_AML_neuralnet_squared_term_example.R
│   ├── Day08_Applied_Machine_Learning_AML_HuggingFace
│   │   └── R_Demo_huggingface_intro.R
│   ├── Day09_Model_Evaluation_Review
│   │   ├── .DS_Store
│   │   ├── R_Demo_Model_Evaluation_false_discovery_rate.R
│   │   ├── R_Demo_Model_Evaluation_precision_recall_accuracy.R
│   │   └── R_Demo_Model_Evaluation_standard_errors_tstats_pvalues_CIs.R
│   └── Day10_Additional_Programs
│       ├── dynamic_IRT_practice_v1.R
│       ├── dynamic_IRT_practice_v2.R
│       ├── dynamic_IRT_practice_v3.R
│       ├── dynamic_IRT_practice_v4.R
│       └── dynamic_IRT_practice_v5.R
├── README.md
└── Rplots
    └── Google_search_term_pairs_longlist.pdf

/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/.DS_Store
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.Rproj
.Rproj.user/
--------------------------------------------------------------------------------
/Archive/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/Archive/.DS_Store
--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/Archive/R-Programs-lessons-2021/.DS_Store
--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day01_Intro_programs/R_Demo_Intro_Program_Challenge_two_sum.R:
--------------------------------------------------------------------------------
## R_Demo_Intro_Program_Challenge_two_sum.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2021
##
## Date: 2021-07-24
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
## Instructions:
##
## These challenges are meant to be just that, challenging.
## They should also be fun. I encourage you to think creatively and collaboratively.
## Getting stuck or not finishing all the steps is expected and encouraged.
## This is how learning works!
## Learn to program, program to learn.
##
## Always start with step (1) and then continue to each step as time permits.
## Don't worry about completing each step. Document your code for each step.
## You may wish to come back to some of the harder steps as you progress through the course.
## Note that some of the steps may ask you to use skills we have not yet covered in the course.
## Don't worry about these steps now but definitely think through the programming logic if you are stuck and make plans to come back to try them once you feel ready.
##
##########################################################################
##
## Steps for the Challenge
##
## (1) create a vector of integers and a scalar integer target
## (2) write a program in R that determines (returns) two numbers from the vector that add up to the target scalar
## (3) how many combinations of numbers in the vector of integers add up to the target scalar?
## (4) write a function to complete steps 1-3 (hint: wrap the program from (3) within a function)
## (5) write a simulation that explores the relationship between (a) the vector of integers and (b) the scalar integer target sum
## (6) re-write the program or function so that it takes fewer steps to calculate the number of numeric combinations that add up to the scalar target
##
##########################################################################

## step 1: create a vector of integers
int_numbers <- 1:10
int_numbers

## step 1: and a scalar integer target
target_number <- 6
target_number

## step 2: write a program in R that determines (returns) two numbers that add up to the target scalar
count <- 1
count

value <- c()
value

mat <- matrix(NA, nrow=length(int_numbers), ncol=length(int_numbers))
mat

for(i in 1:length(int_numbers)){
    for(j in 1:length(int_numbers)){
        mat[i,j] <- int_numbers[i] + int_numbers[j]
        value[count] <- ifelse(mat[i,j]==target_number, TRUE, FALSE)
        count <- count + 1
    }
}
count
mat
value

## step 3: how many combinations of numbers in the vector of integers add up to the target scalar?
sum(value)
sum(value==TRUE)


## step 4: write a function to complete steps 1-3

sum_func <- function(int_numbers=1:10, target_number=2){
    count <- 1
    value <- c()
    mat <- matrix(NA, nrow=length(int_numbers), ncol=length(int_numbers))
    for(i in 1:length(int_numbers)){
        for(j in 1:length(int_numbers)){
            mat[i,j] <- int_numbers[i] + int_numbers[j]
            value[count] <- ifelse(mat[i,j]==target_number, TRUE, FALSE)
            count <- count + 1
        }
    }
    mat
    value
    return(sum(value==TRUE))
}

## print function definition to screen
sum_func

## use the function with its default arguments
sum_func()

## use the function with other arguments
sum_func(int_numbers=1:10, target_number=6)

sum_func(int_numbers=1:100, target_number=6)

sum_func(int_numbers=1:10, target_number=60)

sum_func(int_numbers=1:100, target_number=60)


## try to figure out step 5 and step 6 here
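## one possible sketch for step 6 (there are several ways to do this):
## replace the double for-loop with a single call to outer(), which computes
## all pairwise sums at once; like the loop above, it counts ordered pairs,
## including pairs where i equals j
sum(outer(int_numbers, int_numbers, FUN="+") == target_number)

## the same idea wrapped as a function for comparison with sum_func()
sum_func2 <- function(int_numbers=1:10, target_number=2){
    sum(outer(int_numbers, int_numbers, FUN="+") == target_number)
}
sum_func2(int_numbers=1:10, target_number=6)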
--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day02_Data_Analysis_Managment/R_Demo_Intro_forloop_vs_lapply.R:
--------------------------------------------------------------------------------
## R_Demo_Intro_forloop_vs_lapply.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2021
##
## Date: 2021-07-24
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
## Introduction to tutorial:
##
## in-class coding of loops and lapply

1:10

length(1:10)


## for loop version
output <- c()
for(i in 1:10){

    local_var <- i*i ## the simple but core calculation of the iterative process
    output[i] <- local_var

}
output


## lapply version
output <- lapply(1:10, function(i){

    local_var <- i*i ## the simple but core calculation of the iterative process
    return(local_var)
})
unlist(output)




mat <- matrix(NA, nrow=10, ncol=5)
mat

for(i in 1:10){
    for(j in 1:5){
        local_var <- i*j
        mat[i,j] <- local_var
    }
}
mat

mat_list <- lapply(1:10, function(i){

    local_vec <- c()
    for(j in 1:5){
        local_var <- i*j
        local_vec[j] <- local_var
    }

    return(local_vec)
})

unlist(mat_list)
mat <- matrix(unlist(mat_list), nrow=10, ncol=5, byrow=T)
mat


mat_list <- lapply(1:10, function(i){
    local_out <- lapply(1:5, function(j){
        temp <- i*j
        return(temp)
    })
    return(local_out)
})

mat_list
unlist(mat_list)
mat <- matrix(unlist(mat_list), nrow=10, ncol=5, byrow=T)
mat
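## a quick timing comparison (a sketch, not part of the lesson above): for a
## toy calculation like this the difference is negligible, but system.time()
## makes the for-loop vs lapply comparison concrete
n_reps <- 100000
system.time({
    output <- c()
    for(i in 1:n_reps) output[i] <- i*i
})
system.time({
    output <- unlist(lapply(1:n_reps, function(i) i*i))
})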
--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day04_Simulation_Inference/R_Demo_Simulation_Inference_10_Fold_Cross_Validation.R:
--------------------------------------------------------------------------------
#### R_Demo_Simulation_Inference_10_Fold_Cross_Validation.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2021
##
## Date: 2021-07-24
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
##
## Goal: Improve the predictive power or predictive validity of a model when applied to new observed values
##
##########################################################################
## Introduction to tutorial:
##
## For this R tutorial we will simulate a dataset and then randomly divide it into ten subsets.
##
## We will then fit a model using the observations from 9 of the 10 subsets of data (the training data) and use the model estimates to predict the value of the dependent variable for the remaining out-of-sample subset (the test data).
##
## After that step, we rotate: each subset takes one turn as the test data while the model is fit to the other nine subsets.
##
## In this way, every observation in the data is predicted using a model estimated with data not used in the fitting.
##
## This process is increasingly common and required in almost all Machine Learning and predictive tasks in data science and increasingly so in the social sciences.
##
##########################################################################


#rm(list = ls())

set.seed(940)

## set number of observations for simulation
n <- 100

## number of folds (randomly created sub samples of data)
k <- 10

## simulation of variables
x <- sample(4:14,n,replace=TRUE)
y <- -5.996 + 2.781*x -0.127*x^2 + rnorm(n,0,1)
#y <- -5.996 + 2.781*x -0.127*x^2 + rnorm(n,0,2)

plot(x,y)

## create a subject/unit ID variable with one value for each unit
## here the indicator takes on the k-fold values {1,2,...,10}
folds <- sample(rep(1:k, k), n, replace=FALSE)

table(folds)

## create a data frame with the dependent variable, independent variable, and randomly created ID
dat <- data.frame(y, x, folds)

## create vectors for storing predictions
dat$y.hat1 <- NA
dat$y.hat2 <- NA

#test <- matrix(NA, nrow=k, ncol=2)

## loop over each of the k folds, holding out fold i as the test set
for(i in 1:k){

    ## fit a linear model
    fit1 <- lm(y ~ x, data=subset(dat, folds!=i))
    pred1 <- predict(fit1, newdata=subset(dat, folds==i))
    y.hat1 <- as.numeric(pred1)

    dat$y.hat1[dat$folds==i] <- y.hat1


    ## fit a linear model with a squared term
    fit2 <- lm(y ~ x + I(x^2), data=subset(dat, folds!=i))
    pred2 <- predict(fit2, newdata=subset(dat, folds==i))
    y.hat2 <- as.numeric(pred2)

    dat$y.hat2[dat$folds==i] <- y.hat2

    print(summary(dat))
}

rmse.fit1 <- sqrt(mean((dat$y.hat1-dat$y)^2))
rmse.fit1

rmse.fit2 <- sqrt(mean((dat$y.hat2-dat$y)^2))
rmse.fit2


cor.fit1 <- cor(dat$y.hat1, dat$y, method="spearman")
cor.fit1

cor.fit2 <- cor(dat$y.hat2, dat$y, method="spearman")
cor.fit2

c(cor.fit1, cor.fit2)
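## a quick visual check (a sketch using the objects defined above): fit2
## includes the squared term that generated the data, so its out-of-fold
## predictions should track the observed y more closely than fit1's
plot(dat$y, dat$y.hat2, xlab="observed y", ylab="out-of-fold prediction")
abline(a=0, b=1, col=2)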
--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day04_Simulation_Inference/R_Demo_Simulation_Inference_2_Sample_Hold_Out.R:
--------------------------------------------------------------------------------
#### R_Demo_Simulation_Inference_2_Sample_Hold_Out.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2021
##
## Date: 2021-07-24
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
##
## Goal: Improve the predictive power or predictive validity of a model
##
##########################################################################
## Introduction to tutorial:
##
## (1) Begin building towards and learning about cross-validation
## (NOTE: There is no "crossing" yet)
##
## For this R tutorial we will simulate a dataset and then randomly divide it into two subsets.
##
## We will fit a model using the observations from one of the subsets of data (training data).
##
## We will then use the model estimates to predict the value of the dependent variable for the remaining out-of-sample data subset (testing data).
##
##########################################################################


set.seed(940)


## set number of observations for simulation
n <- 100

## simulation of variables (this model is based on one of the datasets from Anscombe's quartet)
x <- sample(4:14,n,replace=TRUE)
y <- -5.996 + 2.781*x -0.127*x^2 + rnorm(n,0,1)
## note: the line below overwrites y using a larger error standard deviation
y <- -5.996 + 2.781*x -0.127*x^2 + rnorm(n,0,2)

## plot the simulated relationship
par(mfrow=c(1,1))
plot(x=x, y=y)

## create a subject/unit ID variable with one value for each unit
## here the indicator takes on the 2-fold values {1,2}
folds <- sample(rep(1:2, n/2), n, replace=FALSE)
folds

table(folds)

## create a data frame with the dependent variable, independent variable, and randomly created ID
dat <- data.frame(y, x, folds)

summary(dat)

head(dat)

## fit a linear model to the full dataset
model <- lm(y ~ x, data=dat)
summary(model)


## subset the full dataset into two subsets based on the ID variable
train <- subset(dat, folds==1)
test <- subset(dat, folds==2)

## the bracket-based subsets below are equivalent to the subset() calls above
train <- dat[dat$folds==1,]
test <- dat[dat$folds==2,]

nrow(train)
nrow(test)


## Model 0: fit an intercept-only linear model
fit <- lm(y ~ 1, data=train)
pred <- predict(fit, newdata=test)
rmse <- sqrt(mean((as.numeric(pred)-test$y)^2))
rmse

## Model 1: fit a linear model
fit <- lm(y ~ x, data=train)
pred <- predict(fit, newdata=test)
rmse <- sqrt(mean((as.numeric(pred)-test$y)^2))
rmse

## Model 2: fit a linear model with a squared term
fit <- lm(y ~ x + I(x^2), data=train)
pred <- predict(fit, newdata=test)
rmse <- sqrt(mean((as.numeric(pred)-test$y)^2))
rmse


##################################################
## Model 2 again: the same model evaluated in-sample on the training data (for comparison)
fit <- lm(y ~ x + I(x^2), data=train)
pred <- predict(fit, newdata=train)
rmse <- sqrt(mean((as.numeric(pred)-train$y)^2))
rmse
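## a hedged extension (sketch): a single random split can be noisy, so
## repeat the 2-sample hold-out many times and average the test RMSE of the
## squared-term model for a more stable estimate
rmse_reps <- replicate(100, {
    new_folds <- sample(rep(1:2, n/2), n, replace=FALSE)
    fit <- lm(y ~ x + I(x^2), data=dat[new_folds==1,])
    pred <- predict(fit, newdata=dat[new_folds==2,])
    sqrt(mean((as.numeric(pred) - dat$y[new_folds==2])^2))
})
mean(rmse_reps)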
--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day04_Simulation_Inference/R_Demo_Simulation_Inference_naive_Bayes.R:
--------------------------------------------------------------------------------
#### R_Demo_Simulation_Inference_naive_Bayes.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2021
##
## Date: 2021-07-24
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
## Introduction to tutorial:
##
## Overview of Bayes rule and the calculation of conditional probability. Introduces the naive Bayes classifier function from the e1071 library.
##
## P(outcome | evidence) = P(outcome) * P(evidence | outcome) / P(evidence)
##
## Below is Bayes' theorem:
## P(A | B) = P(A) * P(B | A) / P(B)
##
## Which can be derived from the general multiplication formula for AND events:
## P(A and B) = P(A) * P(B | A)
## P(B | A) = P(A and B) / P(A)
## P(B | A) = P(B) * P(A | B) / P(A)
## P(y|x) = P(x|y) * P(y) / P(x)
## P(x|y) = P(x AND y) / P(x)
##
## Pr(A[1]) = Pr(y==0)
## Pr(A[2]) = Pr(y==1)
## Pr(B | A[1]) = Pr(Data | y==0)
## Pr(B | A[2]) = Pr(Data | y==1)
##
##########################################################################


## load libraries
library(e1071)
library(LaplacesDemon)


## example code from BayesTheorem() function
PrA <- c(0.75,0.25)
PrBA <- c(6/9, 5/7)
BayesTheorem(PrA, PrBA)


## create fake data
n <- 10
x <- c(rep(0,n/2), rep(1,n/2))
y <- c(0,0,0,1,1,0,0,1,1,1)

## inspect data
cbind(y,x)

## inspect tabulation of data
table(y,x)


## calculate the probability of the evidence/data
## (the evidence Pr(x==1) is the same for both elements of the vector)
PrX <- NA
PrX[1] <- sum(as.numeric(x==1)) / n
PrX[2] <- sum(as.numeric(x==1)) / n

## calculate the probability of the outcome
PrY <- NA
PrY[1] <- sum(as.numeric(y==0))/n
PrY[2] <- sum(as.numeric(y==1))/n
PrY

## calculate the probability of the data conditional on the value of y (the likelihood)
PrXY <- NA
PrXY[1] <- sum(x[y==0])/length(as.numeric(x[y==0]))
PrXY[2] <- sum(x[y==1])/length(as.numeric(x[y==1]))
PrXY

## apply Bayes rule
PrXY * PrY / PrX

## apply Bayes rule with the BayesTheorem() function
BayesTheorem(PrA=PrY, PrBA=PrXY)


## apply Bayes rule with the naiveBayes() function
fit <- naiveBayes(y~x, data=data.frame(y,x))
fit
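## one way to verify the fit (a sketch; df and fit2 are illustrative names):
## the posterior returned by predict.naiveBayes() with type="raw" for an
## observation with x==1 should match the hand-computed Bayes rule result above
df <- data.frame(y=as.factor(y), x=as.factor(x))
fit2 <- naiveBayes(y ~ x, data=df)
predict(fit2, newdata=data.frame(x=factor(1, levels=levels(df$x))), type="raw")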
--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day04_Simulation_Inference/R_Demo_Simulation_Inference_naive_Bayes_10_Fold_Cross_Validation.R:
--------------------------------------------------------------------------------
#### R_Demo_Simulation_Inference_naive_Bayes_10_Fold_Cross_Validation.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2021
##
## Date: 2021-07-24
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
## Introduction to tutorial:
##
## (1) Program simulates count data that is then used to predict a binary outcome variable.
##
## (2) Three models are evaluated using the count data to predict the outcome:
## (2a) linear model
## (2b) generalized linear model with a logit link function
## (2c) naive Bayes classifier.
##
##########################################################################


## load library
library(e1071)


## set the number of observations for the simulation
n <- 100

## unobserved
x <- runif(n,0,1)

## observed counts
x1 <- rpois(n, lambda=x)
x2 <- rpois(n, lambda=x)
x3 <- rpois(n, lambda=2*x)
x4 <- rpois(n, lambda=2*x)
x5 <- rpois(n, lambda=4*x)

## systematic component of the model based on observed counts
xb <- -2 + x1 + x2 + x3 + x4 + x5

## transform the linear term xb using
## the inverse logit function
## so that pi is bound from 0 to 1
pi <- 1 / (1 + exp(-xb))

## generate the dependent variable y with probability pi and measurement error from a Bernoulli trial
y <- rbinom(n, size=1, prob=pi)


## make data frame
dat <- data.frame(y, x1, x2, x3, x4, x5)


## summarize fit using linear model
summary(lm(y ~ x1 + x2 + x3 + x4 + x5, data=dat))


## summarize fit using glm using the logit link function
summary(glm(y ~ x1 + x2 + x3 + x4 + x5, family=binomial(link="logit"), data=dat))


## summarize fit using naiveBayes model
naiveBayes(as.factor(y) ~ x1 + x2 + x3 + x4 + x5, data=dat)


## create vectors for storing predictions
dat$y.hat1 <- NA
dat$y.hat2 <- NA
dat$y.hat3 <- NA

## select number of folds
k <- 10

## create vector of folds for cross validation
dat$folds <- sample(rep(1:k, k), n, replace=FALSE)

## loop over each of the k folds, holding out fold i as the test set
for(i in 1:k){

    ## fit a linear model
    fit1 <- lm(y ~ x1 + x2 + x3 + x4 + x5, data=subset(dat, folds!=i))
    pred1 <- predict(fit1, newdata=subset(dat, folds==i))
    y.hat1 <- as.numeric(pred1)

    dat$y.hat1[dat$folds==i] <- y.hat1


    ## fit a glm model (type="response" returns predicted probabilities rather than values on the link scale)
    fit2 <- glm(y ~ x1 + x2 + x3 + x4 + x5, binomial(link="logit"), data=subset(dat, folds!=i))
    pred2 <- predict(fit2, newdata=subset(dat, folds==i), type="response")
    y.hat2 <- as.numeric(pred2)

    dat$y.hat2[dat$folds==i] <- y.hat2


    ## fit a naiveBayes classifier model
    fit3 <- naiveBayes(as.factor(y) ~ x1 + x2 + x3 + x4 + x5, data=subset(dat, folds!=i))
    pred3 <- predict(fit3, newdata=subset(dat, folds==i))
    ## convert the factor prediction back to its underlying 0/1 values
    y.hat3 <- as.numeric(as.character(pred3))

    dat$y.hat3[dat$folds==i] <- y.hat3

    #print(summary(dat))
}

rmse.fit1 <- sqrt(mean((dat$y.hat1-dat$y)^2))
rmse.fit1

cor.fit1 <- cor(dat$y.hat1, dat$y, method="spearman")

rmse.fit2 <- sqrt(mean((dat$y.hat2-dat$y)^2))
rmse.fit2

cor.fit2 <- cor(dat$y.hat2, dat$y, method="spearman")

rmse.fit3 <- sqrt(mean((dat$y.hat3-dat$y)^2))
rmse.fit3

cor.fit3 <- cor(dat$y.hat3, dat$y, method="spearman")

c(cor.fit1, cor.fit2, cor.fit3)
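## RMSE is an odd yardstick for a binary outcome, so as a hedged complement
## (sketch) compute simple classification accuracy at a 0.5 threshold
mean(as.numeric(dat$y.hat1 > 0.5) == dat$y) ## linear model
mean(as.numeric(dat$y.hat2 > 0.5) == dat$y) ## logistic regression
mean(dat$y.hat3 == dat$y) ## naive Bayes (predictions are already 0/1)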
--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day05_Measurement_RSTAN/RSTAN_linear_model_simulation.R:
--------------------------------------------------------------------------------
## RSTAN_linear_model_simulation.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: A Crash Course in Measurement and Latent Variable Modeling with RSTAN
##
## Please e-mail me if you find any errors or have any suggestions
## e-mail: cjf0006@gmail.com
##
##########################################################################
##
## Introduction to tutorial:
##
## For this R tutorial we will estimate a linear model with normally distributed errors using the STAN program. The slope, intercept, and error-scale parameters will be estimated based on the likelihood function that links them to the data contained in the y and x variables, which are simulated observed data. The model produces the slope and intercept from a standard linear model, which is also estimated using lm() in R.
##
##########################################################################

## load library
library(rstan) # load rstan library
library(MASS) # load library with truehist function

## -------------------------------------------------- #
## define STAN model as a character
## -------------------------------------------------- #
model <- "
data {
    // declare the data in memory
    int n;
    vector[n] y;
    vector[n] x;
}
parameters {
    // declare the parameters in memory
    real alpha;
    real beta;
    real<lower=0> sigma; // scale parameters should be declared with a lower bound of 0
}
model {
    // priors (the second argument to normal() is a standard deviation, not a variance or a precision)
    alpha ~ normal(0,10);
    beta ~ normal(0,10);

    // likelihood (link data to some combination of parameters and more data)
    for(i in 1:n){
        y[i] ~ normal(alpha + beta * x[i], sigma);
    }
}
generated quantities {
    // posterior predictions
    vector[n] y_predict;

    // the loop is necessary within the generated quantities block
    for(i in 1:n){
        y_predict[i] = normal_rng(alpha + beta * x[i], sigma);
    }
}
"
## -------------------------------------------------- #


## set data for simulation
n <- 100
x <- rnorm(n,0,1)
alpha <- 1.25
beta <- 2.50

## simulate a dependent variable with normally distributed error using the data and parameter values defined above
error <- rnorm(n)
y <- alpha + beta * x + error

plot(x=x, y=y)

## fit linear model
summary(lm(y~x))

## create data list
data_list <- list(y = y, x=x, n=n)

## set time start variable
time1 <- Sys.time()

## fit stan model
fit <- stan(model_code = model, data = data_list, iter = 1000, chains = 4)

## calculate the duration of the program file up to this point
print(Sys.time() - time1)

## extract draws from stan model object (creates a list object)
output <- extract(fit, permuted = TRUE)

## print names of each element/slot in the list
names(output)

## print model fit object
fit

## there are a number of methods to subset and summarize parameters
## keep in mind that the output object is a list that contains vectors or matrices of posterior estimates for each of the named parameters defined in the model statement above
## lapply (list-apply) a function to all of the objects in the list
lapply(output, mean)[1:3]
lapply(output, sd)[1:3]

## create a matrix using some of the named slots in the list
model_parameters <- as.matrix(fit, pars = c("alpha", "beta", "sigma"))
model_predictions <- as.matrix(fit, pars = "y_predict")

## check the dimensions (they should be the same)
dim(model_predictions)
dim(output$y_predict)

## plot the simulated y variable and the estimated posterior means
plot(y, apply(model_predictions,2,mean))
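## a hedged follow-up using the objects defined above (sketch): summarize the
## posterior predictive draws with 95% intervals around each observation's
## posterior mean prediction
y_hat_mean <- apply(model_predictions, 2, mean)
y_hat_lo <- apply(model_predictions, 2, quantile, probs=0.025)
y_hat_hi <- apply(model_predictions, 2, quantile, probs=0.975)
plot(y, y_hat_mean, ylim=range(c(y_hat_lo, y_hat_hi)))
segments(x0=y, y0=y_hat_lo, y1=y_hat_hi, col="grey")
abline(a=0, b=1, col=2)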
--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day05_Measurement_RSTAN/RSTAN_logistic_regression_simulation.R:
--------------------------------------------------------------------------------
## RSTAN_logistic_regression_simulation.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: A Crash Course in Measurement and Latent Variable Modeling with RSTAN
##
## Please e-mail me if you find any errors or have any suggestions
## e-mail: cjf0006@gmail.com
##
##########################################################################
##
## Introduction to tutorial:
##
## For this R tutorial we will simulate a binary dependent variable and then estimate the parameters that generate the variable. These parameters will be estimated based on the likelihood function that links them to the data contained in the y and x variables, which are simulated observed data. The model produces the slope and intercept from a standard logistic regression model, which is also estimated using glm() in R.
##
##########################################################################

## load library
library(rstan) # load rstan library
library(MASS) # load library with truehist function

## -------------------------------------------------- ##
## define STAN model
## -------------------------------------------------- ##
model <- "
data {
    // declare the data in memory
    int n;
    int y[n];
    vector[n] x;
}
// declare the parameters in memory
parameters {
    real alpha;
    real beta;
}
model {
    // priors (the second argument to normal() is a standard deviation, not a variance or a precision)
    alpha ~ normal(0,10);
    beta ~ normal(0,10);

    // likelihood (link data to some combination of parameters and more data)
    y ~ bernoulli_logit(alpha + beta * x);
}
generated quantities {
    // posterior predictions
    vector[n] y_predict;

    // the loop is necessary within the generated quantities block
    for(i in 1:n){
        y_predict[i] = bernoulli_logit_rng(alpha + beta * x[i]);
    }
}
"
## -------------------------------------------------- ##


## simulate x1 and set the "true" population values alpha and beta
n <- 100
x <- rnorm(n,0,1)
alpha <- 1.25
beta <- 2.50

## systematic component of the model
xb <- alpha + beta * x

## transform the linear term xb using
## the inverse logit function
## so that eta is bound from 0 to 1
eta <- 1 / (1 + exp(-xb))

## generate the dependent variable y with probability eta and measurement error from a Bernoulli trial
y <- rbinom(n, size=1, prob=eta)

table(y)

## create data list
data_list <- list(y = y, x=x, n=n)

## fit logistic regression model with glm
summary(glm(y~x, family=binomial(link="logit")))

## fit stan model
fit <- stan(model_code = model, data = data_list, iter = 1000, chains = 4)

## extract draws from stan model object
output <- extract(fit, permuted = TRUE)

## print names
names(output)

## there are a number of methods to subset and summarize parameters
## keep in mind that the output object is a list that contains vectors or matrices of posterior estimates for each of the named parameters defined in the model statement above
## lapply (list-apply) a function to all of the objects in the list
lapply(output, mean)
lapply(output, sd)

## tabulate the simulated binary dependent variable; it should be very close to the mean value of the predicted y
table(y)

## create a matrix using some of the named slots in the list
model_parameters <- as.matrix(fit, pars = c("alpha", "beta"))
model_predictions <- as.matrix(fit, pars = "y_predict")

## check the dimensions (they should be the same)
dim(model_predictions)
dim(output$y_predict)

## plot the simulated y variable and the estimated posterior means
plot(apply(model_predictions,2,mean), y)
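## a hedged classification check (sketch): convert each observation's
## posterior predictive mean into a 0/1 prediction and cross-tabulate it
## with the simulated y
y_hat_prob <- apply(model_predictions, 2, mean)
table(observed=y, predicted=as.numeric(y_hat_prob > 0.5))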
--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day05_Measurement_RSTAN/RSTAN_mean_simulation.R:
--------------------------------------------------------------------------------
## RSTAN_mean_simulation.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: A Crash Course in Measurement and Latent Variable Modeling with RSTAN
##
## Please e-mail me if you find any errors or have any suggestions
## e-mail: cjf0006@gmail.com
##
##########################################################################
##
## Introduction to tutorial:
##
## For this R tutorial we will use the normal distribution in STAN to estimate the mean value for an observed variable y. The parameter for the mean will be selected based on the likelihood function that links it to the data contained in the y variable.
##
##########################################################################

## load library
library(rstan) # load rstan library
library(MASS) # load library with truehist function

## -------------------------------------------------- #
## define STAN model as a character
## -------------------------------------------------- #
model <- "
data {
    // declare the data in memory
    int n;
    vector[n] y;
}
parameters {
    // declare the parameters in memory
    real mu;
    real<lower=0> sigma;
}
model {
    // mu is given a standard normal prior below;
    // sigma has no prior statement, so by default its prior is flat unless we provide more information (see the other examples)
    // likelihood (link data to some combination of parameters and more data)

    mu ~ normal(0,1);

    for(i in 1:n){
        y[i] ~ normal(mu, sigma);
    }
}
generated quantities {
    // posterior predictions
    vector[n] y_predict;

    // the loop is necessary within the generated quantities block
    for(i in 1:n){
        y_predict[i] = normal_rng(mu, sigma);
    }
}
"
## -------------------------------------------------- #


## set data for simulation
y <- 1:5
y <- rep(1:5,200)

n <- length(y)
y
n

## create data list
data_list <- list(y = y, n=n)
data_list

## set time start variable
time1 <- Sys.time()

## fit stan model
fit <- stan(model_code = model, data = data_list, iter = 1000, chains = 4)

## calculate the duration of the program file up to this point
print(Sys.time() - time1)

## extract draws from stan model object (creates a list object)
output <- extract(fit, permuted = TRUE)

## print names of each element/slot in the list
names(output)

## print model fit object
fit

## lapply (list-apply) a function to all of the objects in the list
lapply(output, mean)
lapply(output, sd)

truehist(output$mu)
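## a quick sanity check (sketch): the posterior mean of mu should sit near
## the sample mean of y, with the normal(0,1) prior pulling it only slightly toward 0
abline(v=mean(y), col=2, lwd=2)
c(sample_mean=mean(y), posterior_mean=mean(output$mu))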
--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day05_Measurement_RSTAN/RSTAN_normal_distribution_simulation.R:
--------------------------------------------------------------------------------
## RSTAN_normal_distribution_simulation.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: A Crash Course in Measurement and Latent Variable Modeling with RSTAN
##
## Please e-mail me if you find any errors or have any suggestions
## e-mail: cjf0006@gmail.com
##
##########################################################################
##
## Introduction to tutorial:
##
## For this R tutorial we will draw random samples from the normal distribution using the STAN program.
## This program is equivalent to using the rnorm function in R.
##
##########################################################################


## load library
library(rstan) # load rstan library
library(MASS) # load library with truehist function


## -------------------------------------------------- #
## define STAN model
## -------------------------------------------------- #
model <- "

parameters {
    real mu;
}

model {
    mu ~ normal(0,1);
}
"
## -------------------------------------------------- #


## set time start variable
time1 <- Sys.time()

## fit stan model
fit <- stan(model_code = model, iter = 1000, chains = 4)

## calculate the duration of the program file up to this point
print(Sys.time() - time1)

## extract draws from stan model object (creates a list object)
output <- extract(fit, permuted = TRUE)

## print names of each element/slot in the list
names(output)

## print model fit object
fit

## there are a number of methods to subset and summarize parameters
## keep in mind that the output object is a list that contains vectors or matrices of posterior estimates for each of the named parameters defined in the model statement above
## lapply (list-apply) a function to all of the objects in the list
lapply(output, mean)
lapply(output, sd)

## create a matrix using some of the named slots in the list
model_parameters <- as.matrix(fit, pars = c("mu"))

## check the length of the extracted parameter vector
length(output$mu)

## make a nice plot
#par(mfrow=c(2,2))
truehist(output$mu)
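## since the STAN program above is equivalent to rnorm(), the draws should
## match the standard normal density; overlay it as a quick visual check (sketch)
curve(dnorm(x), from=-4, to=4, add=TRUE, col=2, lwd=2)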
--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day06_Text_as_data_programs/R_Demo_Ternary_Diriclet.R:
--------------------------------------------------------------------------------
## R_Demo_Ternary_Diriclet.R
library(Ternary)
library(MCMCpack)

value1 <- value2 <- value3 <- value4 <- value5 <- value6 <- matrix(NA, nrow=1000, ncol=3)

value1[,1:3] <- rdirichlet(1000, alpha=c(1,1,5))
value2[,1:3] <- rdirichlet(1000, alpha=c(1,5,1))
value3[,1:3] <- rdirichlet(1000, alpha=c(10,10,1))

value4[,1:3] <- rdirichlet(1000, alpha=c(1,1,1))
value5[,1:3] <- rdirichlet(1000, alpha=c(10,10,10))
value6[,1:3] <- rdirichlet(1000, alpha=c(100,100,100))

## coordinates within the simplex sum to 1
## (round first so that floating point error does not split the table into many cells)
table(round(apply(value4[,1:3],1,sum), 10))


## unicode for right arrow: "\u2192"
## unicode for left arrow: "\u2190"

TernaryPlot(alab="Redder \u2192", blab="\u2190 Greener", clab="Bluer\u2192",
    point='right', lab.cex=0.8, grid.minor.lines = 0,
    grid.lty='solid', col=rgb(0.9, 0.9, 0.9), grid.col='white',
    axis.col=rgb(0.6, 0.6, 0.6), ticks.col=rgb(0.6, 0.6, 0.6),
    padding=0.08, main="simplex")
TernaryPoints(value1, col=4)
TernaryPoints(value2, col=3)
TernaryPoints(value3, col=2)

TernaryPoints(value4, col=1)
TernaryPoints(value5, col=4)
TernaryPoints(value6, col=2)
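## a numerical companion to the plots (sketch): the expected value of a
## Dirichlet draw is alpha / sum(alpha), so the column means of many draws
## should be close to that ratio
colMeans(rdirichlet(10000, alpha=c(1,1,5)))
c(1,1,5) / sum(c(1,1,5))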
--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day06_Text_as_data_programs/R_Demo_text_as_data_DTM_stm_package.R:
--------------------------------------------------------------------------------
#### R_Demo_text_as_data_DTM_stm_package.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2021
##
## Date: 2021-07-24
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
##
## Introduction to tutorial:
##
## This tutorial replicates the processes demonstrated in the R_Demo_text_as_data_DTM.R file.
##
## This code uses functions available in the stm package, which build on functions from the tm package.
##
## The code is much, much faster at processing the text data, though some of the steps are difficult to learn from these functions alone.
##
##########################################################################


## load libraries
library(stm)
library(tm)
library(SnowballC)

## load data
data <- read.csv("SIMpoliticalTweets.txt", header=FALSE)
data
names(data) <- "text"
data

#trumptweets <- fromJSON("trump_json_files_20190707.txt")
#data <- trumptweets

##
#data <- data[1:1000,]


## preprocess the documents
## this function uses functions from the tm package (see the tm demo for more details)
## stem words and remove stop words
prep <- textProcessor(documents=data$text, meta=data)

## list attributes
attributes(prep)

## inspect
head(prep$documents)
head(prep$vocab)
head(prep$meta)
prep$docs.removed


## prepDocuments
## additional processing (removes some documents because of word frequencies greater than .99 or less than .01)
out <- prepDocuments(prep$documents, prep$vocab, prep$meta)
out

## inspect
head(out$documents)
head(out$vocab)
head(out$meta)

## fit a structural topic model
fit <- stm(documents=out$documents, vocab=out$vocab, data=out$meta, K=2)

## inspect attributes
attributes(fit)

dim(fit$theta)

## display topic probabilities
fit$theta

apply(fit$theta, 1, sum)

out$meta
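## a quick look inside the fitted topic model (sketch): labelTopics() prints
## the highest probability words associated with each of the K topics
labelTopics(fit)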
--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day06_Text_as_data_programs/R_Demo_text_as_data_DTM_tm_package.R:
--------------------------------------------------------------------------------
#### R_Demo_text_as_data_DTM_tm_package.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2021
##
## Date: 2021-07-24
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
##
## Introduction to tutorial:
##
## This tutorial replicates the processes demonstrated in the R_Demo_text_as_data_DTM.R file.
##
## This code uses functions available in the tm package.
##
## The code is much, much faster at processing the text data, though some of the steps are difficult to learn from these functions alone.
##
##########################################################################

## load libraries
library(stm)
library(tm)
library(SnowballC)

## load data
data <- read.csv("SIMpoliticalTweets.txt", header=FALSE)
data
names(data) <- "text"
data

#trumptweets <- fromJSON("trump_json_files_20190707.txt")
#data <- trumptweets

##
##data <- data[1:1000,]

## create character vector for processing
newtext <- as.character(data$text)
length(newtext)


## use gsub to remove special characters that usually cause errors if left for later
newtext <- gsub("[^0-9A-Za-z///' ]", "", newtext)
newtext <- gsub("[^[:alnum:]///' ]", "", newtext)
newtext <- gsub("[^\x20-\x7F\x0D\x0A]", "", newtext) # remove all non-ascii characters
newtext <- gsub("http.*", "", newtext) # replace all of the urls
newtext <- gsub("www.*", "", newtext) #

## data$newtext
data$newtext <- newtext

## convert to corpus object using additional functions from the tm package
## the tm_map function takes the corpus as its first argument and a transformation function as its second argument
corpus <- Corpus(VectorSource(newtext))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, content_transformer(tolower)) ## wrap base R functions in content_transformer() so the result stays a corpus object
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, stemDocument)

## print to screen
inspect(corpus[1:10])


## make document by term matrix
DTM <- DocumentTermMatrix(corpus)
DTM


## print DTM to screen
inspect(DTM)
inspect(DTM[1:10,1:25])
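## two small follow-ups with the tm API (sketch): list terms that appear at
## least twice across the documents, and peek at the first few terms
findFreqTerms(DTM, lowfreq=2)
head(Terms(DTM))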
--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day06_Text_as_data_programs/R_Demo_text_as_data_package_description.R:
--------------------------------------------------------------------------------
##
## text adapted from these references:
## https://hub.packtpub.com/9-useful-r-packages-for-nlp-text-mining/
## https://analyticsindiamag.com/top-10-r-packages-for-natural-language-processing-nlp/

## koRpus
## koRpus is an R package for analysing texts.
## It includes a diverse collection of functions for automatic language detection.
## It also includes indices of lexical diversity, such as type token ratio, MTLD, etc.
## koRpus also provides a plugin for R GUI as well as the IDE RKWard that assists in providing graphical dialogs for its basic features.

## lsa
## Latent Semantic Analysis or lsa is an R package that provides routines for performing a latent semantic analysis with R.
## The basic idea of this package is that text does have a higher-order or latent semantic structure which is obscured by word usage, e.g. through the use of synonyms or polysemy.

## OpenNLP
## OpenNLP provides an R interface to Apache OpenNLP, which is a collection of natural language processing tools written in Java.
## OpenNLP supports common natural language processing tasks such as tokenisation, sentence segmentation, part-of-speech tagging, named entity extraction, chunking, parsing and coreference resolution.

## Quanteda
## Quanteda is an R package for managing and analysing text.
## It is a fast, flexible, and comprehensive framework for quantitative text analysis in R.
## Quanteda provides functionality for corpus management, creating and manipulating tokens and ngrams, exploring keywords in context, forming and manipulating sparse matrices of documents by features and more.

## RWeka
## RWeka is an interface to Weka, which is a collection of machine learning algorithms for data mining tasks written in Java.
## It contains tools for data pre-processing, clustering, association rules, visualisation and more.
## This package contains the interface code; the Weka jar resides in a separate package called 'RWekajars'.

## Spacyr
## Spacyr is an R wrapper to the Python spaCy NLP library.
## The package is designed to provide easy access to the functionality of the spaCy library in a simple format.
## One of the easiest methods to install spaCy and spacyr is through the spacyr function spacy_install().

## Stringr
## Stringr is a consistent, simple and easy to use R package that provides consistent wrappers for the stringi package and therefore simplifies the manipulation of character strings in R.
## It includes a set of internally consistent tools for working with character strings, i.e. sequences of characters surrounded by quotation marks.

## Text2vec
## Text2vec is an R package which provides an efficient framework with a concise API for text analysis and natural language processing (NLP).
## Some of its important features include allowing users to easily solve complex tasks, maximise efficiency per single thread, transparently scale to multiple threads on multicore machines, use streams and iterators, among others.

## TM
## TM or Text Mining Package is a framework for text mining applications within R.
## The package provides a set of predefined sources, such as DirSource, DataframeSource, etc., which handle a directory, a vector interpreting each component as a document, or data-frame-like structures (such as CSV files), and more.

## Wordcloud
## Wordcloud is an R package that creates pretty word clouds, visualises differences and similarity between documents, and avoids overplotting in scatter plots with text.
## The word cloud is a commonly used plot to visualise a speech or set of documents in a clear way.

## STM
## STM is an R package that efficiently estimates the Latent Dirichlet Allocation model (LDA) and the related Structural Topic Model (STM).

## maxent
## maxent is an R package for low-memory multinomial logistic regression, also known as maximum entropy classification, which is often applied to sparse text term matrices.

## RKEA
## RKEA provides an R interface to KEA (Keyword Extraction Algorithm), a tool for extracting keywords and keyphrases from text.

## languageR
## languageR provides datasets and functions for the statistical analysis of linguistic data.

## RcmdrPlugin.temis
## RcmdrPlugin.temis is an R Commander plug-in that provides an integrated graphical solution for text mining tasks.

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day06_Text_as_data_programs/R_Demo_text_as_data_twitteR_get_twitter_users.R:
--------------------------------------------------------------------------------
## R_Demo_text_as_data_twitteR_get_twitter_users.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2021
##
## Date: 2021-07-24
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
## Introduction to tutorial:
##
## setup to access the twitter API using R
##
## Create a Twitter application at http://dev.twitter.com. Make sure to give the app read, write and direct message authority.
## Take note of the following values from the Twitter app page: "API key", "API secret", "Access token", and "Access token secret".

##########################################################################

#install.packages("twitteR")
library(twitteR)

## grab the most recent 100 tweets from Barack Obama
Obama_ut <- userTimeline('barackobama', n=100)

## grab the max number of tweets for the President of Senegal Macky Sall
Macky_Sall_ut <- userTimeline('Macky_Sall', n=3200)

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day06_Text_as_data_programs/R_Demo_text_as_data_twitteR_setup.R:
--------------------------------------------------------------------------------
## R_Demo_text_as_data_twitteR_setup.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2021
##
## Date: 2021-07-24
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
## Introduction to tutorial:
##
## setup to access the twitter API using R
##
##########################################################################

install.packages("twitteR")
library(twitteR)

## go to twitter's developer page: https://developer.twitter.com/
## click to create an App
## for the description, you can say something like this: "learn to use the twitteR API"
## once you have successfully set up your App, you will be able to get the four strings that you will fill in below

## set keys and tokens to access the twitter API
consumer_key <- "your_consumer_key"
consumer_secret <- "your_consumer_secret"
access_token <- "your_access_token"
access_secret <- "your_access_secret"

setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)

## the function can also be called with just the consumer key and secret
setup_twitter_oauth("API key", "API secret")
"API secret") 36 | -------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2021/Day06_Text_as_data_programs/R_Demo_text_as_wikip_stm.R: -------------------------------------------------------------------------------- 1 | ########################################################################## 2 | ## INSTRUCTOR: Christopher Fariss 3 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 4 | ## University of Essex Summer School 2021 5 | ## 6 | ## Date: 2021-07-24 7 | ## 8 | ## Please e-mail me if you find any errors or have and suggestions (either email is fine) 9 | ## e-mail: cjf0006@gmail.com 10 | ## e-mail: cjfariss@umich.edu 11 | ########################################################################## 12 | ## Introduction to tutorial: 13 | ## 14 | ## This tutorial demonstrates how the Structural Topic Model functions for a real set of documents. 15 | ## 16 | ## Each unit in the dataset representes a randomly selected sentence from wikipedia. 17 | ## 18 | ########################################################################## 19 | 20 | library(stm) 21 | 22 | wikip_word_dat <- readLines("one_meelyun_sentences.txt") 23 | head(wikip_word_dat) 24 | 25 | n <- length(wikip_word_dat) 26 | n 27 | 28 | wikip_word_dat <- data.frame(id=1:n, text=wikip_word_dat) 29 | head(wikip_word_dat) 30 | 31 | 32 | wikip_word_dat$human_rights_test <- as.numeric(grepl("human rights", wikip_word_dat$text)) 33 | table(wikip_word_dat$human_rights_test) 34 | 35 | wikip_word_dat$civil_rights_test <- as.numeric(grepl("civil rights", wikip_word_dat$text)) 36 | table(wikip_word_dat$civil_rights_test) 37 | 38 | ## preprocess the documents 39 | ## This function uses function from the tm package (see the tm Demo for more details) 40 | prep <- textProcessor(documents=wikip_word_dat$text, meta=wikip_word_dat) 41 | 42 | save(out, file="wikip_word_dat_stm_prep.Rdata") 43 | 44 | ## inspect 45 | head(prep$documents) 46 | head(prep$vocab) 47 | head(prep$meta) 48 | 49 | 50 | ## pre Documents 51 | ## stem words and remove stop words 52 | out <- prepDocuments(prep$documents, prep$vocab, prep$meta) 53 | 54 | save(out, file="wikip_word_dat_stm_out.Rdata") 55 | 56 | ## inspect 57 | head(out$documents) 58 | head(out$vocab) 59 | head(out$meta) 60 | 61 | ## fit a structural topic model 62 | #fit <- stm(documents=out$documents, vocab=out$vocab, data=out$meta, K=20) 63 | #fit <- stm(documents=out$documents, vocab=out$vocab, data=out$meta, K=40) 64 | fit <- stm(documents=out$documents, vocab=out$vocab, data=out$meta, K=100) 65 | 66 | #fit_2 <- fit 67 | #fit_3 <- fit 68 | #fit_10 <- fit 69 | #fit_20 <- fit 70 | fit_40 <- fit 71 | 72 | ## display topic probabilities 73 | head(fit$theta) 74 | 75 | dim(fit$theta) 76 | summary(head(fit$theta, 1000)) 77 | 78 | #save(fit, file="wikip_word_dat_stm20.Rdata") 79 | #save(fit, file="wikip_word_dat_stm40.Rdata") 80 | save(fit, file="wikip_word_dat_stm100.Rdata") 81 | 82 | 83 | -------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2021/Day07_Applied_Machine_Learning_AML/R_Demo_AML_neuralnet_gradiant_decent_glm.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_neuralnet_gradiant_decent_glm.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 5 | ## University of 
##
## Date: 2021-07-24
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
## Introduction to tutorial:
##
## Overview of gradient descent for a generalized linear model example.
##
## Gradient descent is an iterative process used to find the best parameter value(s) in any model and especially a neural network.
##
##########################################################################

## load libraries
library(MASS)
library(boot)
library(gtools)

## set the learning rate; this varies on the unit interval (0 to 1]
lr <- .2

n <- 100
x <- rnorm(n)

alpha <- -1
beta <- -2

y <- rbinom(n, 1, prob=inv.logit(alpha + beta*x))
#y <- rbinom(n, 1, prob=inv.logit(alpha + beta*x + rnorm(n)))

## Start with a random guess
X_mat <- cbind(1,x)

alpha_hat <- 4
beta_hat <- 4
iterations <- 1000

loss <- variance <- NA

y_hat <- matrix(NA, nrow=n, ncol=iterations)
y_error <- matrix(NA, nrow=n, ncol=iterations)
delta <- matrix(NA, nrow=2, ncol=iterations)

## sequential iterations to evaluate the loss function
for (j in 1:iterations){

    ## calculate the predicted y_hat based on the observed x variable and the best guess of alpha and beta
    y_hat[,j] <- inv.logit(alpha_hat[j] + beta_hat[j] * x)

    ## this works too but not as well as above
    #y_hat[,j] <- rbinom(n, 1, prob=inv.logit(alpha_hat[j] + beta_hat[j] * x))

    ## difference between the predicted y (y_hat) and y is the error for y
    y_error[,j] <- y_hat[,j] - y

    ## the estimated error is used to calculate the unexplained variance between y and y_hat, which is the sum of squared errors
    loss[j] <- sum(y_error[,j]^2)

    ## calculate the gradient at that point (this works because the errors are independent of the column vectors in X)
    ##delta[1:2,j] <- (t(X_mat) %*% y_error[,j]) * (1/n)
    delta[1,j] <- sum(X_mat[,1] * y_error[,j])/n
    delta[2,j] <- sum(X_mat[,2] * y_error[,j])/n

    ## shift estimates along the gradient (+ function if y-y_hat; - function if y_hat-y)
    alpha_hat[j+1] <- alpha_hat[j] - (lr*delta[1,j])
    beta_hat[j+1] <- beta_hat[j] - (lr*delta[2,j])
}

## estimate glm model
fit <- glm(y~x, family=binomial("logit"))
fit

## print the last value of the sequence of parameter estimates
alpha_hat[length(alpha_hat)]
beta_hat[length(beta_hat)]

## graph the values as a function of the loss statistic
ALPHA_seq <- seq(from=-4,4,.05)
BETA_seq <- seq(from=-4,4,.05)

LOSS <- matrix(NA, nrow=length(ALPHA_seq), ncol=length(BETA_seq))

## grid search for graphing
for(i in 1:length(ALPHA_seq)){
    for(j in 1:length(BETA_seq)){
        Y_hat <- inv.logit(ALPHA_seq[i] + BETA_seq[j] * X_mat[,2])
        LOSS[i,j] <- sum((y-Y_hat)^2)
    }
}

## plot the loss surface and the gradient-descent path
contour(ALPHA_seq, BETA_seq, LOSS, xlab=expression(hat(alpha)), ylab=expression(hat(beta)), cex.lab=1.5)
lines(alpha_hat, beta_hat, col=2, lwd=4)
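
## Added cross-check (not part of the original lesson): optim() can minimize the
## same sum-of-squared-errors loss directly, so its solution should land close to
## the final alpha_hat and beta_hat values from the gradient-descent loop above.
## This sketch assumes y, x, and inv.logit() (from the boot library) are still in memory.
sse_loss <- function(par, y, x){
    ## par[1] is the intercept (alpha), par[2] is the slope (beta)
    y_hat <- inv.logit(par[1] + par[2] * x)
    sum((y - y_hat)^2)
}

## start from the same initial guess used by the gradient-descent loop
optim_fit <- optim(par=c(4,4), fn=sse_loss, y=y, x=x)

## compare with alpha_hat and beta_hat from the final gradient-descent iteration above
optim_fit$par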

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day07_Applied_Machine_Learning_AML/R_Demo_AML_neuralnet_gradiant_decent_lm.R:
--------------------------------------------------------------------------------
#### R_Demo_AML_neuralnet_gradiant_decent_lm.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2021
##
## Date: 2021-07-24
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
## Introduction to tutorial:
##
## Overview of gradient descent for a linear model with two parameters (intercept and slope).
##
## Gradient descent is an iterative process used to find the best parameter value(s) in any model and especially a neural network.
##
##########################################################################

## load libraries
library(MASS)

## set the learning rate; this varies on the unit interval (0 to 1]
lr <- .2

n <- 100
x <- rnorm(n)

alpha <- -1
beta <- -2

y <- alpha + beta*x + rnorm(n)
plot(x=x, y=y)
fit <- lm(y ~ x)
summary(fit)

X_mat <- cbind(1,x)
head(X_mat)

## Start with a random guess
alpha_hat <- 1
beta_hat <- 4

iterations <- 100

loss <- NA
variance <- NA

y_hat <- matrix(NA, nrow=n, ncol=iterations)
y_error <- matrix(NA, nrow=n, ncol=iterations)
delta <- matrix(NA, nrow=2, ncol=iterations)

## sequential iterations to evaluate the loss function
for (j in 1:iterations){

    ## calculate the predicted y_hat based on the observed x variable and the best guess of alpha and beta
    y_hat[,j] <- alpha_hat[j] + beta_hat[j] * x

    ## difference between the predicted y (y_hat) and y is the error for y
    y_error[,j] <- y_hat[,j] - y

    ## the estimated error is used to calculate the unexplained variance between y and y_hat, which is the sum of squared errors
    loss[j] <- sum(y_error[,j]^2)

    ## calculate the gradient at that point (this works because the errors are independent of the column vectors in X)
    ##delta[1:2,j] <- (t(X_mat) %*% y_error[,j]) * (1/n)
    delta[1,j] <- sum(X_mat[,1] * y_error[,j])/n
    delta[2,j] <- sum(X_mat[,2] * y_error[,j])/n

    ## shift estimates along the gradient (+ function if y-y_hat; - function if y_hat-y)
    alpha_hat[j+1] <- alpha_hat[j] - (lr*delta[1,j])
    beta_hat[j+1] <- beta_hat[j] - (lr*delta[2,j])
}

## print lm summary from above
summary(fit)

## print the last value of the sequence of parameter estimates
alpha_hat[length(alpha_hat)]
beta_hat[length(beta_hat)]

## the least squares solution
solve(t(X_mat) %*% X_mat) %*% t(X_mat) %*% y

## note that these give distinct regression models because the covariance between alpha and beta is not included (so they are different from the one above)
solve(t(X_mat[,2]) %*% X_mat[,2]) %*% t(X_mat[,2]) %*% y
solve(t(X_mat[,1]) %*% X_mat[,1]) %*% t(X_mat[,1]) %*% y

## graph the values as a function of the loss statistic
ALPHA_seq <- seq(from=-4,4,.05)
BETA_seq <- seq(from=-4,4,.05)

LOSS <- matrix(NA, nrow=length(ALPHA_seq), ncol=length(BETA_seq))

## grid search for graphing
for(i in 1:length(ALPHA_seq)){
    for(j in 1:length(BETA_seq)){
        Y_hat <- ALPHA_seq[i] + BETA_seq[j] * X_mat[,2]
        LOSS[i,j] <- sum((y-Y_hat)^2)
    }
}

## plot the log of the loss surface and the gradient-descent path
par(mar=c(5,5,1,1))
contour(ALPHA_seq, BETA_seq, log(LOSS), xlab=expression(hat(alpha)), ylab=expression(hat(beta)), cex.lab=1.5)
lines(alpha_hat, beta_hat, col=2, lwd=4)

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day07_Applied_Machine_Learning_AML/R_Demo_AML_neuralnet_gradiant_decent_mu.R:
--------------------------------------------------------------------------------
## R_Demo_AML_neuralnet_gradiant_decent_mu.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2021
##
## Date: 2021-07-24
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
## Introduction to tutorial:
##
## Overview of gradient descent for a model with one parameter for a mean.
##
## Gradient descent is an iterative process used to find the best parameter value(s) in any model and especially a neural network.
##
##########################################################################

## load libraries
library(MASS)

## set the learning rate; this varies on the unit interval (0 to 1]
lr <- .2
#lr <- .01
#lr <- 1

y <- c(1,2,3,4,5)
##y <- rnorm(1000, pi, 1)
n <- length(y)

y
n

iterations <- 100

loss <- NA

mu_hat <- -1.5

y_hat <- matrix(NA, nrow=n, ncol=iterations)
y_error <- matrix(NA, nrow=n, ncol=iterations)

delta_mu_hat <- NA

## sequential iterations to evaluate the loss function
for (j in 1:iterations){

    ## y_hat: calculate the predicted y_hat based on the best guess of mu
    y_hat[,j] <- mu_hat[j]

    ## residual: the difference between the predicted y (y_hat) and y is the error for y
    y_error[,j] <- y_hat[,j] - y

    ## loss: the estimated error is used to calculate the unexplained variance between y and y_hat, which is the sum of squared errors
    loss[j] <- sum(y_error[,j]^2)

    ## the average of the unexplained differences gives the gradient at that point
    delta_mu_hat[j] <- sum(y_error[,j])/n

    ## shift estimates along the gradient (+ function if y-y_hat; - function if y_hat-y)
    mu_hat[j+1] <- mu_hat[j] - (lr*delta_mu_hat[j])
}

mu_hat[length(mu_hat)]

plot(mu_hat)

plot(mu_hat[-1], delta_mu_hat)

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2021/Day08_Model_Evaluation/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/Archive/R-Programs-lessons-2021/Day08_Model_Evaluation/.DS_Store

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2022/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/Archive/R-Programs-lessons-2022/.DS_Store

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2022/Day01_Intro_programs/.Rhistory:
--------------------------------------------------------------------------------
## what is the current working directory?
getwd()
## set the working directory to an object using the assignment operator <- or = (more on this later)
wd <- getwd()
## print to screen
wd
## if you remember the function name use a single question mark
?rm

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2022/Day01_Intro_programs/R_Demo_Intro_Program_Challenge_two_sum.R:
--------------------------------------------------------------------------------
## R_Demo_Intro_Program_Challenge_two_sum.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2022
##
## Date: 2022-08-08
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
## Instructions:
##
## These challenges are meant to be just that, challenging.
## They should also be fun. I encourage you to think creatively and collaboratively.
## Getting stuck or not finishing all the steps is expected and encouraged.
## This is how learning works!
## Learn to program, program to learn.
##
## Always start with step (1) and then continue to each step as time permits.
## Don't worry about completing each step. Document your code for each step.
## You may wish to come back to some of the harder steps as you progress through the course.
## Note that some of the steps may ask you to use skills we have not yet covered in the course.
## Don't worry about these steps now but definitely think through the programming logic if you are stuck and make plans to come back to try them once you feel ready.
##
##########################################################################
##
## Steps for the Challenge
##
## (1) create a vector of integer numbers and a scalar integer target
## (2) write a program in R that determines (returns) two numbers from the vector that add up to the target scalar
## (3) how many combinations of numbers in the vector of integers add up to the target scalar?
## (4) write a function to complete steps 1-3 (hint: wrap the program from (3) within a function)
## (5) write a simulation that explores the relationship between (a) the vector of integer numbers and (b) the scalar integer target sum
## (6) re-write the program or function so that it takes fewer steps to calculate the number of numeric combinations that add up to the scalar target
##
##########################################################################

## (1) create a vector of integer numbers and a scalar integer target
x <- c(1,2,3,4,5)
y <- 5

## (2) write a program in R that determines (returns) two numbers from the vector that add up to the target scalar

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2022/Day02_Data_Analysis_Managment/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/Archive/R-Programs-lessons-2022/Day02_Data_Analysis_Managment/.DS_Store

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2022/Day02_Data_Analysis_Managment/R_Demo_Intro_forloop_vs_lapply.R:
--------------------------------------------------------------------------------
## R_Demo_Intro_forloop_vs_lapply.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2022
##
## Date: 2022-08-08
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
## Introduction to tutorial:
##
## in-class coding of loops and lapply (see also the timing sketch at the end of this file)

1:10

length(1:10)

## for loop version
output <- c()
for(i in 1:10){
    local_var <- i*i ## the simple but core calculation of the iterative process
    output[i] <- local_var
}
output

## lapply version
output <- lapply(1:10, function(i){
    local_var <- i*i ## the simple but core calculation of the iterative process
    return(local_var)
})
unlist(output)

## nested for loop version
mat <- matrix(NA, nrow=10, ncol=5)
mat

for(i in 1:10){
    for(j in 1:5){
        local_var <- i*j
        mat[i,j] <- local_var
    }
}
mat

## lapply version with an inner for loop (replicates the nested for loop structure above)
mat_list <- lapply(1:10, function(i){
    local_vec <- c()
    for(j in 1:5){
        local_var <- i*j
        local_vec[j] <- local_var
    }
    return(local_vec)
})

unlist(mat_list)
mat <- matrix(unlist(mat_list), nrow=10, ncol=5, byrow=T)
mat

## nested lapply version (replicates both versions above)
mat_list <- lapply(1:10, function(i){
    local_out <- lapply(1:5, function(j){
        temp <- i*j
        return(temp)
    })
    return(local_out)
})

mat_list
unlist(mat_list)
mat <- matrix(unlist(mat_list), nrow=10, ncol=5, byrow=T)
mat
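
## Added timing sketch (not part of the original lesson): system.time() gives a
## rough sense of how the for loop and lapply() versions above compare on a larger
## input. Exact timings vary by machine and by R version.
n_big <- 100000

## for loop version (note: output grows inside the loop, which slows it down)
system.time({
    output <- c()
    for(i in 1:n_big){
        output[i] <- i*i
    }
})

## lapply version
system.time({
    output <- unlist(lapply(1:n_big, function(i) i*i))
})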

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2022/Day03_Simulation_Learning/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/Archive/R-Programs-lessons-2022/Day03_Simulation_Learning/.DS_Store

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2022/Day03_Simulation_Learning/R_Demo_Simulation_Learning_Dice_Rolls.R:
--------------------------------------------------------------------------------
## R_Demo_Simulation_Learning_Dice_Rolls.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2022
##
## Date: 2022-08-28
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##
##########################################################################
## For this R tutorial, we will learn how to:
##
## (1) Simulate the roll of a D6 die.
## (2) Repeatedly simulate the roll of a D6 die and see how close the average value is to the expected value.
## (3) Learn about the central limit theorem from the simulation.
##
## Notes: The Central Limit Theorem (CLT) establishes that when independently generated variables (iid: independent and identically distributed random variables) are added together, the sums or averages of these variables (when normalized) converge towards a normal distribution.
##
## This property emerges even if the original variables are not individually normally distributed, as with the roll of a die.
##
## The probability of any one value from a single roll of a die is equivalent to the probability of any other value for the same-sided die in the limit (when the number of rolls approaches infinity).
##
##########################################################################

library(MASS)

## simulate 20 randomly generated rolls from a D6 (6-sided die)
sample(1:6, size=20, replace=TRUE)

## true mean is 3.5
(1 + 2 + 3 + 4 + 5 + 6) / 6

## or
mean(1:6)

## true variance is approximately 2.916667 or exactly 70/24
(1 - 3.5)^2 * (1/6) + (2 - 3.5)^2 * (1/6) + (3 - 3.5)^2 * (1/6) + (4 - 3.5)^2 * (1/6) + (5 - 3.5)^2 * (1/6) + (6 - 3.5)^2 * (1/6)

## or
sum((1:6 - mean(1:6))^2 * (1/6))

## repeat the simulation n_sims times (2,000 here) and calculate the average
n_sims <- 2000

## number of samples to roll each iteration
n_samples <- 10

## create two objects to hold the calculated mean and variance from each simulated sample
sim_mean_values <- c()
sim_var_values <- c()

## iterate/repeat the simulation n_sims times
for(i in 1:n_sims){

    ## create output
    sample_output <- sample(1:6, size=n_samples, replace=TRUE)

    ## save the output in the i_th position of the objects
    sim_mean_values[i] <- mean(sample_output)
    sim_var_values[i] <- var(sample_output)

}

## calculate the mean and variance of the n_sims sample means
mean(sim_mean_values)
mean(sim_var_values)

## set graphical parameters
par(mfrow=c(1,2), mar=c(4,3,1,1))

## plot histograms
truehist(sim_mean_values, main="Mean Estimate")
truehist(sim_var_values, main="Variance Estimate")

## calculate and plot the converging average using increasing sample sizes starting at 1 and ending at all the samples
## set graphical parameters
par(mfrow=c(1,2))

## running mean of the sample means
plot(0,0, ylim=c(2.5,4), xlim=c(0,n_sims), type="n", main="Mean Estimate")
value <- c()
for(i in 1:n_sims){
    value[i] <- mean(sim_mean_values[1:i])
}
lines(value)
abline(h=3.5, col="orange", lwd=2, lty=2)

## running mean of the sample variances
plot(0,0, ylim=c(2.5,4), xlim=c(0,n_sims), type="n", main="Variance Estimate")
value <- c()
for(i in 1:n_sims){
    value[i] <- mean(sim_var_values[1:i])
}
lines(value)
abline(h=2.916667, col="orange", lwd=2, lty=2)

sqrt(2.916667)

## the values converge towards normality
summary((sim_mean_values - 3.5))
mean((sim_mean_values - 3.5))
var((sim_mean_values - 3.5))

var(sim_mean_values)
var(sim_var_values)

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2022/Day03_Simulation_Learning/R_Demo_Simulation_Learning_ProbCalc_BayesRule.R:
--------------------------------------------------------------------------------
## R_Demo_Simulation_Learning_ProbCalc_BayesRule.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2022
##
## Date: 2022-08-28
##
## Please e-mail me if you find any errors or have any suggestions
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
## Introduction to tutorial:
##
## Overview of Bayes rule and the calculation of conditional probability. Introduces the naive Bayes classifier function from the e1071 library.
##
## P(outcome | evidence) = P(outcome) * P(evidence | outcome) / P(evidence)
##
## Below is Bayes' Theorem:
## P(A | B) = P(A) * P(B | A) / P(B)
##
## Which can be derived from the general multiplication formula for AND events:
## P(A and B) = P(A) * P(B | A)
## P(B | A) = P(A and B) / P(A)
## P(B | A) = P(B) * P(A | B) / P(A)
## P(y|x) = P(x|y) * P(y) / P(x)
## P(x|y) = P(x AND y) / P(y)
##
## Pr(A[1]) = Pr(y==0)
## Pr(A[2]) = Pr(y==1)
## Pr(B | A[1]) = Pr(Data | y==0)
## Pr(B | A[2]) = Pr(Data | y==1)
##
##########################################################################

A <- c(0,1,0,1)
B <- c(0,0,0,1)

ProbCalc <- function(A,B){

    len <- length(A)
    data <- as.data.frame(cbind(A,B, A*B, A+B-A*B))
    names(data) <- c("A", "B", "A and B", "A or B")

    pr.A <- sum(A)/len
    pr.B <- sum(B)/len
    pr.NotA <- 1 - pr.A
    pr.NotB <- 1 - pr.B
    pr.A.B <- sum(A*B)/len
    pr.NotA.B <- sum(abs(A-1)*B)/len
    pr.A.NotB <- sum(A*abs(B-1))/len
    pr.NotA.NotB <- sum(abs(A-1)*abs(B-1))/len

    pr.A.or.B <- pr.A + pr.B - pr.A.B

    pr.A.condB <- pr.A.B / pr.B
    pr.B.condA <- pr.A.B / pr.A

    out <- list(data=data, pr.A=pr.A, pr.B=pr.B, pr.NotA=pr.NotA, pr.NotB=pr.NotB, pr.A.B=pr.A.B, pr.NotA.B=pr.NotA.B, pr.A.NotB=pr.A.NotB, pr.NotA.NotB=pr.NotA.NotB, pr.A.or.B= pr.A.or.B, pr.A.condB=pr.A.condB, pr.B.condA=pr.B.condA)
    return(out)

}

ProbCalc(A,B)

## note: indexing the table with the logical vectors A==1, B==1, etc. works in this specific example only because the length and ordering of A and B happen to line up with the cells of the 2x2 table
ProbTabCalc <- function(A,B){

    tab <- table(A,B)/length(A)

    pr.A <- sum(tab[A==1])
    pr.B <- sum(tab[B==1])
    pr.NotA <- sum(tab[A==0])
    pr.NotB <- sum(tab[B==0])
    pr.A.B <- sum(tab[A==1 & B==1])
    pr.NotA.B <- sum(tab[A==0 & B==1])
    pr.A.NotB <- sum(tab[A==1 & B==0])
    pr.NotA.NotB <- sum(tab[A==0 & B==0])

    pr.A.or.B <- sum(tab[A==1 | B==1])

    pr.A.condB <- pr.A.B / pr.B
    pr.B.condA <- pr.A.B / pr.A

    out <- list(table=tab, pr.A=pr.A, pr.B=pr.B, pr.NotA=pr.NotA, pr.NotB=pr.NotB, pr.A.B=pr.A.B, pr.NotA.B=pr.NotA.B, pr.A.NotB=pr.A.NotB, pr.NotA.NotB=pr.NotA.NotB, pr.A.or.B= pr.A.or.B, pr.A.condB=pr.A.condB, pr.B.condA=pr.B.condA)
    return(out)

}

ProbTabCalc(A,B)

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2022/Day04_Simulation_Inference/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/Archive/R-Programs-lessons-2022/Day04_Simulation_Inference/.DS_Store

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2022/Day04_Simulation_Inference/R_Demo_Simulation_Inference_10_Fold_Cross_Validation.R:
--------------------------------------------------------------------------------
#### R_Demo_Simulation_Inference_10_Fold_Cross_Validation.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2022
##
## Date: 2022-08-28
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
##
## Goal: Improve the predictive power or predictive validity of a model when applied to new observed values
##
##########################################################################
## Introduction to tutorial:
##
## For this R tutorial we will simulate a dataset and then randomly divide it into ten subsets.
##
## We will then fit a model using the observations from nine of the ten subsets of data (training data) and then use the model estimates to predict the value of the dependent variable for the remaining out-of-sample subset (test data).
##
## After that step, we will rotate which subset is held out, so that each of the ten subsets serves once as the test data.
##
## In this way, every observation in the data will be predicted using a model estimated from other data not used in the fitting.
##
## This process is increasingly common and required in almost all Machine Learning and predictive tasks in data science and increasingly so in the social sciences.
##
##########################################################################

#rm(list = ls())

set.seed(940)

## set number of observations for simulation
n <- 100

## number of folds (randomly created sub-samples of data)
k <- 10

## simulation of variables
x <- sample(4:14,n,replace=TRUE)
y <- -5.996 + 2.781*x -0.127*x^2 + rnorm(n,0,1)
#y <- -5.996 + 2.781*x -0.127*x^2 + rnorm(n,0,2)

plot(x,y)

## create a fold ID variable with one value for each unit
## here the indicator takes on the k-fold values {1,...,k}
folds <- sample(rep(1:k, k), n, replace=FALSE)

table(folds)

## create a data frame with the dependent variable, independent variable, and randomly created ID
dat <- data.frame(y, x, folds)

## create vectors for storing predictions
dat$y.hat0 <- NA
dat$y.hat1 <- NA
dat$y.hat2 <- NA

#test <- matrix(NA, nrow=k, ncol=2)

## loop over the k folds: fit each model using the other k-1 folds and predict the held-out fold
for(i in 1:k){

    ## fit an intercept-only linear model
    fit0 <- lm(y ~ 1, data=subset(dat, folds!=i))
    pred0 <- predict(fit0, newdata=subset(dat, folds==i))
    y.hat0 <- as.numeric(pred0)

    dat$y.hat0[dat$folds==i] <- y.hat0

    ## fit a linear model
    fit1 <- lm(y ~ x, data=subset(dat, folds!=i))
    pred1 <- predict(fit1, newdata=subset(dat, folds==i))
    y.hat1 <- as.numeric(pred1)

    dat$y.hat1[dat$folds==i] <- y.hat1

    ## fit a linear model with a squared term
    fit2 <- lm(y ~ x + I(x^2), data=subset(dat, folds!=i))
    pred2 <- predict(fit2, newdata=subset(dat, folds==i))
    y.hat2 <- as.numeric(pred2)

    dat$y.hat2[dat$folds==i] <- y.hat2

    print(summary(dat))
}

rmse.fit0 <- sqrt(mean((dat$y.hat0-dat$y)^2))
rmse.fit0

rmse.fit1 <- sqrt(mean((dat$y.hat1-dat$y)^2))
rmse.fit1

rmse.fit2 <- sqrt(mean((dat$y.hat2-dat$y)^2))
rmse.fit2

c(rmse.fit0, rmse.fit1, rmse.fit2)

cor.fit0 <- cor(dat$y.hat0, dat$y, method="spearman")
cor.fit0

cor.fit1 <- cor(dat$y.hat1, dat$y, method="spearman")
cor.fit1

cor.fit2 <- cor(dat$y.hat2, dat$y, method="spearman")
cor.fit2
c(cor.fit0, cor.fit1, cor.fit2)

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2022/Day04_Simulation_Inference/R_Demo_Simulation_Inference_2_Sample_Hold_Out.R:
--------------------------------------------------------------------------------
#### R_Demo_Simulation_Inference_2_Sample_Hold_Out.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2022
##
## Date: 2022-08-28
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
##
## Goal: Improve the predictive power or predictive validity of a model
##
##########################################################################
## Introduction to tutorial:
##
## (1) Begin building towards and learning about cross-validation
## (NOTE: There is no "crossing" yet)
##
## For this R tutorial we will simulate a dataset and then randomly divide it into two subsets.
##
## We will fit a model using the observations from one of the subsets of data (training data).
##
## We will then use the model estimates to predict the value of the dependent variable for the remaining out-of-sample data subset (testing data).
##
##########################################################################

set.seed(940)

## set number of observations for simulation
n <- 100

## simulation of variables (this model is based on one of the datasets from Anscombe's quartet)
x <- sample(4:14,n,replace=TRUE)
#y <- -5.996 + 2.781*x -0.127*x^2 + rnorm(n,0,1)
y <- -5.996 + 2.781*x -0.127*x^2 + rnorm(n,0,2)

## plot the simulated relationship
par(mfrow=c(1,1))
plot(x=x, y=y)

## create a subject/unit ID variable with one value for each unit
## here the indicator takes on the 2-fold values {1,2}
folds <- sample(rep(1:2, n/2), n, replace=FALSE)
folds

table(folds)

## create a data frame with the dependent variable, independent variable, and randomly created ID
dat <- data.frame(y, x, folds)

summary(dat)

head(dat)

## fit a linear model to the full dataset
model <- lm(y ~ x, data=dat)
summary(model)

## subset the full dataset into two subsets based on the ID variable
train <- subset(dat, folds==1)
test <- subset(dat, folds==2)

## equivalently, using bracket indexing
train <- dat[dat$folds==1,]
test <- dat[dat$folds==2,]

nrow(train)
nrow(test)

## Model 0: fit an intercept-only linear model
fit <- lm(y ~ 1, data=train)
pred <- predict(fit, newdata=test)
rmse <- sqrt(mean((as.numeric(pred)-test$y)^2))
rmse

## Model 1: fit a linear model
fit <- lm(y ~ x, data=train)
pred <- predict(fit, newdata=test)
rmse <- sqrt(mean((as.numeric(pred)-test$y)^2))
rmse

## Model 2: fit a linear model with a squared term
fit <- lm(y ~ x + I(x^2), data=train)
pred <- predict(fit, newdata=test)
rmse <- sqrt(mean((as.numeric(pred)-test$y)^2))
rmse

##################################################
## Model 2 again: fit the linear model with a squared term, but predict in-sample on the training data (compare this RMSE to the out-of-sample RMSE above)
fit <- lm(y ~ x + I(x^2), data=train)
pred <- predict(fit, newdata=train)
rmse <- sqrt(mean((as.numeric(pred)-train$y)^2))
rmse

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2022/Day04_Simulation_Inference/R_Demo_Simulation_Inference_naive_Bayes.R:
--------------------------------------------------------------------------------
#### R_Demo_Simulation_Inference_naive_Bayes.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2022
##
## Date: 2022-08-28
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
## Introduction to tutorial:
##
## Overview of Bayes rule and the calculation of conditional probability. Introduces the naive Bayes classifier function from the e1071 library.
##
## P(outcome | evidence) = P(outcome) * P(evidence | outcome) / P(evidence)
##
## Below is Bayes' Theorem:
## P(A | B) = P(A) * P(B | A) / P(B)
##
## Which can be derived from the general multiplication formula for AND events:
## P(A and B) = P(A) * P(B | A)
## P(B | A) = P(A and B) / P(A)
## P(B | A) = P(B) * P(A | B) / P(A)
## P(y|x) = P(x|y) * P(y) / P(x)
## P(x|y) = P(x AND y) / P(y)
##
## Pr(A[1]) = Pr(y==0)
## Pr(A[2]) = Pr(y==1)
## Pr(B | A[1]) = Pr(Data | y==0)
## Pr(B | A[2]) = Pr(Data | y==1)
##
##########################################################################

## load libraries
library(e1071)
library(LaplacesDemon)

## example code from the BayesTheorem() function
PrA <- c(0.75,0.25)
PrBA <- c(6/9, 5/7)
BayesTheorem(PrA, PrBA)

## create fake data
n <- 10
x <- c(rep(0,n/2), rep(1,n/2))
y <- c(0,0,0,1,1,0,0,1,1,1)

## inspect data
cbind(y,x)

## inspect tabulation of data
table(y,x)

## calculate the probability of the evidence/data
## note: Pr(x==1) is the denominator for both outcome classes, so the same value is stored twice
PrX <- NA
PrX[1] <- sum(as.numeric(x==1)) / n
PrX[2] <- sum(as.numeric(x==1)) / n

## calculate the probability of the outcome
PrY <- NA
PrY[1] <- sum(as.numeric(y==0))/n
PrY[2] <- sum(as.numeric(y==1))/n
PrY

## calculate the probability of the data conditional on the value of y (the likelihood)
PrXY <- NA
PrXY[1] <- sum(x[y==0])/length(as.numeric(x[y==0]))
PrXY[2] <- sum(x[y==1])/length(as.numeric(x[y==1]))
PrXY

## apply Bayes Rule
PrXY * PrY / PrX

## apply Bayes Rule with the BayesTheorem() function
BayesTheorem(PrA=PrY, PrBA=PrXY)

## apply Bayes Rule with the naiveBayes() function
fit <- naiveBayes(y~x, data=data.frame(y,x))
fit

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2022/Day04_Simulation_Inference/R_Demo_Simulation_Inference_naive_Bayes_10_Fold_Cross_Validation.R:
--------------------------------------------------------------------------------
#### R_Demo_Simulation_Inference_naive_Bayes_10_Fold_Cross_Validation.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F)
## University of Essex Summer School 2022
##
## Date: 2022-08-28
##
## Please e-mail me if you find any errors or have any suggestions (either email is fine)
## e-mail: cjf0006@gmail.com
## e-mail: cjfariss@umich.edu
##########################################################################
## Introduction to tutorial:
##
## (1) Program simulates count data that is then used to predict a binary outcome variable.
##
## (2) Three models are evaluated using the count data to predict the outcome:
## (2a) linear model
## (2b) generalized linear model with a logit link function
## (2c) naive Bayes classifier.
##
##########################################################################

## load library
library(e1071)

## simulate x1 and set the "true" population values alpha and beta
n <- 100

## unobserved
x <- runif(n,0,1)

## observed counts
x1 <- rpois(n, lambda=x)
x2 <- rpois(n, lambda=x)
x3 <- rpois(n, lambda=2*x)
x4 <- rpois(n, lambda=2*x)
x5 <- rpois(n, lambda=4*x)

## systematic component of the model based on observed counts
xb <- -2 + x1 + x2 + x3 + x4 + x5

## transform the linear term xb using
## the inverse logit function
## so that pi is bound from 0 to 1
pi <- 1 / (1 + exp(-xb))

## generate the dependent variable y with probability pi and measurement error from a Bernoulli trial
y <- rbinom(n, size=1, prob=pi)

## make data frame
dat <- data.frame(y, x1, x2, x3, x4, x5)

## summarize fit using linear model
summary(lm(y ~ x1 + x2 + x3 + x4 + x5, data=dat))

## summarize fit using glm using the logit link function
summary(glm(y ~ x1 + x2 + x3 + x4 + x5, family=binomial(link="logit")))

## summarize fit using naiveBayes model
naiveBayes(as.factor(y) ~ x1 + x2 + x3 + x4 + x5, data=dat)

## create vectors for storing predictions
dat$y.hat1 <- NA
dat$y.hat2 <- NA
dat$y.hat3 <- NA

## select number of folds
k <- 10

## create vector of folds for cross validation
dat$folds <- sample(rep(1:k, k), n, replace=FALSE)

## loop over the k folds: fit each model using the other k-1 folds and predict the held-out fold
for(i in 1:k){

    ## fit a linear model
    fit1 <- lm(y ~ x1 + x2 + x3 + x4 + x5, data=subset(dat, folds!=i))
    pred1 <- predict(fit1, newdata=subset(dat, folds==i))
    y.hat1 <- as.numeric(pred1)

    dat$y.hat1[dat$folds==i] <- y.hat1

    ## fit a glm model
    fit2 <- glm(y ~ x1 + x2 + x3 + x4 + x5, binomial(link="logit"), data=subset(dat, folds!=i))
    ## predict on the response (probability) scale so the predictions are comparable to y
    pred2 <- predict(fit2, newdata=subset(dat, folds==i), type="response")
    y.hat2 <- as.numeric(pred2)

    dat$y.hat2[dat$folds==i] <- y.hat2

    ## fit a naiveBayes classifier model
    fit3 <- naiveBayes(as.factor(y) ~ x1 + x2 + x3 + x4 + x5, data=subset(dat, folds!=i))
    pred3 <- predict(fit3, newdata=subset(dat, folds==i))
    ## convert the factor predictions back to 0/1 values (as.numeric alone would return the factor level codes 1/2)
    y.hat3 <- as.numeric(as.character(pred3))

    dat$y.hat3[dat$folds==i] <- y.hat3

    #print(summary(dat))
}

rmse.fit1 <- sqrt(mean((dat$y.hat1-dat$y)^2))
rmse.fit1
cor.fit1 <- cor(dat$y.hat1, dat$y, method="spearman")

rmse.fit2 <- sqrt(mean((dat$y.hat2-dat$y)^2))
rmse.fit2

cor.fit2 <- cor(dat$y.hat2, dat$y, method="spearman")

rmse.fit3 <- sqrt(mean((dat$y.hat3-dat$y)^2))
rmse.fit3

cor.fit3 <- cor(dat$y.hat3, dat$y, method="spearman")

c(cor.fit1, cor.fit2, cor.fit3)

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2022/Day05_Measurement_RSTAN/RSTAN_linear_model_simulation.R:
--------------------------------------------------------------------------------
## RSTAN_linear_model_simulation.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: A Crash Course in Measurement and Latent Variable Modeling with RSTAN
##
## Please e-mail me if you find any errors or have any suggestions
## e-mail: cjf0006@gmail.com
##
##########################################################################
##
## Introduction to tutorial:
##
## For this R tutorial we will estimate a linear model using the STAN program. The parameters will be estimated based on the likelihood function that links them to the data contained in the y and x variables, which are simulated observed data. The model produces the slope and intercept from a standard linear model, which is also estimated using lm() in R.
##
##########################################################################

## load library
library(rstan) # load rstan library
library(MASS) # load library with truehist function

## -------------------------------------------------- #
## define STAN model as a character
## -------------------------------------------------- #
model <- "
data {
    // declare the data in memory
    int n;
    vector[n] y;
    vector[n] x;
}
parameters {
    // declare the parameters in memory
    real alpha;
    real beta;
    real sigma;
}
model {
    // priors (the second argument of normal() is a standard deviation, not a precision)
    alpha ~ normal(0,10);
    beta ~ normal(0,10);

    // likelihood (link data to some combination of parameters and more data)
    for(i in 1:n){
        y[i] ~ normal(alpha + beta * x[i], sigma);
    }
}
generated quantities {
    // posterior predictions
    vector[n] y_predict;

    // the loop is necessary within the generated quantities block
    for(i in 1:n){
        y_predict[i] = normal_rng(alpha + beta * x[i], sigma);
    }
}
"
## -------------------------------------------------- #

## set data for simulation
n <- 100
x <- rnorm(n,0,1)
alpha <- 1.25
beta <- 2.50

## simulate a dependent variable with normally distributed error using the data and parameter values defined above
error <- rnorm(n)
y <- alpha + beta * x + error

plot(x=x, y=y)

## fit linear model
summary(lm(y~x))

## create data list
data_list <- list(y = y, x=x, n=n)

## set time start variable
time1 <- Sys.time()

## fit stan model
fit <- stan(model_code = model, data = data_list, iter = 1000, chains = 4)

## calculate the duration of the program file up to this point
print(Sys.time() - time1)

## extract draws from stan model object (creates a list object)
output <- extract(fit, permuted = TRUE)

## print names of each element/slot in the list
names(output)

## print model fit object
fit

## there are a number of methods to subset and summarize parameters
## keep in mind that the output object is a list that contains vectors or matrices of posterior estimates for each of the named parameters defined in the model statement above
## lapply (list-apply) a function to all of the objects in the list
lapply(output, mean)[1:3]
lapply(output, sd)[1:3]

## create a matrix using some of the named slots in the list
model_parameters <- as.matrix(fit, pars = c("alpha", "beta", "sigma"))
model_predictions <- as.matrix(fit, pars = "y_predict")

## check the dimensions (they should be the same)
dim(model_predictions)
dim(output$y_predict)

## plot the simulated y variable and the estimated posterior means
plot(y, apply(model_predictions,2,mean))

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2022/Day05_Measurement_RSTAN/RSTAN_logistic_regression_simulation.R:
--------------------------------------------------------------------------------
## RSTAN_logistic_regression_simulation.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: A Crash Course in Measurement and Latent Variable Modeling with RSTAN
##
## Please e-mail me if you find any errors or have any suggestions
## e-mail: cjf0006@gmail.com
##
##########################################################################
##
## Introduction to tutorial:
##
## For this R tutorial we will simulate a binary dependent variable and then estimate the parameters that generate the variable. The parameters will be estimated based on the likelihood function that links them to the data contained in the y and x variables, which are simulated observed data. The model produces the slope and intercept from a standard logistic regression model, which is also estimated using glm() in R.
##
##########################################################################

## load library
library(rstan) # load rstan library
library(MASS) # load library with truehist function

## -------------------------------------------------- ##
## define STAN model
## -------------------------------------------------- ##
model <- "
data {
    // declare the data in memory
    int n;
    int y[n];
    vector[n] x;
}
// declare the parameters in memory
parameters {
    real alpha;
    real beta;
}
model {
    // priors (the second argument of normal() is a standard deviation, not a precision)
    alpha ~ normal(0,10);
    beta ~ normal(0,10);

    // likelihood (link data to some combination of parameters and more data)
    y ~ bernoulli_logit(alpha + beta * x);
}
generated quantities {
    // posterior predictions
    vector[n] y_predict;

    // the loop is necessary within the generated quantities block
    for(i in 1:n){
        y_predict[i] = bernoulli_logit_rng(alpha + beta * x[i]);
    }
}
"
## -------------------------------------------------- ##

## simulate x1 and set the "true" population values alpha and beta
n <- 100
x <- rnorm(n,0,1)
alpha <- 1.25
beta <- 2.50

## systematic component of the model
xb <- alpha + beta * x

## transform the linear term xb using
## the inverse logit function
## so that eta is bound from 0 to 1
eta <- 1 / (1 + exp(-xb))

## generate the dependent variable y with probability eta and measurement error from a Bernoulli trial
y <- rbinom(n, size=1, prob=eta)

table(y)

## create data list
data_list <- list(y = y, x=x, n=n)

## fit a logistic regression model with glm for comparison
summary(glm(y~x, family=binomial(link="logit")))

## fit stan model
fit <- stan(model_code = model, data = data_list, iter = 1000, chains = 4)

## extract draws from stan model object
output <- extract(fit, permuted = TRUE)

## print names
names(output)

## there are a number of methods to subset and summarize parameters
## keep in mind that the output object is a list that contains vectors or matrices of posterior estimates for each of the named parameters defined in the model statement above
## lapply (list-apply) a function to all of the objects in the list
lapply(output, mean)
lapply(output, sd)

## tabulate the simulated binary dependent variable; it should be very close to the mean value of the predicted y
table(y)

## create a matrix using some of the named slots in the list
model_parameters <- as.matrix(fit, pars = c("alpha", "beta"))
model_predictions <- as.matrix(fit, pars = "y_predict")

## check the dimensions (they should be the same)
dim(model_predictions)
dim(output$y_predict)

## plot the simulated y variable and the estimated posterior means
plot(apply(model_predictions,2,mean), y)

--------------------------------------------------------------------------------
/Archive/R-Programs-lessons-2022/Day05_Measurement_RSTAN/RSTAN_mean_simulation.R:
--------------------------------------------------------------------------------
## RSTAN_mean_simulation.R
##########################################################################
## INSTRUCTOR: Christopher Fariss
## COURSE NAME: A Crash Course in Measurement and Latent Variable Modeling with RSTAN
-------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day05_Measurement_RSTAN/RSTAN_mean_simulation.R: -------------------------------------------------------------------------------- 1 | ## RSTAN_mean_simulation.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: A Crash Course in Measurement and Latent Variable Modeling with RSTAN 5 | ## 6 | ## Please e-mail me if you find any errors or have any suggestions 7 | ## e-mail: cjf0006@gmail.com 8 | ## 9 | ########################################################################## 10 | ## 11 | ## Introduction to tutorial: 12 | ## 13 | ## For this R tutorial we will use the normal distribution in STAN to estimate the mean value for an observed variable y. The parameter for the mean will be selected based on the likelihood function that links it to the data contained in the y variable. 14 | ## 15 | ########################################################################## 16 | 17 | ## load library 18 | library(rstan) # load rstan library 19 | library(MASS) # load library with truehist function 20 | 21 | ## -------------------------------------------------- # 22 | ## define STAN model as a character 23 | ## -------------------------------------------------- # 24 | model <- " 25 | data { 26 | // declare the data in memory 27 | int n; 28 | vector[n] y; 29 | } 30 | parameters { 31 | // declare the parameters in memory 32 | real mu; 33 | real sigma; 34 | } 35 | model { 36 | // there is no prior statement for sigma; 37 | // by default the prior on a parameter is flat unless we provide more information (mu gets a standard normal prior below; see the other examples) 38 | // likelihood (link data to some combination of parameters and more data) 39 | 40 | mu ~ normal(0,1); 41 | 42 | for(i in 1:n){ 43 | y[i] ~ normal(mu, sigma); 44 | } 45 | } 46 | generated quantities { 47 | // posterior predictions 48 | vector[n] y_predict; 49 | 50 | // the loop is necessary within the generated quantities block 51 | for(i in 1:n){ 52 | y_predict[i] = normal_rng(mu, sigma); 53 | } 54 | } 55 | " 56 | ## -------------------------------------------------- # 57 | 58 | 59 | ## set data for simulation 60 | #y <- 1:5 61 | y <- rep(1:5,200) 62 | 63 | n <- length(y) 64 | y 65 | n 66 | 67 | ## create data list 68 | data_list <- list(y = y, n=n) 69 | data_list 70 | 71 | ## set time start variable 72 | time1 <- Sys.time() 73 | 74 | # fit stan model 75 | fit <- stan(model_code = model, data = data_list, iter = 1000, chains = 4) 76 | 77 | ## calculate the duration of the program file up to this point 78 | print(Sys.time() - time1) 79 | 80 | ## extract draws from stan model object (creates a list object) 81 | output <- extract(fit, permuted = TRUE) 82 | 83 | ## print names of each element/slot in the list 84 | names(output) 85 | 86 | ## print model fit object 87 | fit 88 | 89 | ## lapply (list-apply) a function to all of the objects in the list 90 | lapply(output, mean) 91 | lapply(output, sd) 92 | 93 | truehist(output$mu) 94 |
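## For example, a quick check (using the objects above): the posterior mean of mu should sit near the sample mean of y
c(posterior_mean = mean(output$mu), sample_mean = mean(y))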
-------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day05_Measurement_RSTAN/RSTAN_normal_distribution_simulation.R: -------------------------------------------------------------------------------- 1 | ## RSTAN_normal_distribution_simulation.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: A Crash Course in Measurement and Latent Variable Modeling with RSTAN 5 | ## 6 | ## Please e-mail me if you find any errors or have any suggestions 7 | ## e-mail: cjf0006@gmail.com 8 | ## 9 | ########################################################################## 10 | ## 11 | ## Introduction to tutorial: 12 | ## 13 | ## For this R tutorial we will draw random samples from the normal distribution using the STAN program. 14 | ## This program is equivalent to using the rnorm function in R. 15 | ## 16 | ########################################################################## 17 | 18 | 19 | ## load library 20 | library(rstan) # load rstan library 21 | library(MASS) # load library with truehist function 22 | 23 | 24 | ## -------------------------------------------------- # 25 | ## define STAN model 26 | ## -------------------------------------------------- # 27 | model <- " 28 | 29 | parameters { 30 | real mu; 31 | } 32 | 33 | model { 34 | mu ~ normal(0,1); 35 | } 36 | " 37 | ## -------------------------------------------------- # 38 | 39 | 40 | ## set time start variable 41 | time1 <- Sys.time() 42 | 43 | ## fit stan model 44 | fit <- stan(model_code = model, iter = 1000, chains = 4) 45 | 46 | ## calculate the duration of the program file up to this point 47 | print(Sys.time() - time1) 48 | 49 | ## extract draws from stan model object (creates a list object) 50 | output <- extract(fit, permuted = TRUE) 51 | 52 | ## print names of each element/slot in the list 53 | names(output) 54 | 55 | ## print model fit object 56 | fit 57 | 58 | ## there are a number of methods to subset and summarize parameters 59 | ## keep in mind that the output object is a list that contains vectors or matrices of posterior estimates for each of the named parameters defined in the model statement above 60 | ## lapply (list-apply) a function to all of the objects in the list 61 | lapply(output, mean) 62 | lapply(output, sd) 63 | 64 | ## create a matrix using some of the named slots in the list 65 | model_parameters <- as.matrix(fit, pars = c("mu")) 66 | 67 | ## check the length of the vector of posterior draws 68 | length(output$mu) 69 | 70 | ## make a nice plot 71 | #par(mfrow=c(2,2)) 72 | truehist(output$mu) 73 | 74 |
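## For example, a quick visual check (using output$mu from above): the stan draws should line up with draws from rnorm
qqplot(output$mu, rnorm(length(output$mu)), main="stan draws vs. rnorm draws")
abline(0, 1)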
-------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day06_Text_as_data_programs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/Archive/R-Programs-lessons-2022/Day06_Text_as_data_programs/.DS_Store -------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day06_Text_as_data_programs/R_Demo_Ternary_Diriclet.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_Ternary_Diriclet.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 5 | ## University of Essex Summer School 2022 6 | ## 7 | ## Date: 2022-08-28 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | 15 | library(Ternary) 16 | library(MCMCpack) 17 | 18 | value1 <- value2 <- value3 <- value4 <- value5 <- value6 <- matrix(NA, nrow=1000, ncol=3) 19 | 20 | ## generate 6 different sets of 3D coordinates that exist within a simplex such that the coordinates for any point are proportions that always sum to 1 21 | value1[,1:3] <- rdirichlet(1000, alpha=c(1,1,5)) 22 | value2[,1:3] <- rdirichlet(1000, alpha=c(1,5,1)) 23 | value3[,1:3] <- rdirichlet(1000, alpha=c(10,10,1)) 24 | value4[,1:3] <- rdirichlet(1000, alpha=c(1,1,1)) 25 | value5[,1:3] <- rdirichlet(1000, alpha=c(10,10,10)) 26 | value6[,1:3] <- rdirichlet(1000, alpha=c(100,100,100)) 27 | 28 | head(value1[,1:3]) 29 | 30 | 31 | rdirichlet(10, alpha=c(1,1,1,1)) 32 | 33 | ## show that the coordinates within the simplex sum to 1 34 | table(apply(value1[,1:3],1,sum)) 35 | table(apply(value2[,1:3],1,sum)) 36 | table(apply(value3[,1:3],1,sum)) 37 | table(apply(value4[,1:3],1,sum)) 38 | table(apply(value5[,1:3],1,sum)) 39 | table(apply(value6[,1:3],1,sum)) 40 | 41 | ## test that the coordinates within the simplex sum to 1 42 | summary(apply(value1[,1:3],1,sum)) 43 | table(apply(value1[,1:3],1,sum)==1) 44 | 45 | ## unicode for right arrow: "\u2192" 46 | ## unicode for left arrow: "\u2190" 47 | 48 | par(mfrow=c(3,2), mar=c(1,1,1,1)) 49 | TernaryPlot(alab="Redder \u2192", blab="\u2190 Greener", clab="Bluer\u2192", 50 | point='right', lab.cex=0.8, grid.minor.lines = 0, 51 | grid.lty='solid', col=rgb(0.9, 0.9, 0.9), grid.col='white', 52 | axis.col=rgb(0.6, 0.6, 0.6), ticks.col=rgb(0.6, 0.6, 0.6), 53 | padding=0.08, main="simplex") 54 | TernaryPoints(value1, col=grey(.75)) 55 | 56 | TernaryPlot(alab="Redder \u2192", blab="\u2190 Greener", clab="Bluer\u2192", 57 | point='right', lab.cex=0.8, grid.minor.lines = 0, 58 | grid.lty='solid', col=rgb(0.9, 0.9, 0.9), grid.col='white', 59 | axis.col=rgb(0.6, 0.6, 0.6), ticks.col=rgb(0.6, 0.6, 0.6), 60 | padding=0.08, main="simplex") 61 | TernaryPoints(value2, col=grey(.75)) 62 | 63 | TernaryPlot(alab="Redder \u2192", blab="\u2190 Greener", clab="Bluer\u2192", 64 | point='right', lab.cex=0.8, grid.minor.lines = 0, 65 | grid.lty='solid', col=rgb(0.9, 0.9, 0.9), grid.col='white', 66 | axis.col=rgb(0.6, 0.6, 0.6), ticks.col=rgb(0.6, 0.6, 0.6), 67 | padding=0.08, main="simplex") 68 | TernaryPoints(value3, col=grey(.75)) 69 | 70 | TernaryPlot(alab="Redder \u2192", blab="\u2190 Greener", clab="Bluer\u2192", 71 | point='right', lab.cex=0.8, grid.minor.lines = 0, 72 | grid.lty='solid', col=rgb(0.9, 0.9, 0.9), grid.col='white', 73 | axis.col=rgb(0.6, 0.6, 0.6), ticks.col=rgb(0.6, 0.6, 0.6), 74 | padding=0.08, main="simplex") 75 | TernaryPoints(value4, col=grey(.75)) 76 | 77 | TernaryPlot(alab="Redder \u2192", blab="\u2190 Greener", clab="Bluer\u2192", 78 | point='right', lab.cex=0.8, grid.minor.lines = 0, 79 | grid.lty='solid', col=rgb(0.9, 0.9, 0.9), grid.col='white', 80 | axis.col=rgb(0.6, 0.6, 0.6), ticks.col=rgb(0.6, 0.6, 0.6), 81 | padding=0.08, main="simplex") 82 | TernaryPoints(value5, col=grey(.75)) 83 | 84 | TernaryPlot(alab="Redder \u2192", blab="\u2190 Greener", clab="Bluer\u2192", 85 | point='right', lab.cex=0.8, grid.minor.lines = 0, 86 | grid.lty='solid', col=rgb(0.9, 0.9, 0.9), grid.col='white', 87 | axis.col=rgb(0.6, 0.6, 0.6), ticks.col=rgb(0.6, 0.6, 0.6), 88 | padding=0.08, main="simplex") 89 | TernaryPoints(value6, col=grey(.75)) 90 | 91 |
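## For example, a Dirichlet draw is equivalent to normalizing a vector of independent gamma draws (a minimal sketch in base R; no MCMCpack needed):
g <- rgamma(3, shape=c(1,1,5))
g/sum(g) ## one draw with the same distribution as rdirichlet(1, alpha=c(1,1,5))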
-------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day06_Text_as_data_programs/R_Demo_text_as_data_DTM_stm_package.R: -------------------------------------------------------------------------------- 1 | #### R_Demo_text_as_data_DTM_stm_package.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 5 | ## University of Essex Summer School 2022 6 | ## 7 | ## Date: 2022-08-08 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## 14 | ## Introduction to tutorial: 15 | ## 16 | ## This tutorial replicates the processes demonstrated in the R_Demo_text_as_data_DTM.R file. 17 | ## 18 | ## This code uses functions available in the stm package (which builds on the tm package). 19 | ## 20 | ## The code is much faster at processing the text data though some of the steps are difficult to learn from these functions. 21 | ## 22 | ########################################################################## 23 | 24 | 25 | ## load libraries 26 | library(stm) 27 | library(tm) 28 | library(SnowballC) 29 | 30 | ## load data 31 | data <- read.csv("SIMpoliticalTweets.txt", header=FALSE) 32 | data 33 | names(data) <- "text" 34 | data 35 | 36 | #trumptweets <- fromJSON("trump_json_files_20190707.txt") 37 | #data <- trumptweets 38 | 39 | ## 40 | #data <- data[1:1000,] 41 | 42 | 43 | ## preprocess the documents 44 | ## This function uses functions from the tm package (see the tm Demo for more details) 45 | ## stem words and remove stop words 46 | prep <- textProcessor(documents=data$text, meta=data) 47 | 48 | ## list attributes 49 | attributes(prep) 50 | 51 | ## inspect 52 | head(prep$documents) 53 | head(prep$vocab) 54 | head(prep$meta) 55 | prep$docs.removed 56 | 57 | 58 | ## prepDocuments 59 | ## additional processing (removes some documents because of word frequencies greater than .99 or less than .01) 60 | out <- prepDocuments(prep$documents, prep$vocab, prep$meta) 61 | out 62 | 63 | ## inspect 64 | head(out$documents) 65 | head(out$vocab) 66 | head(out$meta) 67 | 68 | ## fit a structural topic model 69 | fit <- stm(documents=out$documents, vocab=out$vocab, data=out$meta, K=10) 70 | 71 | ## inspect attributes 72 | attributes(fit) 73 | 74 | dim(fit$theta) 75 | 76 | ## display topic probabilities 77 | fit$theta 78 | 79 | apply(fit$theta, 1, sum) 80 | 81 | out$meta 82 |
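## For example, a quick look at the most probable words in each of the 10 topics (using the fit object above):
labelTopics(fit, n=5)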
-------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day06_Text_as_data_programs/R_Demo_text_as_data_DTM_tm_package.R: -------------------------------------------------------------------------------- 1 | #### R_Demo_text_as_data_DTM_tm_package.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 5 | ## University of Essex Summer School 2022 6 | ## 7 | ## Date: 2022-08-08 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## 14 | ## Introduction to tutorial: 15 | ## 16 | ## This tutorial replicates the processes demonstrated in the R_Demo_text_as_data_DTM.R file. 17 | ## 18 | ## This code uses functions available in the tm package. 19 | ## 20 | ## The code is much faster at processing the text data though some of the steps are difficult to learn from these functions. 21 | ## 22 | ########################################################################## 23 | 24 | ## load libraries 25 | library(stm) 26 | library(tm) 27 | library(SnowballC) 28 | 29 | ## load data 30 | data <- read.csv("SIMpoliticalTweets.txt", header=FALSE) 31 | data 32 | names(data) <- "text" 33 | data 34 | 35 | #trumptweets <- fromJSON("trump_json_files_20190707.txt") 36 | #data <- trumptweets 37 | 38 | ## 39 | ##data <- data[1:1000,] 40 | 41 | ## create character vector for processing 42 | newtext <- as.character(data$text) 43 | length(newtext) 44 | 45 | 46 | ## use gsub to remove special characters that usually cause errors if left for later 47 | newtext <- gsub("[^0-9A-Za-z///' ]", "", newtext) 48 | newtext <- gsub("[^[:alnum:]///' ]", "", newtext) 49 | newtext <- gsub("[^\x20-\x7F\x0D\x0A]", "", newtext) # remove all non-ascii characters 50 | newtext <- gsub("http.*", "", newtext) # remove all of the urls 51 | newtext <- gsub("www.*", "", newtext) # remove the www addresses 52 | 53 | ## data$newtext 54 | data$newtext <- newtext 55 | 56 | ## convert to corpus object using additional functions from the tm package 57 | ## the tm_map function takes the corpus as its first argument and a transformation function as its second argument 58 | corpus <- Corpus(VectorSource(newtext)) 59 | corpus <- tm_map(corpus, removePunctuation) 60 | corpus <- tm_map(corpus, removeNumbers) 61 | corpus <- tm_map(corpus, stripWhitespace) 62 | corpus <- tm_map(corpus, tolower) 63 | corpus <- tm_map(corpus, removeWords, stopwords("english")) 64 | corpus <- tm_map(corpus, stemDocument) 65 | 66 | ## print to screen 67 | inspect(corpus[1:10]) 68 | 69 | 70 | ## make document by term matrix 71 | DTM <- DocumentTermMatrix(corpus) 72 | DTM 73 | 74 | 75 | ## print DTM to screen 76 | inspect(DTM) 77 | inspect(DTM[1:10,1:25]) 78 |
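## For example, a minimal sketch of word frequencies computed from the DTM above:
word_counts <- sort(colSums(as.matrix(DTM)), decreasing=TRUE)
head(word_counts) ## most frequent terms
tail(word_counts) ## least frequent terms
barplot(head(word_counts, 10), las=2)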
-------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day06_Text_as_data_programs/R_Demo_text_as_data_twitteR_get_twitter_users.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_text_as_data_twitteR_get_twitter_users.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 5 | ## University of Essex Summer School 2022 6 | ## 7 | ## Date: 2022-08-08 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | ## 15 | ## setup to access the twitter API using R 16 | ## 17 | ## Create a Twitter application at http://dev.twitter.com. Make sure to give the app read, write and direct message authority. 18 | ## Take note of the following values from the Twitter app page: "API key", "API secret", "Access token", and "Access token secret". 19 | 20 | ########################################################################## 21 | 22 | #install.packages("twitteR") 23 | library(twitteR) 24 | 25 | 26 | ## (1a) Go to twitter's developer page: https://developer.twitter.com/ 27 | ## (1b) Click to create an App 28 | ## (1c) For the description, you can say something like this "learn to use the twitteR API for a college data science course at the University of Michigan." 29 | ## 30 | ## (2) Once you have successfully set up your app, you will be able to get the four strings that you will fill in below 31 | ## 32 | ## Take note of the following values from the Twitter app page: "API key", "API secret", "Access token", and "Access token secret". 33 | ## 34 | ## set keys and tokens to access the twitter API 35 | consumer_key <- "your_consumer_key" 36 | consumer_secret <- "your_consumer_secret" 37 | access_token <- "your_access_token" 38 | access_secret <- "your_access_secret" 39 | 40 | setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret) 41 | 42 | ## (3) Run the following code to see if you successfully created your account 43 | amnestyusa_tweets <- userTimeline('amnestyusa', n=100) 44 | ## 45 | ## (4) Pick a Twitter account from your country of interest (or you can use the amnestyusa account from above) 46 | ## 47 | ## (5) Make a DTM (Document-by-Term matrix) for the first 100 tweets from the account you have selected or the amnestyusa account 48 | ## 49 | ## (6a) Which words are most commonly used in the corpus of 100 tweets? 50 | ## (6b) Which words are the least commonly used in the corpus of 100 tweets? 51 | ## (6c) Create a barplot of the most frequent words 52 | 53 | 54 | ########################################################################## 55 | ## additional examples: 56 | ## 57 | ## grab the most recent 100 tweets from Barack Obama 58 | Obama_ut <- userTimeline('barackobama', n=100) 59 | 60 | ## grab the max number of tweets for the President of Senegal Macky Sall 61 | Macky_Sall_ut <- userTimeline('Macky_Sall', n=3200) 62 |
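## For example, a minimal sketch for steps (5)-(6c), assuming the API calls above succeed (twListToDF is from the twitteR package; the DTM recipe follows the tm demo above):
library(tm)
tweet_text <- twListToDF(amnestyusa_tweets)$text
tweet_DTM <- DocumentTermMatrix(Corpus(VectorSource(tweet_text)), control=list(tolower=TRUE, removePunctuation=TRUE, stopwords=TRUE))
tweet_counts <- sort(colSums(as.matrix(tweet_DTM)), decreasing=TRUE)
head(tweet_counts) ## (6a) most common words
tail(tweet_counts) ## (6b) least common words
barplot(head(tweet_counts, 10), las=2) ## (6c) barplot of the most frequent words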
-------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day06_Text_as_data_programs/R_Demo_text_as_data_twitteR_get_twitter_users_Oauth_v2.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_text_as_data_twitteR_get_twitter_users_Oauth_v2.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 5 | ## University of Essex Summer School 2022 6 | ## 7 | ## Date: 2022-08-08 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | ## 15 | ## setup to access the twitter API using R 16 | ## 17 | ## Create a Twitter application at http://dev.twitter.com. Make sure to give the app read, write and direct message authority. 18 | ## Take note of the following values from the Twitter app page: "API key", "API secret", "Access token", and "Access token secret". 19 | 20 | ########################################################################## 21 | 22 | #install.packages("twitteR") 23 | library(twitteR) 24 | 25 | 26 | ## (1a) Go to twitter's developer page: https://developer.twitter.com/ 27 | ## SEE THIS LINK: https://developer.twitter.com/en/docs/tutorials/getting-started-with-r-and-v2-of-the-twitter-api 28 | ## (1b) Click to create an App 29 | ## (1c) For the description, you can say something like this "learn to use the twitteR API for a college data science course at the University of Michigan." 30 | ## 31 | ## (2) Once you have successfully set up your app, you will be able to get the four strings that you will fill in below 32 | ## 33 | ## Take note of the following values from the Twitter app page: "API key", "API secret", "Access token", and "Access token secret". 34 | ## 35 | ## set keys and tokens to access the twitter API 36 | consumer_key <- "your_consumer_key" 37 | consumer_secret <- "your_consumer_secret" 38 | access_token <- "your_access_token" 39 | access_secret <- "your_access_secret" 40 | 41 | setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret) 42 | 43 | ## (3) Run the following code to see if you successfully created your account 44 | amnestyusa_tweets <- userTimeline('amnestyusa', n=100) 45 | ## 46 | ## (4) Pick a Twitter account from your country of interest (or you can use the amnestyusa account from above) 47 | ## 48 | ## (5) Make a DTM (Document-by-Term matrix) for the first 100 tweets from the account you have selected or the amnestyusa account 49 | ## 50 | ## (6a) Which words are most commonly used in the corpus of 100 tweets? 51 | ## (6b) Which words are the least commonly used in the corpus of 100 tweets? 52 | ## (6c) Create a barplot of the most frequent words 53 | 54 | 55 | ########################################################################## 56 | ## additional examples: 57 | ## 58 | ## grab the most recent 100 tweets from Barack Obama 59 | Obama_ut <- userTimeline('barackobama', n=100) 60 | 61 | ## grab the max number of tweets for the President of Senegal Macky Sall 62 | Macky_Sall_ut <- userTimeline('Macky_Sall', n=3200) 63 |
-------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day06_Text_as_data_programs/R_Demo_text_as_wikip_stm.R: -------------------------------------------------------------------------------- 1 | ########################################################################## 2 | ## INSTRUCTOR: Christopher Fariss 3 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 4 | ## University of Essex Summer School 2021 5 | ## 6 | ## Date: 2022-08-08 7 | ## 8 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 9 | ## e-mail: cjf0006@gmail.com 10 | ## e-mail: cjfariss@umich.edu 11 | ########################################################################## 12 | ## Introduction to tutorial: 13 | ## 14 | ## This tutorial demonstrates how the Structural Topic Model functions for a real set of documents. 15 | ## 16 | ## Each unit in the dataset represents a randomly selected sentence from Wikipedia. 17 | ## 18 | ########################################################################## 19 | 20 | library(stm) 21 | 22 | wikip_word_dat <- readLines("one_meelyun_sentences.txt") 23 | head(wikip_word_dat) 24 | 25 | n <- length(wikip_word_dat) 26 | n 27 | 28 | wikip_word_dat <- data.frame(id=1:n, text=wikip_word_dat) 29 | head(wikip_word_dat) 30 | 31 | 32 | wikip_word_dat$human_rights_test <- as.numeric(grepl("human rights", wikip_word_dat$text)) 33 | table(wikip_word_dat$human_rights_test) 34 | 35 | wikip_word_dat$civil_rights_test <- as.numeric(grepl("civil rights", wikip_word_dat$text)) 36 | table(wikip_word_dat$civil_rights_test) 37 | 38 | ## preprocess the documents 39 | ## This function uses functions from the tm package (see the tm Demo for more details) 40 | prep <- textProcessor(documents=wikip_word_dat$text, meta=wikip_word_dat) 41 | 42 | save(prep, file="wikip_word_dat_stm_prep.Rdata") ## save the textProcessor output (prep is the only object created at this point) 43 | 44 | ## inspect 45 | head(prep$documents) 46 | head(prep$vocab) 47 | head(prep$meta) 48 | 49 | 50 | ## prepDocuments 51 | ## additional processing of the stemmed, stop-word-removed documents created by textProcessor above 52 | out <- prepDocuments(prep$documents, prep$vocab, prep$meta) 53 | 54 | save(out, file="wikip_word_dat_stm_out.Rdata") 55 | 56 | ## inspect 57 | head(out$documents) 58 | head(out$vocab) 59 | head(out$meta) 60 | 61 | ## fit a structural topic model 62 | #fit <- stm(documents=out$documents, vocab=out$vocab, data=out$meta, K=20) 63 | #fit <- stm(documents=out$documents, vocab=out$vocab, data=out$meta, K=40) 64 | fit <- stm(documents=out$documents, vocab=out$vocab, data=out$meta, K=100) 65 | 66 | #fit_2 <- fit 67 | #fit_3 <- fit 68 | #fit_10 <- fit 69 | #fit_20 <- fit 70 | fit_100 <- fit 71 | 72 | ## display topic probabilities 73 | head(fit$theta) 74 | 75 | dim(fit$theta) 76 | summary(head(fit$theta, 1000)) 77 | 78 | #save(fit, file="wikip_word_dat_stm20.Rdata") 79 | #save(fit, file="wikip_word_dat_stm40.Rdata") 80 | save(fit, file="wikip_word_dat_stm100.Rdata") 81 | 82 | 83 | -------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day06_Text_as_data_programs/google_trends_stuff/R_Demo_google_trends_Texas.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_google_trends_Texas.R 2 | ######################################################################### 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 5 | ## University of Essex Summer School 2022 6 | ## 7 | ## Date: 2022-08-28 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | rm(list=ls()) 15 | 16 | ## load libraries 17 | library(gtrendsR) 18 | library(countrycode) 19 | library(stm) 20 | library(tm) 21 | library(MASS) 22 | library(RColorBrewer) ## assumption: the intended CRAN package is RColorBrewer 23 | library(bcp) 24 | 25 | 26 | COLORS <- c("#fdae61", "#a6cee3", "#1f78b4", "#b2df8a", "#33a02c") 27 | 28 | 29 | #pdf("Texas_google_trends.pdf", height=5, width=6) 30 | 31 | #plot.new() 32 | 33 | TERMS <- list(c("%2Fg%2F11f06hfpld", "%2Fg%2F11nycqk4xh"), 34 | c("%2Fg%2F11f06hfpld", "%2Fm%2F03m1n"), 35 | c("%2Fm%2F03m1n", "%2Fg%2F11nycqk4xh"), 36 | c("%2Fg%2F11nycqk4xh", "%2Fm%2F01rbhn"), 37 | c("%2Fg%2F11f06hfpld", "%2Fm%2F01rbhn"), 38 | c("%2Fm%2F03m1n", "%2Fm%2F01rbhn") 39 | ) 40 | length(TERMS) 41 | 42 | TERMS_names <- list(c("Hurricane Harvey",
"Winter Storm"), 43 | c("Hurricane Harvey", "Astros"), 44 | c("Astros", "Winter Storm"), 45 | c("Winter Storm", "Power Outage"), 46 | c("Hurricane Harvey", "Power Outage"), 47 | c("Astros", "Power Outage") 48 | ) 49 | 50 | #par(mfrow=c(3,3), mar=c(2,2.5,1,.5)) 51 | par(mfrow=c(1,1), mar=c(2,2.5,1,.5)) 52 | 53 | for(i in 1:6){ 54 | 55 | world <- gtrends(TERMS[[i]], geo="US-TX", time="2017-01-01 2021-12-31", low_search_volume=T)$interest_over_time 56 | 57 | world$hits[world$hits=="<1"] <- .5 58 | world$hits <- as.numeric(world$hits) 59 | 60 | plot(world$hits[world$keyword==TERMS[[i]][1]], main=paste("Texas:", TERMS_names[[i]][1], "vs.", TERMS_names[[i]][2]), lwd=1, col=grey(.75), ylim=c(0, 100), xaxt="n", yaxt="n", type="n") 61 | #lines(world$hits[world$keyword==TERMS[[i]][2]], type="l", ylim=c(0,100), col=2) 62 | #plot.window(xlim=c(1,length(world$hits[world$keyword==TERMS[[i]][1]])), ylim=c(0, 100)) 63 | #mtext(side=3, text=paste("Global:", TERMS[[i]][1], "vs.", TERMS[[i]][2]), line=1) 64 | 65 | id2017 <- which(as.Date(world$date[world$keyword==TERMS[[i]][1]]) > as.Date("2016-12-31") & as.Date(world$date[world$keyword==TERMS[[i]][1]]) <= as.Date("2017-12-31")) 66 | id2018 <- which(as.Date(world$date[world$keyword==TERMS[[i]][1]]) > as.Date("2017-12-31") & as.Date(world$date[world$keyword==TERMS[[i]][1]]) <= as.Date("2018-12-31")) 67 | id2019 <- which(as.Date(world$date[world$keyword==TERMS[[i]][1]]) > as.Date("2018-12-31") & as.Date(world$date[world$keyword==TERMS[[i]][1]]) <= as.Date("2019-12-31")) 68 | id2020 <- which(as.Date(world$date[world$keyword==TERMS[[i]][1]]) > as.Date("2019-12-31") & as.Date(world$date[world$keyword==TERMS[[i]][1]]) <= as.Date("2020-12-31")) 69 | id2021 <- which(as.Date(world$date[world$keyword==TERMS[[i]][1]]) > as.Date("2020-12-31") & as.Date(world$date[world$keyword==TERMS[[i]][1]]) <= as.Date("2021-12-31")) 70 | 71 | polygon(x=c(min(id2017), min(id2017), max(id2017), max(id2017)), y=c(-10,110,110,-10), col=grey(.95), border=NA) 72 | polygon(x=c(min(id2019), min(id2019), max(id2019), max(id2019)), y=c(-10,110,110,-10), col=grey(.95), border=NA) 73 | polygon(x=c(min(id2021), min(id2021), max(id2021), max(id2021)), y=c(-10,110,110,-10), col=grey(.95), border=NA) 74 | box() 75 | 76 | lines(world$hits[world$keyword==TERMS[[i]][1]], lwd=1, col="#c2a5cf") 77 | lines(world$hits[world$keyword==TERMS[[i]][2]], lwd=1, col="#a6dba0") 78 | axis(side=2, at=c(0,25,50,75,100), las=2) 79 | axis(side=1, at=c(median(id2017), median(id2018), median(id2019), median(id2020), median(id2021)), labels=c(2017:2021), las=1) 80 | model <- bcp(y=world$hits[world$keyword==TERMS[[i]][1]]) 81 | lines(model$posterior.mean, lwd=.75, col="#7b3294") 82 | model <- bcp(y=world$hits[world$keyword==TERMS[[i]][2]]) 83 | lines(model$posterior.mean, lwd=.75, col="#008837") 84 | 85 | legend("topleft", legend=c(TERMS_names[[i]][1]), text.col="#7b3294" ,bty="n") 86 | legend("bottomleft", legend=c(TERMS_names[[i]][2]), text.col="#008837", bty="n") 87 | 88 | } 89 | 90 | #dev.off() 91 | -------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day06_Text_as_data_programs/google_trends_stuff/R_Demo_google_trends_trump_brexit.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_google_trends_trump_brexit.R 2 | ######################################################################### 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 
5 | ## University of Essex Summer School 2022 6 | ## 7 | ## Date: 2022-08-28 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | 15 | ## clean up workspace 16 | rm(list = ls(all.names = TRUE)) 17 | gc() 18 | 19 | ## load libraries 20 | library(gtrendsR) 21 | library(countrycode) 22 | library(stm) 23 | library(tm) 24 | library(MASS) 25 | library(RColorBrewer) ## assumption: the intended CRAN package is RColorBrewer 26 | 27 | ## country codes 28 | data("countries") 29 | ISO <- as.character(unique(countries$country_code)) 30 | COUNTRY <- countrycode(ISO, origin="iso2c", destination="country.name") 31 | 32 | 33 | COLORS <- c("#fdae61", "#a6cee3", "#1f78b4", "#b2df8a", "#33a02c") 34 | 35 | 36 | ## 37 | TERMS <- c("Donald Trump", "Brexit") 38 | 39 | 40 | 41 | world <- gtrends(TERMS[c(1,2)]) 42 | plot(world) 43 | 44 | 45 | -------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day06_Text_as_data_programs/google_trends_stuff/groundhog_library_func.R: -------------------------------------------------------------------------------- 1 | 2 | groundhog_library_func <- function(groundhog=FALSE, regular_install=FALSE){ 3 | ## Do this (set to TRUE) to load libraries using the version from when the scripts were originally run 4 | if(groundhog){ 5 | ## load an older version of the libraries 6 | remotes::install_github('CredibilityLab/groundhog') 7 | library(groundhog) 8 | pkgs <- c("gtrendsR", "countrycode", "stm", "tm", "MASS", "bcp", "ngramr", "rvest", "plm", "lmtest", "WDI", "boot", "forecast", "acled.api", "ggplot2", "stargazer", "httr", "lubridate", "xtable") 9 | groundhog.library(pkgs,'2022-05-23') 10 | } else if(regular_install==TRUE){ 11 | ## or install and load the more recent version of the libraries 12 | install.packages(c("gtrendsR", "countrycode", "stm", "tm", "MASS", "bcp", "ngramr", "rvest", "plm", "lmtest", "WDI", "boot", "forecast", "acled.api", "ggplot2", "stargazer", "httr", "lubridate", "xtable")) ## install.packages() takes the package names as a single character vector 13 | library(gtrendsR) 14 | library(countrycode) 15 | library(stm) 16 | library(tm) 17 | library(MASS) 18 | library(bcp) 19 | library(ngramr) 20 | library(rvest) 21 | library(plm) 22 | library(lmtest) 23 | library(WDI) 24 | library(boot) 25 | library(forecast) 26 | library(acled.api) 27 | library(ggplot2) 28 | library(stargazer) 29 | library(httr) 30 | library(lubridate) 31 | library(xtable) 32 | } else{ 33 | ## or just load the more recent version of the libraries 34 | library(gtrendsR) 35 | library(countrycode) 36 | library(stm) 37 | library(tm) 38 | library(MASS) 39 | library(bcp) 40 | library(ngramr) 41 | library(rvest) 42 | library(plm) 43 | library(lmtest) 44 | library(WDI) 45 | library(boot) 46 | library(forecast) 47 | library(acled.api) 48 | library(ggplot2) 49 | library(stargazer) 50 | library(httr) 51 | library(lubridate) 52 | library(xtable) 53 | } 54 | } 55 | 56 |
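## For example, a minimal usage sketch (an assumption about the intended call pattern, shown commented out because each call installs or loads many packages):
## groundhog_library_func(groundhog=TRUE)         ## load package versions pinned to 2022-05-23
## groundhog_library_func(regular_install=TRUE)   ## install and then load current versions
## groundhog_library_func()                       ## just load current versions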
-------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day07_Applied_Machine_Learning_AML_Intro/R_Demo_AML_neuralnet_gradiant_decent_glm.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_AML_neuralnet_gradiant_decent_glm.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 5 | ## University of Essex Summer School 2022 6 | ## 7 | ## Date: 2022-08-08 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | ## 15 | ## Overview of Gradient Descent for a generalized linear model example. 16 | ## 17 | ## Gradient descent is an iterative process used to find the best parameter value(s) in any model and especially a neural network. 18 | ## 19 | ########################################################################## 20 | 21 | ## load libraries; then set the learning rate, which varies on the unit interval (0 to 1] 22 | library(MASS) 23 | library(boot) 24 | library(gtools) 25 | 26 | lr <- .2 27 | n <- 100 28 | x <- rnorm(n) 29 | 30 | alpha <- -1 31 | beta <- -2 32 | 33 | y <- rbinom(1:n, 1, prob=inv.logit(alpha + beta*x)) 34 | #y <- rbinom(1:n, 1, prob=inv.logit(alpha + beta*x + rnorm(n))) 35 | 36 | ## Start with a random guess 37 | X_mat <- cbind(1,x) 38 | 39 | alpha_hat <- 4 40 | beta_hat <- 4 41 | iterations <- 1000 42 | 43 | loss <- variance <- NA 44 | 45 | y_hat <- matrix(NA, nrow=n, ncol=iterations) 46 | y_error <- matrix(NA, nrow=n, ncol=iterations) 47 | delta <- matrix(NA, nrow=2, ncol=iterations) 48 | 49 | ## sequential iterations to evaluate the loss function 50 | for (j in 1:iterations){ 51 | 52 | ## calculate the predicted y_hat based on the observed x variable and the best guess of alpha and beta 53 | y_hat[,j] <- inv.logit(alpha_hat[j] + beta_hat[j] * x) 54 | 55 | ## this works too but not as well as above 56 | #y_hat[,j] <- rbinom(1:n,1,prob=inv.logit(alpha_hat[j] + beta_hat[j] * x)) 57 | 58 | ## difference between the predicted y (y_hat) and y is the error for y 59 | y_error[,j] <- y_hat[,j] - y 60 | 61 | ## the estimated error is used to calculate the unexplained variance between y and y_hat, which is the sum of squared errors 62 | loss[j] <- sum(y_error[,j]^2) 63 | 64 | ## calculate the gradient at that point (this works because the errors are independent of the column vectors in X) 65 | ##delta[1:2,j] <- (t(X_mat) %*% y_error[,j]) * (1/n) 66 | delta[1,j] <- sum(X_mat[,1] * y_error[,j])/n 67 | delta[2,j] <- sum(X_mat[,2] * y_error[,j])/n 68 | 69 | ## shift estimates along the gradient (+ function if y-y_hat; - function if y_hat-y) 70 | alpha_hat[j+1] <- alpha_hat[j] - (lr*delta[1,j]) 71 | beta_hat[j+1] <- beta_hat[j] - (lr*delta[2,j]) 72 | } 73 | 74 | ## estimate glm model 75 | fit <- glm(y~x, family=binomial("logit")) 76 | fit 77 | 78 | ## print the last value of the sequence of parameter estimates 79 | alpha_hat[length(alpha_hat)] 80 | beta_hat[length(beta_hat)] 81 | 82 | 83 | ## graph the values as a function of the loss statistic 84 | ALPHA_seq <- seq(from=-4,4,.05) 85 | BETA_seq <- seq(from=-4,4,.05) 86 | 87 | LOSS <- matrix(NA, nrow=length(ALPHA_seq), ncol=length(BETA_seq)) 88 | 89 | ## grid search for graphing 90 | for(i in 1:length(ALPHA_seq)){ 91 | for(j in 1:length(BETA_seq)){ 92 | Y_hat <- inv.logit(ALPHA_seq[i] + BETA_seq[j] * X_mat[,2]) 93 | LOSS[i,j] <- sum((y-Y_hat)^2) 94 | 95 | } 96 | } 97 | 98 | ## plot the loss surface 99 | contour(ALPHA_seq, BETA_seq, LOSS, xlab=expression(hat(alpha)), ylab=expression(hat(beta)), cex.lab=1.5) 100 | lines(alpha_hat, beta_hat, col=2, lwd=4) 101 | 102 |
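## For example, the loss should fall toward its minimum along the gradient path (using the loss vector computed above):
plot(loss, type="l", xlab="iteration", ylab="sum of squared errors")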
-------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day07_Applied_Machine_Learning_AML_Intro/R_Demo_AML_neuralnet_gradiant_decent_lm.R: -------------------------------------------------------------------------------- 1 | #### R_Demo_AML_neuralnet_gradiant_decent_lm.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 5 | ## University of Essex Summer School 2022 6 | ## 7 | ## Date: 2022-08-08 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | ## 15 | ## Overview of Gradient Descent for a linear model with two parameters (intercept and slope). 16 | ## 17 | ## Gradient descent is an iterative process used to find the best parameter value(s) in any model and especially a neural network. 18 | ## 19 | ########################################################################## 20 | 21 | 22 | ## load libraries 23 | library(MASS) 24 | 25 | ## set the learning rate; it varies on the unit interval (0 to 1] 26 | lr <- .2 27 | 28 | n <- 100 29 | x <- rnorm(n) 30 | 31 | alpha <- -1 32 | beta <- -2 33 | 34 | y <- alpha + beta*x + rnorm(n) 35 | plot(x=x, y=y) 36 | fit <- lm(y ~ x) 37 | summary(fit) 38 | 39 | 40 | X_mat <- cbind(1,x) 41 | head(X_mat) 42 | 43 | # Start with a random guess 44 | alpha_hat <- 1 45 | beta_hat <- 4 46 | 47 | 48 | iterations <- 100 49 | 50 | loss <- NA 51 | variance <- NA 52 | 53 | y_hat <- matrix(NA, nrow=n, ncol=iterations) 54 | y_error <- matrix(NA, nrow=n, ncol=iterations) 55 | delta <- matrix(NA, nrow=2, ncol=iterations) 56 | 57 | ## sequential iterations to evaluate the loss function 58 | for (j in 1:iterations){ 59 | 60 | ## calculate the predicted y_hat based on the observed x variable and the best guess of alpha and beta 61 | y_hat[,j] <- alpha_hat[j] + beta_hat[j] * x 62 | 63 | ## difference between the predicted y (y_hat) and y is the error for y 64 | y_error[,j] <- y_hat[,j] - y 65 | 66 | ## the estimated error is used to calculate the unexplained variance between y and y_hat, which is the sum of squared errors 67 | loss[j] <- sum(y_error[,j]^2) 68 | 69 | ## calculate the gradient at that point (this works because the errors are independent of the column vectors in X) 70 | ##delta[1:2,j] <- (t(X_mat) %*% y_error[,j]) * (1/n) 71 | 72 | ## 73 | delta[1,j] <- sum(X_mat[,1] * y_error[,j])/n 74 | delta[2,j] <- sum(X_mat[,2] * y_error[,j])/n 75 | 76 | ## shift estimates along the gradient (+ function if y-y_hat; - function if y_hat-y) 77 | alpha_hat[j+1] <- alpha_hat[j] - (lr*delta[1,j]) 78 | beta_hat[j+1] <- beta_hat[j] - (lr*delta[2,j]) 79 | } 80 | 81 | ## print lm summary from above 82 | summary(fit) 83 | 84 | ## print the last value of the sequence of parameter estimates 85 | alpha_hat[length(alpha_hat)] 86 | beta_hat[length(beta_hat)] 87 | 88 | 89 | ## the least squares solution 90 | solve(t(X_mat) %*% X_mat) %*% t(X_mat) %*% y 91 | 92 | ## note that these give distinct regression models because the covariance between alpha and beta is not included (so they are different from the one above) 93 | solve(t(X_mat[,2]) %*% X_mat[,2]) %*% t(X_mat[,2]) %*% y 94 | solve(t(X_mat[,1]) %*% X_mat[,1]) %*% t(X_mat[,1]) %*% y 95 | 96 |
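## For example, a quick check (using the objects above): lm(), the closed-form least squares solution, and gradient descent should all agree
cbind(lm = coef(fit), gradient_descent = c(alpha_hat[length(alpha_hat)], beta_hat[length(beta_hat)]))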
97 | ## graph the values as a function of the loss statistic 98 | ALPHA_seq <- seq(from=-4,4,.05) 99 | BETA_seq <- seq(from=-4,4,.05) 100 | 101 | LOSS <- matrix(NA, nrow=length(ALPHA_seq), ncol=length(BETA_seq)) 102 | 103 | ## grid search for graphing 104 | for(i in 1:length(ALPHA_seq)){ 105 | for(j in 1:length(BETA_seq)){ 106 | Y_hat <- ALPHA_seq[i] + BETA_seq[j] * X_mat[,2] 107 | LOSS[i,j] <- sum((y-Y_hat)^2) 108 | } 109 | } 110 | 111 | ## plot the log loss 112 | par(mar=c(5,5,1,1)) 113 | contour(ALPHA_seq, BETA_seq, log(LOSS), xlab=expression(hat(alpha)), ylab=expression(hat(beta)), cex.lab=1.5) 114 | lines(alpha_hat, beta_hat, col=2, lwd=4) 115 | 116 | 117 | -------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day07_Applied_Machine_Learning_AML_Intro/R_Demo_AML_neuralnet_gradiant_decent_mu.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_AML_neuralnet_gradiant_decent_mu.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 5 | ## University of Essex Summer School 2022 6 | ## 7 | ## Date: 2022-08-08 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | ## 15 | ## Overview of Gradient Descent for a model with one parameter for a mean. 16 | ## 17 | ## Gradient descent is an iterative process used to find the best parameter value(s) in any model and especially a neural network. 18 | ## 19 | ########################################################################## 20 | 21 | 22 | 23 | 24 | 25 | ## load libraries 26 | library(MASS) 27 | 28 | ## set the learning rate; it varies on the unit interval (0 to 1] 29 | lr <- .2 30 | #lr <- .01 31 | #lr <- 1 32 | 33 | y <- c(1,2,3,4,5) 34 | ##y <- rnorm(1000, pi, 1) 35 | n <- length(y) 36 | 37 | y 38 | n 39 | 40 | 41 | iterations <- 100 42 | 43 | loss <- NA 44 | 45 | mu_hat <- -1.5 46 | 47 | y_hat <- matrix(NA, nrow=n, ncol=iterations) 48 | y_error <- matrix(NA, nrow=n, ncol=iterations) 49 | 50 | delta_mu_hat <- NA 51 | 52 | ## sequential iterations to evaluate the loss function 53 | for (j in 1:iterations){ 54 | 55 | ## y_hat: calculate the predicted y_hat based on the best guess of mu 56 | 57 | y_hat[,j] <- mu_hat[j] 58 | 59 | ## residual: the difference between the predicted y (y_hat) and y is the error for y 60 | y_error[,j] <- y_hat[,j] - y 61 | 62 | ## loss: the estimated error is used to calculate the unexplained variance between y and y_hat, which is the sum of squared errors 63 | loss[j] <- sum(y_error[,j]^2) 64 | 65 | ## the average error is the gradient of the loss at the current estimate (up to a constant factor) 66 | delta_mu_hat[j] <- sum(y_error[,j])/n 67 | 68 | ## shift estimates along the gradient (+ function if y-y_hat; - function if y_hat-y) 69 | mu_hat[j+1] <- mu_hat[j] - (lr*delta_mu_hat[j]) 70 | } 71 | 72 | mu_hat[length(mu_hat)] 73 | 74 | plot(mu_hat) 75 | 76 | plot(mu_hat[-1], delta_mu_hat) 77 |
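## For example, gradient descent should converge to the sample mean, which is the closed-form minimizer of the squared error loss:
c(gradient_descent = mu_hat[length(mu_hat)], sample_mean = mean(y))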
-------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day08_Applied_Machine_Learning_AML_HuggingFace/R_Demo_huggingface_intro.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_huggingface_intro.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (3B or 2F) 5 | ## University of Essex Summer School 2022 6 | ## 7 | ## Date: 2022-08-17 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## 14 | ## Introduction to tutorial: 15 | ## 16 | ## We are going to try out a number of pretrained Large Language Models --- LLMs --- that are publicly available at 🤗 17 | ## 18 | ## 🤗 huggingface models: https://huggingface.co/models 19 | ## 20 | ## 🤗 huggingface datasets: https://huggingface.co/datasets 21 | ## 22 | ## Bert LLM (and many others) 23 | ## https://huggingface.co/docs/transformers/v4.20.1/en/model_doc/bert 24 | ## 25 | ## BloomLLM 26 | ## BigScience Large Open-science Open-access Multilingual Language Model 27 | ## https://huggingface.co/bigscience/bloom 28 | ## 29 | ## more examples with R 30 | ## https://rpubs.com/eR_ic/transfoRmers 31 | ## 32 | ## for loading Python libraries and setting up pipelines (functions) 33 | ## https://rstudio.github.io/reticulate/reference/py_install.html 34 | ## for details see https://rstudio.github.io/reticulate/ 35 | ## 36 | ########################################################################## 37 | ## 38 | ## install.packages("reticulate") 39 | 40 | ## load R libraries 41 | library(reticulate) 42 | library(keras) 43 | library(tensorflow) 44 | library(dplyr) 45 | library(tfdatasets) 46 | library(torch) 47 | 48 | ## load Python libraries using the reticulate package 49 | reticulate::py_install("transformers", pip = TRUE) 50 | 51 | ## also try these 52 | reticulate::py_install("torch", pip = TRUE) ## assumption: the pip package name for PyTorch is "torch" 53 | reticulate::py_install("tensorflow", pip = TRUE) 54 | 55 | ## or try this 56 | #tensorflow::install_tensorflow() 57 | 58 | # Importing 🤗 transformers module into R session using the Python-to-R reticulate package 59 | transformers <- reticulate::import("transformers") 60 | 61 | # get a pretrained Tokenizer 62 | transformers$RobertaTokenizer$from_pretrained('roberta-base', do_lower_case=TRUE) 63 | 64 | # Instantiate a Python pipeline (this is now a function that we can pass inputs into) 65 | ## more information on pipelines here: https://huggingface.co/docs/transformers/main_classes/pipelines 66 | ## the text-classification pipeline is for "sentiment-analysis" 67 | classifier <- transformers$pipeline(task = "text-classification") 68 | 69 | ## load the simple fake text data 70 | data <- read.csv("SIMpoliticalTweets.txt", header=FALSE) 71 | names(data) <- "text" 72 | head(data) 73 | 74 | # Generate predictions using the Python pipeline classifier() function 75 | outputs <- classifier(data$text) 76 | outputs 77 | 78 | outputs[[1]]$score 79 | 80 | data.frame(outputs) 81 | 82 | 83 | 84 | # get another pretrained Tokenizer 85 | transformers$RobertaTokenizer$from_pretrained('roberta-base', do_lower_case=TRUE) 86 | 87 | classifier <- transformers$pipeline(task = "text-classification") 88 | 89 | outputs <- classifier(data$text) 90 | outputs 91 | 92 | # get Model with weights 93 | transformers$TFRobertaModel$from_pretrained('roberta-base') 94 | 95 | 96 |
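## For example, a minimal sketch collecting the pipeline output into a tidy data.frame (using the outputs object above):
sentiment <- data.frame(text = data$text,
                        label = sapply(outputs, function(o) o$label),
                        score = sapply(outputs, function(o) o$score))
head(sentiment)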
97 | ## try some other pre-trained tasks with the zero-shot-classification pipeline 98 | ## https://huggingface.co/docs/transformers/v4.21.1/en/main_classes/pipelines#transformers.ZeroShotClassificationPipeline 99 | classifier <- transformers$pipeline(task = "zero-shot-classification") 100 | 101 | outputs <- classifier(data$text, c("love", "obama")) 102 | outputs 103 | 104 | 105 | ## another classifier example, this time translating an English term into French 106 | 107 | 108 | transformers$TFAutoModel$from_pretrained("t5-base") ## assumption: t5-base is not a RoBERTa model, so the generic TFAutoModel class is used here 109 | en_fr_translator <- transformers$pipeline(task = "translation_en_to_fr") 110 | en_fr_translator("human rights") 111 | 112 | en_de_translator <- transformers$pipeline(task = "translation_en_to_de") 113 | en_de_translator("human rights") 114 | 115 | 116 | ## another classifier example 117 | token_classifier <- transformers$pipeline(task = "token-classification") 118 | 119 | token_classifier("Christopher Fariss teaches at the University of Michigan") ## token classification (e.g., named entity recognition) needs an input string 120 | -------------------------------------------------------------------------------- /Archive/R-Programs-lessons-2022/Day09_Model_Evaluation_Review/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/Archive/R-Programs-lessons-2022/Day09_Model_Evaluation_Review/.DS_Store -------------------------------------------------------------------------------- /Datasets/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/Datasets/.DS_Store -------------------------------------------------------------------------------- /Datasets/SIMpoliticalTweets.txt: -------------------------------------------------------------------------------- 1 | these are fake
tweets about obama 2 | i love obama 3 | i love obama 4 | i hate obama 5 | i hate pea soup 6 | i love oatmeal 7 | obama is the president 8 | obama has a cool job 9 | love and hate 10 | blah blah blah 11 | blah 12 | -------------------------------------------------------------------------------- /Datasets/ny_stop_frisk.csv: -------------------------------------------------------------------------------- 1 | year,race,total 2012,black,284229 2012,latino,165140 2012,white,50366 -------------------------------------------------------------------------------- /Datasets/ny_stop_frisk_black.csv: -------------------------------------------------------------------------------- 1 | year, total 2 | 2003, 77704 3 | 2004,155033 4 | 2005,196570 5 | 2006,267468 6 | 2007,243766 7 | 2008,275588 8 | 2009,310611 9 | 2010,315611 10 | 2011,350743 11 | 2012,284229 -------------------------------------------------------------------------------- /Datasets/tweet_data.Rdata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/Datasets/tweet_data.Rdata -------------------------------------------------------------------------------- /Datasets/tweet_data.csv: -------------------------------------------------------------------------------- 1 | "","obama","love","hate" 2 | "1",1,0,0 3 | "2",1,1,0 4 | "3",1,1,0 5 | "4",1,0,1 6 | "5",0,0,1 7 | "6",0,1,0 8 | "7",1,0,0 9 | "8",1,0,0 10 | "9",0,1,1 11 | "10",0,0,0 12 | "11",0,0,0 13 | -------------------------------------------------------------------------------- /Datasets/users-by-social-media-platform.csv: -------------------------------------------------------------------------------- 1 | Entity,Year,monthly_active_users 2 | Facebook,2008,100000000 3 | Facebook,2009,276000000 4 | Facebook,2010,517750000 5 | Facebook,2011,766000000 6 | Facebook,2012,979750000 7 | Facebook,2013,1170500000 8 | Facebook,2014,1334000000 9 | Facebook,2015,1516750000 10 | Facebook,2016,1753500000 11 | Facebook,2017,2035750000 12 | Facebook,2018,2255250000 13 | Facebook,2019,2375000000 14 | Flickr,2004,3675135 15 | Flickr,2005,7399354 16 | Flickr,2006,14949270 17 | Flickr,2007,29299875 18 | Flickr,2008,30000000 19 | Flickr,2009,41834525 20 | Flickr,2010,54708063 21 | Flickr,2011,66954600 22 | Flickr,2012,79664888 23 | Flickr,2013,80000000 24 | Friendster,2002,3000000 25 | Friendster,2003,4470000 26 | Friendster,2004,5970054 27 | Friendster,2005,7459742 28 | Friendster,2006,8989854 29 | Friendster,2007,24253200 30 | Friendster,2008,51008911 31 | Friendster,2009,28804331 32 | Google Buzz,2010,166029650 33 | Google Buzz,2011,170000000 34 | Google Buzz,2012,170000000 35 | Google Buzz,2013,170000000 36 | Google Buzz,2014,170000000 37 | Google Buzz,2015,170000000 38 | Google+,2012,107319100 39 | Google+,2013,205654700 40 | Google+,2014,254859015 41 | Google+,2015,298950015 42 | Google+,2016,398648000 43 | Google+,2017,495657000 44 | Google+,2018,430000000 45 | Hi5,2005,9731610 46 | Hi5,2006,19932360 47 | Hi5,2007,29533250 48 | Hi5,2008,55045618 49 | Hi5,2009,57893524 50 | Hi5,2010,59953290 51 | Hi5,2011,46610848 52 | Instagram,2013,117500000 53 | Instagram,2014,250000000 54 | Instagram,2015,400000000 55 | Instagram,2016,550000000 56 | Instagram,2017,750000000 57 | Instagram,2018,1000000000 58 | MySpace,2004,980036 59 | MySpace,2005,19490059 60 | MySpace,2006,54763260 61 | MySpace,2007,69299875 62 | MySpace,2008,72408233 63 | 
MySpace,2009,70133095 64 | MySpace,2010,68046710 65 | MySpace,2011,46003536 66 | Orkut,2004,4900180 67 | Orkut,2005,9865805 68 | Orkut,2006,14966180 69 | Orkut,2007,26916562 70 | Orkut,2008,44357628 71 | Orkut,2009,47366905 72 | Orkut,2010,49941613 73 | Orkut,2011,47609080 74 | Orkut,2012,45067022 75 | Pinterest,2016,143250000 76 | Pinterest,2017,195000000 77 | Pinterest,2018,246500000 78 | Pinterest,2019,291000000 79 | Reddit,2006,248309 80 | Reddit,2007,488331 81 | Reddit,2008,1944940 82 | Reddit,2009,3893524 83 | Reddit,2014,135786956 84 | Reddit,2015,163346676 85 | Reddit,2016,238972480 86 | Reddit,2017,297394200 87 | Reddit,2018,355000000 88 | Snapchat,2016,238648000 89 | TikTok,2017,239142500 90 | TikTok,2018,500000000 91 | Tumblr,2012,146890156 92 | Tumblr,2013,293482050 93 | Tumblr,2014,388721163 94 | Tumblr,2015,475923363 95 | Tumblr,2016,565796720 96 | Tumblr,2017,593783960 97 | Tumblr,2018,624000000 98 | Twitter,2010,43250000 99 | Twitter,2011,92750000 100 | Twitter,2012,160250000 101 | Twitter,2013,223675000 102 | Twitter,2014,223675000 103 | Twitter,2015,304500000 104 | Twitter,2016,314500000 105 | Twitter,2017,328250000 106 | Twitter,2018,329500000 107 | Twitter,2019,330000000 108 | WeChat,2011,47818400 109 | WeChat,2012,118123370 110 | WeChat,2013,196523760 111 | WeChat,2014,444232415 112 | WeChat,2015,660843407 113 | WeChat,2016,847512320 114 | WeChat,2017,921742750 115 | WeChat,2018,1000000000 116 | Weibo,2010,19532900 117 | Weibo,2011,48691040 118 | Weibo,2012,79195730 119 | Weibo,2013,118261880 120 | Weibo,2014,154890345 121 | Weibo,2015,208716685 122 | Weibo,2016,281026560 123 | Weibo,2017,357569030 124 | Weibo,2018,431000000 125 | Whatsapp,2013,300000000 126 | Whatsapp,2014,498750000 127 | Whatsapp,2015,800000000 128 | Whatsapp,2016,1000000000 129 | Whatsapp,2017,1333333333 130 | YouTube,2005,1946322 131 | YouTube,2006,19878248 132 | YouTube,2007,143932250 133 | YouTube,2008,294493950 134 | YouTube,2009,413611440 135 | YouTube,2010,480551990 136 | YouTube,2011,642669824 137 | YouTube,2012,844638200 138 | YouTube,2013,1065223075 139 | YouTube,2014,1249451725 140 | YouTube,2015,1328133360 141 | YouTube,2016,1399053600 142 | YouTube,2017,1495657000 143 | YouTube,2018,1900000000 -------------------------------------------------------------------------------- /R-Program-Lessons/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/R-Program-Lessons/.DS_Store -------------------------------------------------------------------------------- /R-Program-Lessons/Day01_Intro_programs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/R-Program-Lessons/Day01_Intro_programs/.DS_Store -------------------------------------------------------------------------------- /R-Program-Lessons/Day01_Intro_programs/R_Demo_Intro_Program_Challenge_two_sum.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_Intro_Program_Challenge_two_sum.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (3B) 5 | ## University of Essex Summer School 2023 
6 | ## 7 | ## Date: 2023-08-07 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Instructions: 14 | ## 15 | ## These challenges are meant to be just that, challenging. 16 | ## They should also be fun. I encourage you to think creatively and collaboratively. 17 | ## Getting stuck or not finishing all the steps is expected and encouraged. 18 | ## This is how learning works! 19 | ## Learn to program, program to learn. 20 | ## 21 | ## Always start with step (1) and then continue to each step as time permits. 22 | ## Don't worry about completing each step. Document your code for each step. 23 | ## You may wish to come back to some of the harder steps as you progress through the course. 24 | ## Note that some of the steps may ask you to use skills we have not yet covered in the course. 25 | ## Don't worry about these steps now but definitely think through the programming logic if you are stuck and make plans to come back to try them once you feel ready. 26 | ## 27 | ########################################################################## 28 | ## 29 | ## Steps for the Challenge 30 | ## 31 | ## (1) create a vector of integers and a scalar integer target 32 | ## (2) write a program in R that determines (returns) two numbers from the vector that add up to the target scalar 33 | ## (3) how many combinations of numbers in the vector of integers add up to the target scalar? 34 | ## (4) write a function to complete steps 1-3 (hint: wrap the program from (3) within a function) 35 | ## (5) write a simulation that explores the relationship between (a) the vector of integers and (b) the scalar integer target sum 36 | ## (6) re-write the program or function so that it takes fewer steps to calculate the number of numeric combinations that add up to the scalar target 37 | ## 38 | ########################################################################## 39 | 40 | ## (1) create a vector of integers and a scalar integer target 41 | x <- c(1,2,3,4,5) 42 | y <- 5 43 | 44 | ## (2) write a program in R that determines (returns) two numbers from the vector that add up to the target scalar 45 | test <- x + x[1] 46 | test == y 47 | 48 | x[test==y] 49 | 50 | value <- list() 51 | for(i in 1:length(x)){ 52 | test <- x + x[i] 53 | partner <- x[test==y & seq_along(x) > i] ## keep only matches after position i, which avoids self-pairs and duplicate pairs 54 | if(length(partner) > 0){ value[[length(value)+1]] <- c(x[i], partner); print(c(x[i], partner)) } 55 | } 56 | value ## each list element is a pair of numbers that adds up to the target 57 | 58 | 59 | add <- function(a,b){a+b} 60 | add(1,2) -------------------------------------------------------------------------------- /R-Program-Lessons/Day02_Data_Analysis_Managment/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/R-Program-Lessons/Day02_Data_Analysis_Managment/.DS_Store -------------------------------------------------------------------------------- /R-Program-Lessons/Day02_Data_Analysis_Managment/R_Demo_Intro_forloop_vs_lapply.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_Intro_forloop_vs_lapply.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (3B) 5 | ## University of Essex Summer School 2023 6 | ## 7 |
## Date: 2023-08-08 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | ## 15 | ## in-class coding of loops and lapply 16 | 17 | 1:10 18 | 19 | length(1:10) 20 | 21 | 22 | ## for loop version 23 | output <- c() 24 | for(i in 1:10){ 25 | 26 | local_var <- i*i ## the simple but core calculation of the iterative process 27 | output[i] <- local_var 28 | 29 | } 30 | output 31 | 32 | 33 | ## lapply version 34 | output <- lapply(1:10, function(i){ 35 | 36 | local_var <- i*i ## the simple but core calculation of the iterative process 37 | return(local_var) 38 | }) 39 | unlist(output) 40 | 41 | 42 | 43 | ## nested for loop version 44 | mat <- matrix(NA, nrow=10, ncol=5) 45 | mat 46 | 47 | for(i in 1:10){ 48 | for(j in 1:5){ 49 | local_var <- i*j 50 | mat[i,j] <- local_var 51 | } 52 | } 53 | mat 54 | 55 | ## lapply version with an inner for loop (replicates the nested for loop structure above) 56 | mat_list <- lapply(1:10, function(i){ 57 | 58 | local_vec <- c() 59 | for(j in 1:5){ 60 | local_var <- i*j 61 | local_vec[j] <- local_var 62 | } 63 | 64 | return(local_vec) 65 | }) 66 | 67 | unlist(mat_list) 68 | mat <- matrix(unlist(mat_list), nrow=10, ncol=5, byrow=T) 69 | mat 70 | 71 | 72 | ## nested lapply version (replicates both versions above) 73 | mat_list <- lapply(1:10, function(i){ 74 | local_out <- lapply(1:5, function(j){ 75 | temp <- i*j 76 | return(temp) 77 | }) 78 | return(local_out) 79 | }) 80 | 81 | mat_list 82 | unlist(mat_list) 83 | mat <- matrix(unlist(mat_list), nrow=10, ncol=5, byrow=T) 84 | mat 85 | 86 | -------------------------------------------------------------------------------- /R-Program-Lessons/Day03_Simulation_Learning/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/R-Program-Lessons/Day03_Simulation_Learning/.DS_Store -------------------------------------------------------------------------------- /R-Program-Lessons/Day03_Simulation_Learning/R_Demo_Simulation_Learning_Dice_Rolls.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_Simulation_Learning_Dice_Rolls.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (3B) 5 | ## University of Essex Summer School 2023 6 | ## 7 | ## Date: 2023-08-08 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ## 13 | ########################################################################## 14 | ## For this R tutorial, we will learn how to: 15 | ## 16 | ## (1) Simulate the roll of a D6 die 17 | ## (2) Repeatedly simulate the roll of a D6 die and see how close the average value is to the expected value. 18 | ## (3) Learn about the central limit theorem from the simulation.
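## A quick base-R illustration of (3) (a hedged aside, not part of the original file):
## the histogram of 2,000 sample means of 10 rolls each is already roughly bell-shaped,
## even though individual rolls are uniform on 1:6.
## hist(replicate(2000, mean(sample(1:6, size=10, replace=TRUE))))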
19 | ## 20 | ## Notes: The Central Limit Theorem (CLT) establishes that when independently generated variables (iid: independent and identically distributed random variables) are added together, the sums or averages of these variables (when normalized) converge towards a normal distribution. 21 | ## 22 | ## This property emerges even if the original variables are not individually normally distributed, as with the roll of a die. 23 | ## 24 | ## The probability of any value from a single roll of a die is equivalent to the probability of any other value for the same-sided die in the limit (when the number of rolls approaches infinity). 25 | ## 26 | ########################################################################## 27 | 28 | 29 | library(MASS) 30 | 31 | 32 | ## simulate 20 randomly generated rolls from a D6 (6-sided die) 33 | sample(1:6, size=20, replace=TRUE) 34 | 35 | ## true mean is 3.5 36 | (1 + 2 + 3 + 4 + 5 + 6) / 6 37 | 38 | ## or 39 | mean(1:6) 40 | 41 | ## true variance is approximately 2.916667 or exactly 70/24 42 | (1 - 3.5)^2 * (1/6) + (2 - 3.5)^2 * (1/6) + (3 - 3.5)^2 * (1/6) + (4 - 3.5)^2 * (1/6) + (5 - 3.5)^2 * (1/6) + (6 - 3.5)^2 * (1/6) 43 | 44 | ## or 45 | sum((1:6 - mean(1:6))^2 * (1/6)) 46 | 47 | ## repeat the simulation n_sims times (2,000 below) and calculate the average 48 | n_sims <- 2000 49 | 50 | ## number of samples to roll each iteration 51 | n_samples <- 10 52 | 53 | ## create two objects to hold the calculated mean and variance from each simulated sample 54 | sim_mean_values <- c() 55 | sim_var_values <- c() 56 | 57 | ## iterate/repeat the simulation n_sims times 58 | for(i in 1:n_sims){ 59 | 60 | ## create output 61 | sample_output <- sample(1:6, size=n_samples, replace=TRUE) 62 | 63 | ## save the output in the i_th position of the objects 64 | sim_mean_values[i] <- mean(sample_output) 65 | sim_var_values[i] <- var(sample_output) 66 | 67 | } 68 | 69 | ## calculate the mean and variance of the n_sims sample means 70 | mean(sim_mean_values) 71 | mean(sim_var_values) 72 | 73 | ## set graphical parameters 74 | par(mfrow=c(1,2), mar=c(4,3,1,1)) 75 | 76 | ## plot histograms 77 | truehist(sim_mean_values, main="Mean Estimate") 78 | truehist(sim_var_values, main="Variance Estimate") 79 | 80 | 81 | ## calculate and plot the converging average using increasing sample sizes starting at 1 and ending at all the samples 82 | ## set graphical parameters 83 | par(mfrow=c(1,2)) 84 | 85 | ## 86 | plot(0,0, ylim=c(2.5,4), xlim=c(0,n_sims), type="n", main="Mean Estimate") 87 | value <- c() 88 | for(i in 1:n_sims){ 89 | value[i] <- mean(sim_mean_values[1:i]) 90 | } 91 | lines(value) 92 | abline(h=3.5, col="orange", lwd=2, lty=2) 93 | 94 | ## 95 | plot(0,0, ylim=c(2.5,4), xlim=c(0,n_sims), type="n", main="Variance Estimate") 96 | value <- c() 97 | for(i in 1:n_sims){ 98 | value[i] <- mean(sim_var_values[1:i]) 99 | } 100 | lines(value) 101 | abline(h=2.916667, col="orange", lwd=2, lty=2) 102 | 103 | 104 | sqrt(2.916667) ## the true standard deviation 105 | 106 | ## the values converge towards normality 107 | summary((sim_mean_values - 3.5)) 108 | mean((sim_mean_values - 3.5)) 109 | var((sim_mean_values - 3.5)) 110 | 111 | 112 | 113 | var(sim_mean_values) 114 | var(sim_var_values) 115 | 116 | 117 | 118 | -------------------------------------------------------------------------------- /R-Program-Lessons/Day03_Simulation_Learning/R_Demo_Simulation_Learning_ProbCalc_BayesRule.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_Simulation_Learning_ProbCalc_BayesRule.R 2 |
########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (3B) 5 | ## University of Essex Summer School 2023 6 | ## 7 | ## Date: 2023-08-08 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | ## 15 | ## Overview of Bayes rule and the calculation of conditional probability. Introduces the naive Bayes classifier function from the e1071 library. 16 | ## 17 | ## P(outcome | evidence) = P(outcome) * P(evidence | outcome) / P(evidence) 18 | ## 19 | ## Below is Bayes’ Theorem: 20 | ## P(A | B) = P(A) * P(B | A) / P(B) 21 | ## 22 | ## Which can be derived from the general multiplication formula for AND events: 23 | ## P(A and B) = P(A) * P(B | A) 24 | ## P(B | A) = P(A and B) / P(A) 25 | ## P(B | A) = P(B) * P(A | B) / P(A) 26 | ## P(y|x) = P(x|y) * P(y) / P(x) 27 | ## P(x|y) = P(x AND y) / P(y) 28 | ## 29 | ## Pr(A[1]) = Pr(y==0) 30 | ## Pr(A[2]) = Pr(y==1) 31 | ## Pr(B | A[1]) = Pr(Data | y==0) 32 | ## Pr(B | A[2]) = Pr(Data | y==1) 33 | ## 34 | ########################################################################## 35 | 36 | 37 | A <- c(0,1,0,1) 38 | B <- c(0,0,0,1) 39 | 40 | 41 | ProbCalc <- function(A,B){ 42 | 43 | len <- length(A) 44 | data <- as.data.frame(cbind(A,B, A*B, A+B-A*B)) 45 | names(data) <- c("A", "B", "A and B", "A or B") 46 | 47 | pr.A <- sum(A)/len 48 | pr.B <- sum(B)/len 49 | pr.NotA <- 1 - pr.A 50 | pr.NotB <- 1 - pr.B 51 | pr.A.B <- sum(A*B)/len 52 | pr.NotA.B <- sum(abs(A-1)*B)/len 53 | pr.A.NotB <- sum(A*abs(B-1))/len 54 | pr.NotA.NotB <- sum(abs(A-1)*abs(B-1))/len 55 | 56 | pr.A.or.B <- pr.A + pr.B - pr.A.B 57 | 58 | pr.A.condB <- pr.A.B / pr.B 59 | pr.B.condA <- pr.A.B / pr.A 60 | 61 | out <- list(data=data, pr.A=pr.A, pr.B=pr.B, pr.NotA=pr.NotA, pr.NotB=pr.NotB, pr.A.B=pr.A.B, pr.NotA.B=pr.NotA.B, pr.A.NotB=pr.A.NotB, pr.NotA.NotB=pr.NotA.NotB, pr.A.or.B= pr.A.or.B, pr.A.condB=pr.A.condB, pr.B.condA=pr.B.condA) 62 | return(out) 63 | 64 | } 65 | 66 | ProbCalc(A,B) 67 | 68 | ProbTabCalc <- function(A,B){ 69 | 70 | tab <- table(A,B)/length(A) 71 | 72 | ## index the table by its dimnames ("0"/"1"), not by the data vectors, so the function works for any input length 73 | pr.A <- sum(tab["1",]) 74 | pr.B <- sum(tab[,"1"]) 75 | pr.NotA <- sum(tab["0",]) 76 | pr.NotB <- sum(tab[,"0"]) 77 | pr.A.B <- tab["1","1"] 78 | pr.NotA.B <- tab["0","1"] 79 | pr.A.NotB <- tab["1","0"] 80 | pr.NotA.NotB <- tab["0","0"] 81 | 82 | pr.A.or.B <- pr.A + pr.B - pr.A.B 83 | 84 | pr.A.condB <- pr.A.B / pr.B 85 | pr.B.condA <- pr.A.B / pr.A 86 | 87 | out <- list(table=tab, pr.A=pr.A, pr.B=pr.B, pr.NotA=pr.NotA, pr.NotB=pr.NotB, pr.A.B=pr.A.B, pr.NotA.B=pr.NotA.B, pr.A.NotB=pr.A.NotB, pr.NotA.NotB=pr.NotA.NotB, pr.A.or.B= pr.A.or.B, pr.A.condB=pr.A.condB, pr.B.condA=pr.B.condA) 88 | return(out) 89 | 90 | } 91 | 92 | ProbTabCalc(A,B) 93 | 94 | -------------------------------------------------------------------------------- /R-Program-Lessons/Day04_Simulation_Inference/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/R-Program-Lessons/Day04_Simulation_Inference/.DS_Store
-------------------------------------------------------------------------------- /R-Program-Lessons/Day04_Simulation_Inference/R_Demo_Simulation_Inference_2_Sample_Hold_Out.R: -------------------------------------------------------------------------------- 1 | #### R_Demo_Simulation_Inference_2_Sample_Hold_Out.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (3B) 5 | ## University of Essex Summer School 2023 6 | ## 7 | ## Date: 2023-08-09 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## 14 | ## Goal: Improve the predictive power or predictive validity of a model 15 | ## 16 | ########################################################################## 17 | ## Introduction to tutorial: 18 | ## 19 | ## (1) Begin building towards and learning about cross-validation 20 | ## (NOTE: There is no "crossing" yet) 21 | ## 22 | ## For this R tutorial we will simulate a dataset and then randomly divide it into two subsets. 23 | ## 24 | ## We will fit a model using the observations from one of the subsets of data (training data). 25 | ## 26 | ## We will then use the model estimates to predict the value of the dependent variable for the remaining out-of-sample data subset (testing data). 27 | ## 28 | ########################################################################## 29 | 30 | 31 | #set.seed(940) 32 | 33 | 34 | ## set number of observations for simulation 35 | n <- 100 36 | 37 | ## simulation of variables (this model is based on one of the relationships from Anscombe's quartet) 38 | x <- sample(4:14,n,replace=TRUE) 39 | table(x) 40 | y <- -5.996 + 2.781*x -0.127*x^2 + rnorm(n,0,1) 41 | #y <- -5.996 + 2.781*x -0.127*x^2 + rnorm(n,0,2) 42 | 43 | ## plot the simulated relationship 44 | par(mfrow=c(1,1)) 45 | plot(x=x, y=y) 46 | 47 | ## create a subject/unit ID variable with one value for each unit 48 | ## here the indicator variable takes on 2-fold values {1,2} 49 | folds <- sample(rep(1:2, n/2), size=n, replace=FALSE) 50 | folds 51 | table(folds) 52 | 53 | ## the version below doesn't always yield a 50/50 ratio of 1s and 2s 54 | #folds <- sample(1:2, size=n, replace=TRUE) 55 | #folds 56 | #table(folds) 57 | 58 | ## create a data frame with the dependent variable, independent variable, and randomly created ID 59 | dat <- data.frame(y, x, folds) 60 | 61 | summary(dat) 62 | 63 | head(dat) 64 | 65 | ## fit a linear model to the full dataset 66 | model <- lm(y ~ x, data=dat) 67 | summary(model) 68 | 69 | 70 | ## subset the full dataset into two subsets based on the ID variable 71 | train <- subset(dat, folds==1) 72 | test <- subset(dat, folds==2) 73 | ## the same subsets using bracket indexing 74 | train <- dat[dat$folds==1,] 75 | test <- dat[dat$folds==2,] 76 | 77 | nrow(train) 78 | nrow(test) 79 | 80 | 81 | ## Model 0: fit an intercept-only model 82 | fit <- lm(y ~ 1, data=train) 83 | in_sample_rmse <- sqrt(mean((as.numeric(predict(fit))-train$y)^2)) 84 | in_sample_rmse 85 | 86 | pred <- predict(fit, newdata=test) 87 | rmse <- sqrt(mean((as.numeric(pred)-test$y)^2)) 88 | rmse 89 | 90 | ## Model 1: fit a linear model 91 | fit <- lm(y ~ x, data=train) 92 | in_sample_rmse <- sqrt(mean((as.numeric(predict(fit))-train$y)^2)) 93 | in_sample_rmse 94 | 95 | pred <- predict(fit, newdata=test) 96 | rmse <- sqrt(mean((as.numeric(pred)-test$y)^2)) 97 | rmse 98 |
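## Aside (a hedged sketch, not part of the original lesson): the fit/predict/RMSE
## pattern above is repeated for every model below, so it can be wrapped in a small
## helper function; only lm() models and the test$y outcome column are assumed here.
holdout_rmse <- function(formula, train, test){
    fit <- lm(formula, data=train)
    pred <- predict(fit, newdata=test)
    sqrt(mean((as.numeric(pred) - test$y)^2))
}
## example usage: holdout_rmse(y ~ x + I(x^2), train, test)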
99 | ## this is what the predict() function is doing under the hood for Model 1 100 | y_hat <- fit$coefficients[1] + fit$coefficients[2] * test$x 101 | 102 | ## Model 2: fit a linear model with a squared term 103 | fit <- lm(y ~ x + I(x^2), data=train) 104 | in_sample_rmse <- sqrt(mean((as.numeric(predict(fit))-train$y)^2)) 105 | in_sample_rmse 106 | 107 | pred <- predict(fit, newdata=test) 108 | rmse <- sqrt(mean((as.numeric(pred)-test$y)^2)) 109 | rmse 110 | 111 | 112 | ## Model 3: fit a linear model with a squared term and a cubic term 113 | fit <- lm(y ~ x + I(x^2) + I(x^3), data=train) 114 | in_sample_rmse <- sqrt(mean((as.numeric(predict(fit))-train$y)^2)) 115 | in_sample_rmse 116 | 117 | pred <- predict(fit, newdata=test) 118 | rmse <- sqrt(mean((as.numeric(pred)-test$y)^2)) 119 | rmse 120 | 121 | 122 | ## Model 4: fit a linear model with a squared term and a cubic term and a 4th order term 123 | fit <- lm(y ~ x + I(x^2) + I(x^3) + I(x^4), data=train) 124 | in_sample_rmse <- sqrt(mean((as.numeric(predict(fit))-train$y)^2)) 125 | in_sample_rmse 126 | 127 | pred <- predict(fit, newdata=test) 128 | rmse <- sqrt(mean((as.numeric(pred)-test$y)^2)) 129 | rmse 130 | 131 | 132 | 133 | 134 | -------------------------------------------------------------------------------- /R-Program-Lessons/Day04_Simulation_Inference/R_Demo_Simulation_Inference_naive_Bayes.R: -------------------------------------------------------------------------------- 1 | #### R_Demo_Simulation_Inference_naive_Bayes.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 5 | ## University of Essex Summer School 2023 6 | ## 7 | ## Date: 2023-08-09 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | ## 15 | ## Overview of Bayes rule and the calculation of conditional probability. Introduces the naive Bayes classifier function from the e1071 library.
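## A quick numeric check of the rule stated below (a hedged illustration, not part of
## the original file): if P(A) = 0.25, P(B|A) = 0.8, and P(B) = 0.4, then
## P(A|B) = 0.25 * 0.8 / 0.4 = 0.5
0.25 * 0.8 / 0.4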
16 | ## 17 | ## P(outcome | evidence) = P(outcome) * P(evidence | outcome) / P(evidence) 18 | ## 19 | ## Below is Bayes’ Theorem: 20 | ## P(A | B) = P(A) * P(B | A) / P(B) 21 | ## 22 | ## Which can be derived from the general multiplication formula for AND events: 23 | ## P(A and B) = P(A) * P(B | A) 24 | ## P(B | A) = P(A and B) / P(A) 25 | ## P(B | A) = P(B) * P(A | B) / P(A) 26 | ## P(y|x) = P(x|y) * P(y) / P(x) 27 | ## P(x|y) = P(x AND y) / P(y) 28 | ## 29 | ## Pr(A[1]) = Pr(y==0) 30 | ## Pr(A[2]) = Pr(y==1) 31 | ## Pr(B | A[1]) = Pr(Data | y==0) 32 | ## Pr(B | A[2]) = Pr(Data | y==1) 33 | ## 34 | ########################################################################## 35 | 36 | 37 | ## load libraries 38 | library(e1071) 39 | library(LaplacesDemon) 40 | 41 | 42 | ## example code from BayesTheorem() function 43 | PrA <- c(0.75,0.25) 44 | PrBA <- c(6/9, 5/7) 45 | BayesTheorem(PrA, PrBA) 46 | 47 | 48 | ## create fake data 49 | n <- 10 50 | x <- c(rep(0,n/2), rep(1,n/2)) 51 | y <- c(0,0,0,1,1,0,0,1,1,1) 52 | 53 | ## inspect data 54 | cbind(y,x) 55 | 56 | ## inspect tabulation of data 57 | table(y,x) 58 | 59 | 60 | ## calculate the probability of the evidence/data (the denominator is the same marginal probability Pr(x==1) for both outcome classes) 61 | PrX <- NA 62 | PrX[1] <- sum(as.numeric(x==1)) / n 63 | PrX[2] <- sum(as.numeric(x==1)) / n 64 | 65 | ## calculate the probability of the outcome 66 | PrY <- NA 67 | PrY[1] <- sum(as.numeric(y==0))/n 68 | PrY[2] <- sum(as.numeric(y==1))/n 69 | PrY 70 | 71 | ## calculate the probability of the data conditional on the value of y (the likelihood) 72 | PrXY <- NA 73 | PrXY[1] <- sum(x[y==0])/length(as.numeric(x[y==0])) 74 | PrXY[2] <- sum(x[y==1])/length(as.numeric(x[y==1])) 75 | PrXY 76 | 77 | ## apply Bayes Rule 78 | PrXY * PrY / PrX 79 | 80 | ## apply Bayes Rule with BayesTheorem() function 81 | BayesTheorem(PrA=PrY, PrBA=PrXY) 82 | 83 | 84 | ## apply Bayes Rule with naiveBayes() function 85 | fit <- naiveBayes(y~x, data=data.frame(y,x)) 86 | fit 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /R-Program-Lessons/Day04_Simulation_Inference/R_Demo_Simulation_Inference_naive_Bayes_10_Fold_Cross_Validation.R: -------------------------------------------------------------------------------- 1 | #### R_Demo_Simulation_Inference_naive_Bayes_10_Fold_Cross_Validation.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 5 | ## University of Essex Summer School 2023 6 | ## 7 | ## Date: 2023-08-09 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | ## 15 | ## (1) The program simulates count data that are then used to predict a binary outcome variable. 16 | ## 17 | ## (2) Three models are evaluated using the count data to predict the outcome: 18 | ## (2a) linear model 19 | ## (2b) generalized linear model with a logit link function 20 | ## (2c) naive Bayes classifier.
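## Aside (a hedged sketch, not part of the original file): the balanced fold labels
## used for the 10-fold cross-validation below can be generated and checked with:
## table(sample(rep(1:10, length.out=100)))  ## exactly 10 observations per fold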
21 | ## 22 | ########################################################################## 23 | 24 | 25 | ## load library 26 | library(e1071) 27 | 28 | 29 | ## simulate x1 and set the "true" population values alpha and beta 30 | n <- 100 31 | 32 | ## unobserved 33 | x <- runif(n,0,1) 34 | 35 | ## observed counts 36 | x1 <- rpois(n, lambda=x) 37 | x2 <- rpois(n, lambda=x) 38 | x3 <- rpois(n, lambda=2*x) 39 | x4 <- rpois(n, lambda=2*x) 40 | x5 <- rpois(n, lambda=4*x) 41 | 42 | ## systematic component of the model based on observed counts 43 | xb <- -2 + x1 + x2 + x3 + x4 + x5 44 | 45 | ## transform the linear term xb using 46 | ## the inverse logit function 47 | ## so that pi is bounded between 0 and 1 48 | pi <- 1 / (1 + exp(-xb)) 49 | 50 | ## generate the dependent variable y with probability pi and measurement error from a Bernoulli trial 51 | y <- rbinom(n, size=1, prob=pi) 52 | 53 | 54 | ## make data frame 55 | dat <- data.frame(y, x1, x2, x3, x4, x5) 56 | 57 | 58 | ## summarize fit using linear model 59 | summary(lm(y ~ x1 + x2 + x3 + x4 + x5, data=dat)) 60 | 61 | 62 | ## summarize fit using glm using the logit link function 63 | summary(glm(y ~ x1 + x2 + x3 + x4 + x5, family=binomial(link="logit"), data=dat)) 64 | 65 | 66 | ## summarize fit using naiveBayes model 67 | naiveBayes(as.factor(y) ~ x1 + x2 + x3 + x4 + x5, data=dat) 68 | 69 | 70 | ## create vectors for storing predictions 71 | dat$y.hat1 <- NA 72 | dat$y.hat2 <- NA 73 | dat$y.hat3 <- NA 74 | 75 | ## select number of folds 76 | k <- 10 77 | 78 | ## create vector of folds for cross validation 79 | dat$folds <- sample(rep(1:k, k), n, replace=FALSE) 80 | 81 | ## loop over the k folds to generate out-of-sample predictions from each model 82 | for(i in 1:k){ 83 | 84 | ## fit a linear model 85 | fit1 <- lm(y ~ x1 + x2 + x3 + x4 + x5, data=subset(dat, folds!=i)) 86 | pred1 <- predict(fit1, newdata=subset(dat, folds==i)) 87 | y.hat1 <- as.numeric(pred1) 88 | 89 | dat$y.hat1[dat$folds==i] <- y.hat1 90 | 91 | 92 | ## fit a glm model 93 | fit2 <- glm(y ~ x1 + x2 + x3 + x4 + x5, binomial(link="logit"), data=subset(dat, folds!=i)) 94 | pred2 <- predict(fit2, newdata=subset(dat, folds==i), type="response") ## predicted probabilities rather than log-odds 95 | y.hat2 <- as.numeric(pred2) 96 | 97 | dat$y.hat2[dat$folds==i] <- y.hat2 98 | 99 | 100 | ## fit a naiveBayes classifier model 101 | fit3 <- naiveBayes(as.factor(y) ~ x1 + x2 + x3 + x4 + x5, data=subset(dat, folds!=i)) 102 | pred3 <- predict(fit3, newdata=subset(dat, folds==i)) 103 | y.hat3 <- as.numeric(as.character(pred3)) ## convert the factor prediction back to a numeric 0/1 104 | 105 | dat$y.hat3[dat$folds==i] <- y.hat3 106 | 107 | #print(summary(dat)) 108 | } 109 | 110 | rmse.fit1 <- sqrt(mean((dat$y.hat1-dat$y)^2)) 111 | rmse.fit1 112 | 113 | cor.fit1 <- cor(dat$y.hat1, dat$y, method="spearman") 114 | 115 | rmse.fit2 <- sqrt(mean((dat$y.hat2-dat$y)^2)) 116 | rmse.fit2 117 | 118 | cor.fit2 <- cor(dat$y.hat2, dat$y, method="spearman") 119 | 120 | rmse.fit3 <- sqrt(mean((dat$y.hat3-dat$y)^2)) 121 | rmse.fit3 122 | 123 | cor.fit3 <- cor(dat$y.hat3, dat$y, method="spearman") 124 | 125 | c(cor.fit1, cor.fit2, cor.fit3) 126 | -------------------------------------------------------------------------------- /R-Program-Lessons/Day05_Measurement_RSTAN/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/R-Program-Lessons/Day05_Measurement_RSTAN/.DS_Store --------------------------------------------------------------------------------
/R-Program-Lessons/Day05_Measurement_RSTAN/RSTAN_logistic_regression_simulation.R: -------------------------------------------------------------------------------- 1 | ## RSTAN_logistic_regression_simulation.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (3B) 5 | ## University of Essex Summer School 2023 6 | ## 7 | ## Date: 2023-08-11 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions 10 | ## e-mail: cjf0006@gmail.com 11 | ## 12 | ########################################################################## 13 | ## 14 | ## Introduction to tutorial: 15 | ## 16 | ## For this R tutorial we will simulate a binary dependent variable and then estimate the parameters that generate the variable. These parameters will be estimated based on the likelihood function that links them to the data contained in the y and x variables that are simulated observed data. The model produces the slope and intercept from a standard logistic regression model, which is also estimated using glm() in R. 17 | ## 18 | ########################################################################## 19 | 20 | ## load library 21 | library(rstan) # load rstan library 22 | library(MASS) # load library with truehist function 23 | 24 | ## -------------------------------------------------- ## 25 | ## define STAN model 26 | ## -------------------------------------------------- ## 27 | model <- " 28 | data { 29 | // declared the data in memory 30 | int n; 31 | int y[n]; 32 | vector[n] x; 33 | } 34 | // declared the parameters in memory 35 | parameters { 36 | real alpha; 37 | real beta; 38 | } 39 | model { 40 | // priors (note: the second argument to normal() in Stan is a standard deviation, not a variance or a precision) 41 | //alpha ~ normal(0,10); 42 | //beta ~ normal(0,10); 43 | 44 | // likelihood (link data to some combination of parameters and more data) 45 | y ~ bernoulli_logit(alpha + beta * x); 46 | } 47 | generated quantities { 48 | // posterior predictions 49 | vector[n] y_predict; 50 | 51 | // the loop is necessary within the generated quantities block 52 | for(i in 1:n){ 53 | y_predict[i] = bernoulli_logit_rng(alpha + beta * x[i]); 54 | } 55 | } 56 | 57 | " 58 | ## -------------------------------------------------- ## 59 | 60 | 61 | ## simulate x1 and set the "true" population values alpha and beta 62 | n <- 100 63 | x <- rnorm(n,0,1) 64 | alpha <- 1.25 65 | beta <- 2.50 66 | 67 | ## systematic component of the model 68 | xb <- alpha + beta * x 69 | 70 | ## transform the linear term xb using 71 | ## the inverse logit function 72 | ## so that eta is bounded between 0 and 1 73 | eta <- 1 / (1 + exp(-xb)) 74 | 75 | ## generate the dependent variable y with probability eta and measurement error from a Bernoulli trial 76 | y <- rbinom(n, size=1, prob=eta) 77 | 78 | table(y) 79 | 80 | ## create data list 81 | data_list <- list(y = y, x=x, n=n) 82 | 83 | ## fit logistic regression model with glm() for comparison 84 | summary(glm(y~x, family=binomial(link="logit"))) 85 | 86 | ## fit stan model 87 | fit <- stan(model_code = model, data = data_list, iter = 1000, chains = 4) 88 | 89 | ## extract draws from stan model object 90 | output <- extract(fit, permuted = TRUE) 91 | 92 | ## print names 93 | names(output) 94 | 95 | ## there are a number of methods to subset and summarize parameters 96 | ## keep in mind that the output object is a list that contains vectors or matrices of posterior estimates for each of the named parameters defined in the model statement
above 97 | ## lapply (list-apply) a function to all of the objects in the list 98 | lapply(output, mean) 99 | lapply(output, sd) 100 | 101 | ## tabulate the simulated binary dependent variable; its mean should be very close to the mean value of the predicted y 102 | table(y) 103 | 104 | ## create a matrix using some of the named slots in the list 105 | model_parameters <- as.matrix(fit, pars = c("alpha", "beta")) 106 | model_predictions <- as.matrix(fit, pars = "y_predict") 107 | 108 | ## check the dimensions (they should be the same) 109 | dim(model_predictions) 110 | dim(output$y_predict) 111 | 112 | ## plot the simulated y variable and the estimated posterior means 113 | plot(apply(model_predictions,2,mean), y) 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /R-Program-Lessons/Day05_Measurement_RSTAN/RSTAN_mean_simulation.R: -------------------------------------------------------------------------------- 1 | ## RSTAN_mean_simulation.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (3B) 5 | ## University of Essex Summer School 2023 6 | ## 7 | ## Date: 2023-08-11 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions 10 | ## e-mail: cjf0006@gmail.com 11 | ## 12 | ########################################################################## 13 | ## 14 | ## Introduction to tutorial: 15 | ## 16 | ## For this R tutorial we will use the normal distribution in STAN to estimate the mean value for an observed variable y. The parameter for the mean will be selected based on the likelihood function that links it to the data contained in the y variable.
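## Aside (hedged, not part of the original file): the closed-form benchmarks for the
## data used below are mean(rep(1:5, 200)) = 3 and sd(rep(1:5, 200)) ~= 1.41; note
## that the informative normal(0, 0.1) prior placed on mu in the model block pulls
## the posterior mean of mu part of the way toward 0 (roughly 2.5 rather than 3),
## which is worth comparing against these benchmarks.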
17 | ## 18 | ########################################################################## 19 | 20 | ## load library 21 | library(rstan) # load rstan library 22 | library(MASS) # load library with truehist function 23 | 24 | ## -------------------------------------------------- # 25 | ## define STAN model as a character 26 | ## -------------------------------------------------- # 27 | model <- " 28 | data { 29 | // declared the data in memory 30 | int n; 31 | vector[n] y; 32 | } 33 | parameters { 34 | // declared the parameters in memory 35 | real mu; 36 | real sigma; 37 | } 38 | model { 39 | // sigma has no prior statement; by default its prior is flat unless we provide more information (see the other examples) 40 | // mu is given an informative normal prior below, which pulls its estimate toward 0 41 | // likelihood (link data to some combination of parameters and more data) 42 | 43 | mu ~ normal(0,.1); 44 | 45 | for(i in 1:n){ 46 | y[i] ~ normal(mu, sigma); 47 | } 48 | } 49 | generated quantities { 50 | // posterior predictions 51 | vector[n] y_predict; 52 | 53 | // the loop is necessary within the generated quantities block 54 | for(i in 1:n){ 55 | y_predict[i] = normal_rng(mu, sigma); 56 | } 57 | } 58 | " 59 | ## -------------------------------------------------- # 60 | 61 | 62 | ## set data for simulation 63 | #y <- 1:5 64 | y <- rep(1:5,200) 65 | 66 | n <- length(y) 67 | y 68 | n 69 | 70 | ## create data list 71 | data_list <- list(y = y, n=n) 72 | data_list 73 | 74 | ## set time start variable 75 | time1 <- Sys.time() 76 | 77 | # fit stan model 78 | fit <- stan(model_code = model, data = data_list, iter = 1000, chains = 4) 79 | 80 | ## calculate the duration of the program file up to this point 81 | print(Sys.time() - time1) 82 | 83 | ## extract draws from stan model object (creates a list object) 84 | output <- extract(fit, permuted = TRUE) 85 | 86 | ## print names of each element/slot in the list 87 | names(output) 88 | 89 | ## print model fit object 90 | fit 91 | 92 | ## lapply (list-apply) a function to all of the objects in the list 93 | lapply(output, mean) 94 | lapply(output, sd) 95 | 96 | truehist(output$mu) 97 | 98 | -------------------------------------------------------------------------------- /R-Program-Lessons/Day05_Measurement_RSTAN/RSTAN_normal_distribution_simulation.R: -------------------------------------------------------------------------------- 1 | ## RSTAN_normal_distribution_simulation.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (3B) 5 | ## University of Essex Summer School 2023 6 | ## 7 | ## Date: 2023-08-11 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions 10 | ## e-mail: cjf0006@gmail.com 11 | ## 12 | ########################################################################## 13 | ## 14 | ## Introduction to tutorial: 15 | ## 16 | ## For this R tutorial we will draw random samples from the normal distribution using the STAN program. 17 | ## This program is equivalent to using the rnorm function in R.
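## Aside (hedged, not part of the original file): the base-R analogue for comparison:
## truehist(rnorm(2000, mean=0, sd=1)) should look like the histogram of output$mu
## drawn at the end of this script (4 chains x 500 post-warmup draws = 2000 draws).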
18 | ## 19 | ########################################################################## 20 | 21 | 22 | ## load library 23 | library(rstan) # load rstan library 24 | library(MASS) # load library with truehist function 25 | 26 | 27 | ## -------------------------------------------------- # 28 | ## define STAN model 29 | ## -------------------------------------------------- # 30 | model <- " 31 | 32 | parameters { 33 | real mu; 34 | } 35 | 36 | model { 37 | mu ~ normal(0,1); 38 | } 39 | " 40 | ## -------------------------------------------------- # 41 | 42 | 43 | ## set time start variable 44 | time1 <- Sys.time() 45 | 46 | ## fit stan model 47 | fit <- stan(model_code = model, iter = 1000, chains = 4) 48 | 49 | ## calculate the duration of the program file up to this point 50 | print(Sys.time() - time1) 51 | 52 | ## extract draws from stan model object (creates a list object) 53 | output <- extract(fit, permuted = TRUE) 54 | 55 | ## print names of each element/slot in the list 56 | names(output) 57 | 58 | ## print model fit object 59 | fit 60 | 61 | ## there are a number of methods to subset and summarize parameters 62 | ## keep in mind that the output object is a list that contains vectors or matrices of posterior estimates for each of the named parameters defined in the model statement above 63 | ## lapply (list-apply) a function to all of the objects in the list 64 | lapply(output, mean) 65 | lapply(output, sd) 66 | 67 | length(output$mu) 68 | 69 | ## create a matrix using some of the named slots in the list 70 | model_parameters <- as.matrix(fit, pars = c("mu")) 71 | 72 | dim(model_parameters) 73 | 74 | ## check the dimensions (they should be the same) 75 | length(output$mu) 76 | 77 | ## make a nice plot 78 | par(mfrow=c(1,1)) 79 | truehist(output$mu) 80 | 81 | #plot(fit[[1]]$sim$permutation[[3]]) 82 | -------------------------------------------------------------------------------- /R-Program-Lessons/Day06_Text_as_data_programs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/R-Program-Lessons/Day06_Text_as_data_programs/.DS_Store -------------------------------------------------------------------------------- /R-Program-Lessons/Day06_Text_as_data_programs/R_Demo_text_as_data_DTM_stm_package.R: -------------------------------------------------------------------------------- 1 | #### R_Demo_text_as_data_DTM_stm_package.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (3B) 5 | ## University of Essex Summer School 2023 6 | ## 7 | ## Date: 2023-08-13 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## 14 | ## Introduction to tutorial: 15 | ## 16 | ## This tutorial replicates the processes demonstrated in the R_Demo_text_as_data_DTM.R file. 17 | ## 18 | ## This code uses functions available in the stm package, which builds on the tm package. 19 | ## 20 | ## The code is much faster at processing the text data, though some of the steps are difficult to learn from these functions.
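## Aside (hedged, not part of the original file): the key output below is fit$theta,
## an n_documents x K matrix of topic proportions; each row sums to 1, which is what
## apply(fit$theta, 1, sum) confirms near the end of the script.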
21 | ## 22 | ########################################################################## 23 | 24 | 25 | ## load libraries 26 | library(stm) 27 | library(tm) 28 | library(SnowballC) 29 | 30 | ## load data 31 | data <- read.csv("Datasets/SIMpoliticalTweets.txt", header=FALSE) 32 | data 33 | names(data) <- "text" 34 | data 35 | 36 | #trumptweets <- fromJSON("trump_json_files_20190707.txt") 37 | #data <- trumptweets 38 | 39 | ## 40 | #data <- data[1:1000,] 41 | 42 | 43 | ## preprocess the documents 44 | ## This function uses functions from the tm package (see the tm Demo for more details) 45 | ## stem words and remove stop words 46 | prep <- textProcessor(documents=data$text, meta=data) 47 | 48 | ## list attributes 49 | attributes(prep) 50 | 51 | ## inspect 52 | head(prep$documents) 53 | head(prep$vocab) 54 | head(prep$meta) 55 | prep$docs.removed 56 | 57 | 58 | ## prepDocuments 59 | ## additional processing (removes some terms and documents based on word-frequency thresholds, e.g., words appearing in more than 99% or fewer than 1% of documents) 60 | out <- prepDocuments(prep$documents, prep$vocab, prep$meta) 61 | out 62 | 63 | ## inspect 64 | head(out$documents) 65 | head(out$vocab) 66 | head(out$meta) 67 | 68 | ## fit a structural topic model 69 | fit <- stm(documents=out$documents, vocab=out$vocab, data=out$meta, K=3) 70 | 71 | ## inspect attributes 72 | attributes(fit) 73 | 74 | dim(fit$theta) 75 | 76 | ## display topic probabilities 77 | fit$theta 78 | 79 | apply(fit$theta, 1, sum) 80 | 81 | out$meta 82 | 83 | data.frame(text = out$meta$text, topic1=fit$theta[,1], topic2=fit$theta[,2], topic3=fit$theta[,3]) 84 | -------------------------------------------------------------------------------- /R-Program-Lessons/Day06_Text_as_data_programs/R_Demo_text_as_data_DTM_tm_package.R: -------------------------------------------------------------------------------- 1 | #### R_Demo_text_as_data_DTM_tm_package.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (3B) 5 | ## University of Essex Summer School 2023 6 | ## 7 | ## Date: 2023-08-13 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## 14 | ## Introduction to tutorial: 15 | ## 16 | ## This tutorial replicates the processes demonstrated in the R_Demo_text_as_data_DTM.R file. 17 | ## 18 | ## This code uses functions available in the tm package. 19 | ## 20 | ## The code is much faster at processing the text data, though some of the steps are difficult to learn from these functions.
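## Aside (a hedged sketch, not part of the original file): the pipeline below is
## Corpus() -> a sequence of tm_map() cleaning steps -> DocumentTermMatrix(), where
## each tm_map() call applies one transformation to every document, e.g.:
## inspect(tm_map(Corpus(VectorSource(c("Some TEXT!", "More text."))), removePunctuation))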
21 | ## 22 | ########################################################################## 23 | 24 | ## load libraries 25 | library(stm) 26 | library(tm) 27 | library(SnowballC) 28 | 29 | ## load data 30 | data <- read.csv("Datasets/SIMpoliticalTweets.txt", header=FALSE) 31 | data 32 | names(data) <- "text" 33 | data 34 | 35 | #trumptweets <- fromJSON("trump_json_files_20190707.txt") 36 | #data <- trumptweets 37 | 38 | ## 39 | ##data <- data[1:1000,] 40 | 41 | ## create character vector for processing 42 | newtext <- as.character(data$text) 43 | length(newtext) 44 | 45 | 46 | ## use gsub to remove special characters that usually cause errors if left for later 47 | newtext <- gsub("[^0-9A-Za-z///' ]", "", newtext) 48 | newtext <- gsub("[^[:alnum:]///' ]", "", newtext) 49 | newtext <- gsub("[^\x20-\x7F\x0D\x0A]", "", newtext) # remove all non-ascii characters 50 | newtext <- gsub("http.*", "", newtext) # replace all of the urls 51 | newtext <- gsub("www.*", "", newtext) # 52 | 53 | ## data$newtext 54 | data$newtext <- newtext 55 | 56 | ## convert to corpus object using additional functions from the tm package 57 | ## the tm_map function takes as its first argument the vector of text and a function as its second argument 58 | corpus <- Corpus(VectorSource(newtext)) 59 | corpus <- tm_map(corpus, removePunctuation) 60 | corpus <- tm_map(corpus, removeNumbers) 61 | corpus <- tm_map(corpus, stripWhitespace) 62 | corpus <- tm_map(corpus, content_transformer(tolower)) ## content_transformer() keeps the corpus structure intact 63 | corpus <- tm_map(corpus, removeWords, stopwords("english")) 64 | corpus <- tm_map(corpus, stemDocument) 65 | 66 | ## print to screen 67 | inspect(corpus[1:11]) 68 | 69 | 70 | ## make document by term matrix 71 | DTM <- DocumentTermMatrix(corpus) 72 | DTM 73 | 74 | 75 | ## print DTM to screen 76 | inspect(DTM) 77 | inspect(DTM[1:11,1:12]) 78 | -------------------------------------------------------------------------------- /R-Program-Lessons/Day06_Text_as_data_programs/R_Demo_text_as_wikip_stm.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_text_as_wikip_stm.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (3B) 5 | ## University of Essex Summer School 2023 6 | ## 7 | ## Date: 2023-08-13 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | ## 15 | ## This tutorial demonstrates how the Structural Topic Model functions for a real set of documents. 16 | ## 17 | ## Each unit in the dataset represents a randomly selected sentence from wikipedia.
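## Aside (a hedged sketch, not part of the original file): the corpus read below is
## large (apparently one million sentences, per the file name), so for a quick test
## run you can read just the first chunk:
## wikip_word_dat <- readLines("one_meelyun_sentences.txt", n=10000)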
18 | ## 19 | ########################################################################## 20 | 21 | library(stm) 22 | 23 | wikip_word_dat <- readLines("one_meelyun_sentences.txt") 24 | head(wikip_word_dat) 25 | 26 | n <- length(wikip_word_dat) 27 | n 28 | 29 | wikip_word_dat <- data.frame(id=1:n, text=wikip_word_dat) 30 | head(wikip_word_dat) 31 | 32 | 33 | wikip_word_dat$human_rights_test <- as.numeric(grepl("human rights", wikip_word_dat$text)) 34 | table(wikip_word_dat$human_rights_test) 35 | 36 | wikip_word_dat$civil_rights_test <- as.numeric(grepl("civil rights", wikip_word_dat$text)) 37 | table(wikip_word_dat$civil_rights_test) 38 | 39 | ## preprocess the documents 40 | ## This function uses functions from the tm package (see the tm Demo for more details) 41 | prep <- textProcessor(documents=wikip_word_dat$text, meta=wikip_word_dat) 42 | 43 | save(prep, file="wikip_word_dat_stm_prep.Rdata") ## save the processed prep object (the out object is not created until below) 44 | 45 | ## inspect 46 | head(prep$documents) 47 | head(prep$vocab) 48 | head(prep$meta) 49 | 50 | 51 | ## prepDocuments 52 | ## filter the vocabulary (stemming and stop-word removal already happened inside textProcessor above) 53 | out <- prepDocuments(prep$documents, prep$vocab, prep$meta) 54 | 55 | save(out, file="wikip_word_dat_stm_out.Rdata") 56 | 57 | ## inspect 58 | head(out$documents) 59 | head(out$vocab) 60 | head(out$meta) 61 | 62 | ## fit a structural topic model 63 | #fit <- stm(documents=out$documents, vocab=out$vocab, data=out$meta, K=20) 64 | #fit <- stm(documents=out$documents, vocab=out$vocab, data=out$meta, K=40) 65 | fit <- stm(documents=out$documents, vocab=out$vocab, data=out$meta, K=100) 66 | 67 | #fit_2 <- fit 68 | #fit_3 <- fit 69 | #fit_10 <- fit 70 | #fit_20 <- fit 71 | fit_100 <- fit 72 | 73 | ## display topic probabilities 74 | head(fit$theta) 75 | 76 | dim(fit$theta) 77 | summary(head(fit$theta, 1000)) 78 | 79 | #save(fit, file="wikip_word_dat_stm20.Rdata") 80 | #save(fit, file="wikip_word_dat_stm40.Rdata") 81 | save(fit, file="wikip_word_dat_stm100.Rdata") 82 | 83 | 84 | -------------------------------------------------------------------------------- /R-Program-Lessons/Day06_Text_as_data_programs/R_twitter_examples/R_Demo_text_as_data_twitteR_get_twitter_users.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_text_as_data_twitteR_get_twitter_users.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 5 | ## University of Essex Summer School 2022 6 | ## 7 | ## Date: 2022-08-08 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | ## 15 | ## setup to access the twitter API using R 16 | ## 17 | ## Create a Twitter application at http://dev.twitter.com. Make sure to give the app read, write and direct message authority. 18 | ## Take note of the following values from the Twitter app page: "API key", "API secret", "Access token", and "Access token secret".
19 | 20 | ########################################################################## 21 | 22 | #install.packages("twitteR") 23 | library(twitteR) 24 | 25 | 26 | ## (1a) Go to twitter's developer page: https://developer.twitter.com/ 27 | ## (1b) Click to create an App 28 | ## (1c) For the description, you can say something like this "learn to use twitteR API for a college data science course at the University of Michigan." 29 | ## 30 | ## (2) Once you have successfully set up your app, you will be able to get the four strings that you will fill in below 31 | ## 32 | ## Take note of the following values from the Twitter app page: "API key", "API secret", "Access token", and "Access token secret". 33 | ## 34 | ## set keys and tokens to access the twitter API 35 | consumer_key <- "your_consumer_key" 36 | consumer_secret <- "your_consumer_secret" 37 | access_token <- "your_access_token" 38 | access_secret <- "your_access_secret" 39 | 40 | setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret) 41 | 42 | ## (3) Run the following code to see if you successfully set up API access 43 | amnestyusa_tweets <- userTimeline('amnestyusa', n=100) 44 | ## 45 | ## (4) Pick a Twitter account from your country of interest (or you can use the amnestyusa account from above) 46 | ## 47 | ## (5) Make a DTM (Document-by-Term matrix) for the first 100 tweets from the account you have selected or the amnestyusa account 48 | ## 49 | ## (6a) Which words are most commonly used in the corpus of 100 tweets? 50 | ## (6b) Which words are the least commonly used in the corpus of 100 tweets? 51 | ## (6c) Create a barplot of the most frequent words 52 | 53 | 54 | ########################################################################## 55 | ## additional examples: 56 | ## 57 | ## grab the most recent 100 tweets from Barack Obama 58 | Obama_ut <- userTimeline('barackobama', n=100) 59 | 60 | ## grab the max number of tweets for the President of Senegal Macky Sall 61 | Macky_Sall_ut <- userTimeline('Macky_Sall', n=3200) 62 | -------------------------------------------------------------------------------- /R-Program-Lessons/Day06_Text_as_data_programs/R_twitter_examples/R_Demo_text_as_data_twitteR_get_twitter_users_Oauth_v2.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_text_as_data_twitteR_get_twitter_users_Oauth_v2.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (2F) 5 | ## University of Essex Summer School 2022 6 | ## 7 | ## Date: 2022-08-08 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | ## 15 | ## setup to access the twitter API using R 16 | ## 17 | ## Create a Twitter application at http://dev.twitter.com. Make sure to give the app read, write and direct message authority. 18 | ## Take note of the following values from the Twitter app page: "API key", "API secret", "Access token", and "Access token secret".
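## Aside (a hedged sketch, not part of the original exercise solutions, for steps
## (5)-(6) below): once a document-term matrix DTM exists, term frequencies and a
## barplot of the most common words can be computed with the usual tm idiom:
## freq <- sort(colSums(as.matrix(DTM)), decreasing=TRUE)
## barplot(head(freq, 10), las=2)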
19 | 20 | ########################################################################## 21 | 22 | #install.packages("twitteR") 23 | library(twitteR) 24 | 25 | 26 | ## (1a) Go to twitter's developer page: https://developer.twitter.com/ 27 | ## SEE THIS LINK: https://developer.twitter.com/en/docs/tutorials/getting-started-with-r-and-v2-of-the-twitter-api 28 | ## (1b) Click to create an App 29 | ## (1c) For the description, you can say something like this "learn to use twitteR API for a college data science course at the University of Michigan." 30 | ## 31 | ## (2) Once you have successfully set up your app, you will be able to get the four strings that you will fill in below 32 | ## 33 | ## Take note of the following values from the Twitter app page: "API key", "API secret", "Access token", and "Access token secret". 34 | ## 35 | ## set keys and tokens to access the twitter API 36 | consumer_key <- "your_consumer_key" 37 | consumer_secret <- "your_consumer_secret" 38 | access_token <- "your_access_token" 39 | access_secret <- "your_access_secret" 40 | 41 | setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret) 42 | 43 | ## (3) Run the following code to see if you successfully set up API access 44 | amnestyusa_tweets <- userTimeline('amnestyusa', n=100) 45 | ## 46 | ## (4) Pick a Twitter account from your country of interest (or you can use the amnestyusa account from above) 47 | ## 48 | ## (5) Make a DTM (Document-by-Term matrix) for the first 100 tweets from the account you have selected or the amnestyusa account 49 | ## 50 | ## (6a) Which words are most commonly used in the corpus of 100 tweets? 51 | ## (6b) Which words are the least commonly used in the corpus of 100 tweets? 52 | ## (6c) Create a barplot of the most frequent words 53 | 54 | 55 | ########################################################################## 56 | ## additional examples: 57 | ## 58 | ## grab the most recent 100 tweets from Barack Obama 59 | Obama_ut <- userTimeline('barackobama', n=100) 60 | 61 | ## grab the max number of tweets for the President of Senegal Macky Sall 62 | Macky_Sall_ut <- userTimeline('Macky_Sall', n=3200) 63 | -------------------------------------------------------------------------------- /R-Program-Lessons/Day06_Text_as_data_programs/groundhog_library_func.R: -------------------------------------------------------------------------------- 1 | 2 | groundhog_library_func <- function(groundhog=FALSE, regular_install=FALSE){ 3 | ## Do this (set to TRUE) to load libraries using the version from when the scripts were originally run 4 | if(groundhog){ 5 | ## load an older version of the libraries 6 | remotes::install_github('CredibilityLab/groundhog') 7 | library(groundhog) 8 | pkgs <- c("gtrendsR", "countrycode", "stm", "tm", "MASS", "bcp", "ngramr", "rvest", "plm", "lmtest", "WDI", "boot", "forecast", "acled.api", "ggplot2", "stargazer", "httr", "lubridate", "xtable") 9 | groundhog.library(pkgs,'2022-05-23') 10 | } else if(regular_install==TRUE){ 11 | ## or install and load the more recent version of the libraries 12 | install.packages(c("gtrendsR", "countrycode", "stm", "tm", "MASS", "bcp", "ngramr", "rvest", "plm", "lmtest", "WDI", "boot", "forecast", "acled.api", "ggplot2", "stargazer", "httr", "lubridate", "xtable")) ## install.packages() expects a single character vector of package names 13 | library(gtrendsR) 14 | library(countrycode) 15 | library(stm) 16 | library(tm) 17 | library(MASS) 18 | library(bcp) 19 | library(ngramr) 20 | library(rvest) 21 | library(plm) 22 | library(lmtest) 23 | library(WDI) 24 | library(boot) 25 | library(forecast) 26
| library(acled.api) 27 | library(ggplot2) 28 | library(stargazer) 29 | library(httr) 30 | library(lubridate) 31 | library(xtable) 32 | } else{ 33 | ## or just load the more recent version of the libraries 34 | library(gtrendsR) 35 | library(countrycode) 36 | library(stm) 37 | library(tm) 38 | library(MASS) 39 | library(bcp) 40 | library(ngramr) 41 | library(rvest) 42 | library(plm) 43 | library(lmtest) 44 | library(WDI) 45 | library(boot) 46 | library(forecast) 47 | library(acled.api) 48 | library(ggplot2) 49 | library(stargazer) 50 | library(httr) 51 | library(lubridate) 52 | library(xtable) 53 | } 54 | } 55 | 56 | -------------------------------------------------------------------------------- /R-Program-Lessons/Day07_Applied_Machine_Learning_AML_Intro/R_Demo_AML_MNIST_example.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_AML_MNIST_example.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (3B) 5 | ## University of Essex Summer School 2023 6 | ## 7 | ## Date: 2023-08-13 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ## 13 | ########################################################################## 14 | ## Introduction to tutorial: 15 | ## 16 | ## Fit and evaluate a neural network model to the MNIST handwritten digit dataset. 17 | ## 18 | ## https://keras.rstudio.com/index.html 19 | ## https://keras.rstudio.com/articles/faq.html 20 | ## https://keras.io/losses/ 21 | ## https://cran.r-project.org/web/packages/keras/keras.pdf 22 | ## https://cran.rstudio.com/web/packages/keras/vignettes/sequential_model.html 23 | ## 24 | ########################################################################## 25 | 26 | ## load package 27 | #install.packages("keras") ## uncomment and run once if the keras package is not yet installed 28 | library(keras) 29 | #mnist <- dataset_mnist() 30 | 31 | ## OR 32 | 33 | ## load MNIST data from dslabs package 34 | library(dslabs) 35 | mnist <- read_mnist() 36 | 37 | ## or just load it 38 | ##load("mnist.Rdata") 39 | 40 | ## describe the structure of the data 41 | names(mnist) 42 | names(mnist$train) 43 | names(mnist$test) 44 | 45 | length(mnist) 46 | length(mnist$train) 47 | length(mnist$test) 48 | 49 | is.list(mnist) 50 | is.list(mnist$train) 51 | is.list(mnist$test) 52 | 53 | dim(mnist$train$images) 54 | dim(mnist$test$images) 55 | 56 | head(mnist$test$labels) 57 | head(mnist$train$labels) 58 | 59 | ## transform the data 60 | x_train <- mnist$train$images/255 61 | x_test <- mnist$test$images/255 62 | y_test <- mnist$test$labels 63 | y_train <- mnist$train$labels 64 | 65 | matrix(x_train[1,], nrow=28, ncol=28) 66 | 67 | image(x=1:28, y=1:28, matrix(x_train[1,], nrow=28, ncol=28)[1:28,28:1]) 68 | 69 | par(mfrow=c(3,3), mar=c(.5,.5,.5,.5)) 70 | for(i in 1:9){ 71 | image(x=1:28, y=1:28, matrix(x_train[i,], nrow=28, ncol=28)[1:28,28:1], yaxt="n", xaxt="n", xlab="", ylab="", col=gray.colors(100, start = 0.99, end = 0.01)) 72 | grid(nx=28, ny=28, col="black") 73 | } 74 | 75 | par(mfrow=c(10,10), mar=c(.5,.5,.5,.5)) 76 | for(i in 1:100) image(x=1:28, y=1:28, matrix(x_train[i,], nrow=28, ncol=28)[1:28,28:1], yaxt="n", xaxt="n", xlab="", ylab="", col=gray.colors(100, start = 0.99, end = 0.01)) 77 | 78 | ## stretch out the matrix into one visual row 79 | par(mfrow=c(1,1), mar=c(.5,.5,.5,.5)) 80 | image(x=1:784, y=1, 
z=matrix(x_train[1,], nrow=784, ncol=1), yaxt="n", xaxt="n", xlab="", ylab="", col=gray.colors(100, start = 0.99, end = 0.01), ylim=c(-5,5)) 81 | 82 | ## stretch out the matrix into nine visual rows 83 | par(mfrow=c(1,1), mar=c(.5,.5,.5,.5)) 84 | image(x=1:784, y=1:9, z=t(x_train[9:1,1:784]), yaxt="n", xaxt="n", xlab="", ylab="", col=gray.colors(100, start = 0.99, end = 0.01)) 85 | 86 | ## stretch out the matrix into 100 visual rows 87 | par(mfrow=c(1,1), mar=c(.5,.5,.5,.5)) 88 | image(x=1:784, y=1:100, z=t(x_train[100:1,1:784]), yaxt="n", xaxt="n", xlab="", ylab="", col=gray.colors(100, start = 0.99, end = 0.01)) 89 | 90 | 91 | ## calculate the row sums (total ink per digit image) for a boxplot 92 | x_train_sum <- apply(x_train, 1, sum) 93 | 94 | #y_train_value <- apply(y_train, 1, which.max) - 1 95 | 96 | ## simple prediction: the total ink alone already separates some digits 97 | par(mar=c(5,4,1,1)) 98 | boxplot(x_train_sum ~ y_train) 99 | 100 |
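101 | ## The introduction promises a fitted neural network; what follows is only a minimal sketch of one way to do that, assuming the keras package with a working TensorFlow backend (see install_keras()) and the scaled x_train/y_train/x_test/y_test objects created above. 102 | model <- keras_model_sequential() 103 | model %>% layer_dense(units = 128, activation = "relu", input_shape = c(784)) %>% layer_dense(units = 10, activation = "softmax") 104 | 105 | ## the integer labels (0-9) pair with the sparse categorical cross-entropy loss 106 | model %>% compile(loss = "sparse_categorical_crossentropy", optimizer = "adam", metrics = "accuracy") 107 | 108 | ## fit on the training digits, then evaluate loss and accuracy on the held-out test digits 109 | history <- model %>% fit(x_train, y_train, epochs = 5, batch_size = 128, validation_split = 0.2) 110 | model %>% evaluate(x_test, y_test)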
-------------------------------------------------------------------------------- /R-Program-Lessons/Day07_Applied_Machine_Learning_AML_Intro/R_Demo_AML_neuralnet_gradiant_decent_glm.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_AML_neuralnet_gradiant_decent_glm.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (3B) 5 | ## University of Essex Summer School 2023 6 | ## 7 | ## Date: 2023-08-13 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | ## 15 | ## Overview of Gradient Descent for a generalized linear model example. 16 | ## 17 | ## Gradient descent is an iterative process used to find the best parameter value(s) in any model and especially a neural network. 18 | ## 19 | ########################################################################## 20 | 21 | ## load libraries 22 | library(MASS) 23 | library(boot) 24 | library(gtools) 25 | 26 | lr <- .5 ## set the learning rate, which varies on the unit interval (0 to 1] 27 | n <- 100 28 | x <- rnorm(n) 29 | 30 | alpha <- -1 31 | beta <- -2 32 | 33 | y <- rbinom(n, 1, prob=inv.logit(alpha + beta*x)) 34 | #y <- rbinom(n, 1, prob=inv.logit(alpha + beta*x + rnorm(n))) 35 | 36 | ## create the design matrix, then start with an arbitrary initial guess for the parameters 37 | X_mat <- cbind(1,x) 38 | 39 | alpha_hat <- 4 40 | beta_hat <- 4 41 | iterations <- 1000 42 | 43 | loss <- variance <- NA 44 | 45 | y_hat <- matrix(NA, nrow=n, ncol=iterations) 46 | y_error <- matrix(NA, nrow=n, ncol=iterations) 47 | delta <- matrix(NA, nrow=2, ncol=iterations) 48 | 49 | ## sequential iterations to evaluate the loss function 50 | for (j in 1:iterations){ 51 | 52 | ## calculate the predicted y_hat based on the observed x variable and the current best guess of alpha and beta 53 | y_hat[,j] <- inv.logit(alpha_hat[j] + beta_hat[j] * x) 54 | 55 | ## this works too but not as well as above 56 | #y_hat[,j] <- rbinom(n,1,prob=inv.logit(alpha_hat[j] + beta_hat[j] * x)) 57 | 58 | ## the difference between the predicted y (y_hat) and y is the error for y 59 | y_error[,j] <- y_hat[,j] - y 60 | 61 | ## the estimated error is used to calculate the unexplained variance between y and y_hat, which is the sum of squared errors 62 | loss[j] <- sum(y_error[,j]^2) 63 | 64 | ## calculate the gradient at this point (this works because the errors are independent of the column vectors in X) 65 | ##delta[1:2,j] <- (t(X_mat) %*% y_error[,j]) * (1/n) 66 | delta[1,j] <- sum(X_mat[,1] * y_error[,j])/n 67 | delta[2,j] <- sum(X_mat[,2] * y_error[,j])/n 68 | 69 | ## shift the estimates along the gradient (subtract when the error is y_hat - y; add when it is y - y_hat) 70 | alpha_hat[j+1] <- alpha_hat[j] - (lr*delta[1,j]) 71 | beta_hat[j+1] <- beta_hat[j] - (lr*delta[2,j]) 72 | } 73 | 74 | ## estimate the glm model for comparison 75 | fit <- glm(y~x, family=binomial("logit")) 76 | summary(fit) 77 | 78 | ## print the last value of the sequence of parameter estimates 79 | alpha_hat[length(alpha_hat)] 80 | beta_hat[length(beta_hat)] 81 | 82 | 83 | ## graph the values as a function of the loss statistic 84 | ALPHA_seq <- seq(from=-4,4,.05) 85 | BETA_seq <- seq(from=-4,4,.05) 86 | 87 | LOSS <- matrix(NA, nrow=length(ALPHA_seq), ncol=length(BETA_seq)) 88 | 89 | ## grid search for graphing 90 | for(i in 1:length(ALPHA_seq)){ 91 | for(j in 1:length(BETA_seq)){ 92 | Y_hat <- inv.logit(ALPHA_seq[i] + BETA_seq[j] * X_mat[,2]) 93 | LOSS[i,j] <- sum((y-Y_hat)^2) 94 | 95 | } 96 | } 97 | 98 | ## plot the loss surface 99 | contour(ALPHA_seq, BETA_seq, LOSS, xlab=expression(hat(alpha)), ylab=expression(hat(beta)), cex.lab=1.5) 100 | lines(alpha_hat, beta_hat, col=2, lwd=4) 101 | points(alpha_hat, beta_hat, col="navy", pch=.5) 102 |
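103 | ## A minimal follow-up sketch, assuming the x, y, X_mat, n, and inv.logit objects from above are still in memory: wrap the update loop in a function to compare how the learning rate changes the loss path. 104 | gd_logit <- function(lr, iterations=1000, a0=4, b0=4){ 105 | a <- a0 106 | b <- b0 107 | loss <- NA 108 | for(j in 1:iterations){ 109 | p <- inv.logit(a[j] + b[j]*x) ## predicted probabilities at the current guess 110 | err <- p - y ## prediction errors 111 | loss[j] <- sum(err^2) 112 | a[j+1] <- a[j] - lr*sum(X_mat[,1]*err)/n 113 | b[j+1] <- b[j] - lr*sum(X_mat[,2]*err)/n 114 | } 115 | loss 116 | } 117 | 118 | ## a larger learning rate drops faster at first; a smaller one descends more slowly but more smoothly 119 | plot(gd_logit(lr=.5), type="l", xlab="iteration", ylab="loss") 120 | lines(gd_logit(lr=.05), col=2)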
-------------------------------------------------------------------------------- /R-Program-Lessons/Day07_Applied_Machine_Learning_AML_Intro/R_Demo_AML_neuralnet_gradiant_decent_lm.R: -------------------------------------------------------------------------------- 1 | #### R_Demo_AML_neuralnet_gradiant_decent_lm.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (3B) 5 | ## University of Essex Summer School 2023 6 | ## 7 | ## Date: 2023-08-13 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | ## 15 | ## Overview of Gradient Descent for a linear model with two parameters (intercept and slope). 16 | ## 17 | ## Gradient descent is an iterative process used to find the best parameter value(s) in any model and especially a neural network. 18 | ## 19 | ########################################################################## 20 | 21 | 22 | ## load libraries 23 | library(MASS) 24 | 25 | ## set the learning rate, which varies on the unit interval (0 to 1] 26 | lr <- .2 27 | 28 | n <- 100 29 | x <- rnorm(n) 30 | 31 | alpha <- -1 32 | beta <- -2 33 | 34 | y <- alpha + beta*x + rnorm(n) 35 | plot(x=x, y=y) 36 | fit <- lm(y ~ x) 37 | summary(fit) 38 | abline(reg=fit, col=2) 39 | 40 | X_mat <- cbind(1,x) 41 | head(X_mat) 42 | 43 | ## start with an arbitrary initial guess 44 | alpha_hat <- 1 45 | beta_hat <- 4 46 | 47 | 48 | iterations <- 100 49 | 50 | loss <- NA 51 | variance <- NA 52 | 53 | y_hat <- matrix(NA, nrow=n, ncol=iterations) 54 | y_error <- matrix(NA, nrow=n, ncol=iterations) 55 | delta <- matrix(NA, nrow=2, ncol=iterations) 56 | 57 | ## sequential iterations to evaluate the loss function 58 | for (j in 1:iterations){ 59 | 60 | ## calculate the predicted y_hat based on the observed x variable and the current best guess of alpha and beta 61 | y_hat[,j] <- alpha_hat[j] + beta_hat[j] * x 62 | 63 | ## the difference between the predicted y (y_hat) and y is the error for y 64 | y_error[,j] <- y_hat[,j] - y 65 | 66 | ## the estimated error is used to calculate the unexplained variance between y and y_hat, which is the sum of squared errors 67 | loss[j] <- sum(y_error[,j]^2) 68 | 69 | ## calculate the gradient at this point (this works because the errors are independent of the column vectors in X) 70 | ##delta[1:2,j] <- (t(X_mat) %*% y_error[,j]) * (1/n) 71 | 72 | ## 73 | delta[1,j] <- sum(X_mat[,1] * y_error[,j])/n 74 | delta[2,j] <- sum(X_mat[,2] * y_error[,j])/n 75 | 76 | ## shift the estimates along the gradient (subtract when the error is y_hat - y; add when it is y - y_hat) 77 | alpha_hat[j+1] <- alpha_hat[j] - (lr*delta[1,j]) 78 | beta_hat[j+1] <- beta_hat[j] - (lr*delta[2,j]) 79 | } 80 | 81 | ## print the lm summary from above 82 | summary(fit) 83 | 84 | ## print the last value of the sequence of parameter estimates 85 | alpha_hat[length(alpha_hat)] 86 | beta_hat[length(beta_hat)] 87 | 88 | ## check the lm fit 89 | summary(lm(y~x)) 90 | 91 | ## the least squares solution 92 | solve(t(X_mat) %*% X_mat) %*% t(X_mat) %*% y 93 | 94 | ## note: each of these one-column solutions fits a distinct regression model because the covariance between alpha and beta is not accounted for (so the estimates differ from the joint solution above) 95 | solve(t(X_mat[,2]) %*% X_mat[,2]) %*% t(X_mat[,2]) %*% y 96 | solve(t(X_mat[,1]) %*% X_mat[,1]) %*% t(X_mat[,1]) %*% y 97 | 98 | 99 | ## graph the values as a function of the loss statistic 100 | ALPHA_seq <- seq(from=-4,4,.05) 101 | BETA_seq <- seq(from=-4,4,.05) 102 | 103 | LOSS <- matrix(NA, nrow=length(ALPHA_seq), ncol=length(BETA_seq)) 104 | 105 | ## grid search for graphing 106 | for(i in 1:length(ALPHA_seq)){ 107 | for(j in 1:length(BETA_seq)){ 108 | Y_hat <- ALPHA_seq[i] + BETA_seq[j] * X_mat[,2] 109 | LOSS[i,j] <- sum((y-Y_hat)^2) 110 | } 111 | } 112 | 113 | ## plot the log loss 114 | par(mar=c(5,5,1,1)) 115 | contour(ALPHA_seq, BETA_seq, log(LOSS), xlab=expression(hat(alpha)), ylab=expression(hat(beta)), cex.lab=1.5) 116 | lines(alpha_hat, beta_hat, col=2, lwd=4) 117 | points(alpha_hat, beta_hat, col="navy", pch=.5) 118 | 119 | 120 |
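121 | ## A short hedged sketch (assuming the objects above are still in memory): the gradient-descent estimates, the lm() coefficients, and the closed-form least squares solution should agree closely. 122 | round(cbind(gradient_descent = c(alpha_hat[length(alpha_hat)], beta_hat[length(beta_hat)]), 123 | lm_fit = coef(fit), 124 | least_squares = as.vector(solve(t(X_mat) %*% X_mat) %*% t(X_mat) %*% y)), digits=4)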
-------------------------------------------------------------------------------- /R-Program-Lessons/Day07_Applied_Machine_Learning_AML_Intro/R_Demo_AML_neuralnet_gradiant_decent_mu.R: -------------------------------------------------------------------------------- 1 | ## R_Demo_AML_neuralnet_gradiant_decent_mu.R 2 | ########################################################################## 3 | ## INSTRUCTOR: Christopher Fariss 4 | ## COURSE NAME: Advanced Computational Methods for Social Media and Textual Data (3B) 5 | ## University of Essex Summer School 2023 6 | ## 7 | ## Date: 2023-08-13 8 | ## 9 | ## Please e-mail me if you find any errors or have any suggestions (either email is fine) 10 | ## e-mail: cjf0006@gmail.com 11 | ## e-mail: cjfariss@umich.edu 12 | ########################################################################## 13 | ## Introduction to tutorial: 14 | ## 15 | ## Overview of Gradient Descent for a model with one parameter for a mean. 16 | ## 17 | ## Gradient descent is an iterative process used to find the best parameter value(s) in any model and especially a neural network. 18 | ## 19 | ########################################################################## 20 | 21 | 22 | 23 | 24 | 25 | ## load libraries 26 | library(MASS) 27 | 28 | ## set the learning rate, which varies on the unit interval (0 to 1]; three values to try (the last assignment is the one used) 29 | lr <- .2 30 | lr <- .05 31 | lr <- .9 32 | 33 | y <- c(1,2,3,4,5) 34 | ##y <- rnorm(1000, pi, 1) 35 | n <- length(y) 36 | 37 | y 38 | n 39 | 40 | 41 | iterations <- 100 42 | 43 | loss <- NA 44 | 45 | mu_hat <- -1.5 46 | 47 | y_hat <- matrix(NA, nrow=n, ncol=iterations) 48 | y_error <- matrix(NA, nrow=n, ncol=iterations) 49 | 50 | delta_mu_hat <- NA 51 | 52 | ## sequential iterations to evaluate the loss function 53 | for (j in 1:iterations){ 54 | 55 | ## y_hat: calculate the predicted y_hat based on the current best guess of mu 56 | 57 | y_hat[,j] <- mu_hat[j] 58 | 59 | ## residual: the difference between the predicted y (y_hat) and y is the error for y 60 | y_error[,j] <- y_hat[,j] - y 61 | 62 | ## loss: the estimated error is used to calculate the unexplained variance between y and y_hat, which is the sum of squared errors 63 | loss[j] <- sum(y_error[,j]^2) 64 | 65 | ## the average of the unexplained differences, which is the gradient at the current estimate 66 | delta_mu_hat[j] <- sum(y_error[,j])/n 67 | 68 | ## shift the estimate along the gradient (subtract when the error is y_hat - y; add when it is y - y_hat) 69 | mu_hat[j+1] <- mu_hat[j] - (lr*delta_mu_hat[j]) 70 | } 71 | 72 | mu_hat[length(mu_hat)] 73 | 74 | plot(mu_hat) 75 | 76 | plot(mu_hat[-1], delta_mu_hat) 77 |
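78 | ## A final hedged check, assuming the y and mu_hat objects from above: minimizing the sum of squared errors with a single parameter has the sample mean as its closed-form solution, so gradient descent should land on mean(y). 79 | mean(y) 80 | mu_hat[length(mu_hat)] - mean(y) ## approximately 0 after enough iterations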
-------------------------------------------------------------------------------- /R-Program-Lessons/Day10_Additional_Programs/dynamic_IRT_practice_v2.R: -------------------------------------------------------------------------------- 1 | ## dynamic_IRT_practice_v2.R: simulate a latent random walk, generate four binary indicators from it, and recover the latent trait with a dynamic IRT model in rstan 2 | library(boot) 3 | library(rstan) 4 | 5 | time_index <- 1:30 6 | time_index 7 | 8 | # simulation for checking the distribution of correlations 9 | #value <- NA 10 | #for(j in 1:1000){ 11 | x <- rnorm(1, mean=0, sd=1) 12 | x 13 | 14 | for(i in 2:length(time_index)){ 15 | x[i] <- rnorm(1, mean=x[i-1], sd=1) 16 | } 17 | x 18 | 19 | #plot(x, type="l") 20 | #MASS::truehist(x) 21 | 22 | #cbind(x[2:length(time_index)], x[1:(length(time_index)-1)]) 23 | 24 | #value[j] <- cor(x[2:length(time_index)], x[1:(length(time_index)-1)]) 25 | #} 26 | 27 | #MASS::truehist(value) ## only meaningful after running the commented simulation loop above 28 | 29 | alpha1 <- -1 30 | beta1 <- 2 31 | prob_y1 <- inv.logit(alpha1 + beta1*x + rnorm(length(time_index))) 32 | y1 <- rbinom(length(time_index), 1, prob=prob_y1) 33 | #y1 <- alpha + beta*x + 34 | 35 | alpha2 <- 0 36 | beta2 <- 2 37 | prob_y2 <- inv.logit(alpha2 + beta2*x + rnorm(length(time_index))) 38 | y2 <- rbinom(length(time_index), 1, prob=prob_y2) 39 | 40 | alpha3 <- 1 41 | beta3 <- 2 42 | prob_y3 <- inv.logit(alpha3 + beta3*x + rnorm(length(time_index))) 43 | y3 <- rbinom(length(time_index), 1, prob=prob_y3) 44 | 45 | alpha4 <- 1 46 | beta4 <- 2 47 | prob_y4 <- inv.logit(alpha4 + beta4*x + rnorm(length(time_index))) 48 | y4 <- rbinom(length(time_index), 1, prob=prob_y4) 49 | 50 | additive_scale <- y1 + y2 + y3 + y4 51 | 52 | plot(additive_scale ~ x) 53 | 54 | par(mfrow=c(1,2), mar=c(4,4,1,1)) 55 | plot(additive_scale, type="l") 56 | plot(x, type="l") 57 | 58 | cor(additive_scale,x) 59 | 60 | table(additive_scale) 61 | 62 | model <- " 63 | 64 | data{ 65 | int n; 66 | int k; 67 | int y1[n]; 68 | int y2[n]; 69 | int y3[n]; 70 | int y4[n]; 71 | } 72 | 73 | parameters{ 74 | real theta[n]; 75 | real alpha[k]; 76 | real beta[k]; 77 | real sigma; // note: declared and sampled below but not used in the likelihood 78 | } 79 | 80 | model{ 81 | // priors: a random-walk prior links each theta to the previous period 82 | theta[1] ~ normal(0, 1); 83 | 84 | for(i in 2:n){ 85 | theta[i] ~ normal(theta[i-1], 1); 86 | } 87 | 88 | alpha ~ normal(0,1); 89 | beta ~ normal(0,1); 90 | sigma ~ normal(0,1); 91 | 92 | // likelihood 93 | for(i in 1:n){ 94 | y1[i] ~ bernoulli_logit(alpha[1] + beta[1] * theta[i]); 95 | y2[i] ~ bernoulli_logit(alpha[2] + beta[2] * theta[i]); 96 | y3[i] ~ bernoulli_logit(alpha[3] + beta[3] * theta[i]); 97 | y4[i] ~ bernoulli_logit(alpha[4] + beta[4] * theta[i]); 98 | } 99 | } 100 | 101 | " 102 | 103 | data_list <- list(n=length(time_index), k=4, y1=y1, y2=y2, y3=y3, y4=y4) 104 | data_list 105 | 106 | fit <- stan(model_code=model, data=data_list, iter=1000, chains=4) 107 | fit 108 | 109 | output <- extract(fit) 110 | length(output) 111 | names(output) 112 | 113 | dim(output$theta) 114 | 115 | 116 | theta_hat <- apply(output$theta, MARGIN=2, FUN=mean) 117 | theta_hat 118 | 119 | par(mfrow=c(1,1)) 120 | plot(x=x, y=theta_hat, xlab="true x", ylab="estimated theta of x") 121 | abline(reg=lm(theta_hat~x),col=2) 122 | cor(x, theta_hat) 123 | cor(additive_scale, theta_hat) 124 | 125 | apply(output$alpha, MARGIN=2, FUN=mean) 126 | c(alpha1, alpha2, alpha3, alpha4) 127 | 128 | 129 | apply(output$beta, MARGIN=2, FUN=mean) 130 | c(beta1, beta2, beta3, beta4) 131 | 132 | alpha_hat <- apply(output$alpha, MARGIN=2, FUN=mean) 133 | beta_hat <- apply(output$beta, MARGIN=2, FUN=mean) 134 | 135 | ## inflection points in the latent space 136 | inflection_points <- - alpha_hat / beta_hat 137 |
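138 | ## a brief hedged check using the estimates above: each item's fitted response curve crosses probability 0.5 exactly where alpha + beta * theta = 0, i.e., at theta = -alpha/beta, which is why -alpha_hat/beta_hat gives the inflection points 139 | inv.logit(alpha_hat + beta_hat * inflection_points) ## each entry should equal 0.5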
140 | x_seq <- seq(-4,4,.1) 141 | prob_y1_hat <- inv.logit(alpha_hat[1] + beta_hat[1] * x_seq) 142 | prob_y2_hat <- inv.logit(alpha_hat[2] + beta_hat[2] * x_seq) 143 | prob_y3_hat <- inv.logit(alpha_hat[3] + beta_hat[3] * x_seq) 144 | prob_y4_hat <- inv.logit(alpha_hat[4] + beta_hat[4] * x_seq) 145 | 146 | ## plot each item's estimated response curve with its inflection point 147 | par(mfrow=c(2,2)) 148 | plot(x=x_seq, y=prob_y1_hat, type="l") 149 | abline(v=inflection_points[1], col=2); abline(h=.5, lty=2) 150 | 151 | plot(x=x_seq, y=prob_y2_hat, type="l") 152 | abline(v=inflection_points[2], col=2); abline(h=.5, lty=2) 153 | 154 | plot(x=x_seq, y=prob_y3_hat, type="l") 155 | abline(v=inflection_points[3], col=2); abline(h=.5, lty=2) 156 | 157 | plot(x=x_seq, y=prob_y4_hat, type="l") 158 | abline(v=inflection_points[4], col=2); abline(h=.5, lty=2) 159 | 160 | 161 | #apply(output$sigma, MARGIN=2, FUN=mean) 162 | mean(output$sigma) 163 | 164 | -------------------------------------------------------------------------------- /Rplots/Google_search_term_pairs_longlist.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CJFariss/Advanced-Computational-Methods-for-Social-Media-and-Text-Data/4b98fe1c43cc681a9d39e7495953883acd3a76e8/Rplots/Google_search_term_pairs_longlist.pdf --------------------------------------------------------------------------------