├── .Rapp.history ├── README.md ├── Twitter_devices.r ├── r_facebook_gender.r ├── r_foursquare_map.r ├── r_googleplus.r ├── r_instagram.r ├── r_mongoDB.r ├── r_mongoDB_REST.r ├── r_pinterest_machine_learning.R ├── r_rfacebook.r ├── r_twitter_cluster.r ├── sentiment_cloud.r ├── sentiment_datumbox.r ├── sentiment_viralheat.r ├── spam_class_r.r └── twitter_authentication.r /.Rapp.history: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JulianHill/R-Tutorials/670374ca3c13fc9c656ccddee5a411fca10e01a9/.Rapp.history -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | R-Tutorials 2 | =========== 3 | 4 | Code from the R tutorials on my blog 5 | -------------------------------------------------------------------------------- /Twitter_devices.r: -------------------------------------------------------------------------------- 1 | tweets = searchTwitter("Social Media", n=20, cainfo="cacert.pem") 2 | 3 | devices <- sapply(tweets, function(x) x$getStatusSource()) 4 | 5 | devices <- gsub("</a>", "", devices) 6 | devices <- strsplit(devices, ">") 7 | 8 | devices <- sapply(devices,function(x) ifelse(length(x) > 1, x[2], x[1])) 9 | 10 | pie(table(devices)) -------------------------------------------------------------------------------- /r_facebook_gender.r: -------------------------------------------------------------------------------- 1 | 2 | require(Rfacebook) 3 | 4 | # Change for your needs 5 | page_name <- "forbes" 6 | number_posts <- 2 7 | token <- "XXX" 8 | 9 | #Get the general page info 10 | page <- getPage(page_name, token, n = number_posts, feed = FALSE) 11 | 12 | 13 | #Extract the post ids 14 | posts <- page$id 15 | 16 | 17 | data_frame_gender <- data.frame(post=character(),male=numeric(),female=numeric(),etc=numeric(),likes=numeric(),type=character(),stringsAsFactors=FALSE) 18 | #process each post and analyze the gender distribution of the likes 19 | for(i in 1:length(posts)) 20 | { 21 | temp <- posts[i] 22 | 23 | post <- getPost(temp,token) 24 | 25 | data_frame_gender[i,1] <- post$post$message 26 | data_frame_gender[i,5] <- post$post$likes 27 | data_frame_gender[i,6] <- post$post$type 28 | 29 | gender_frame <- data.frame(gender=character(),stringsAsFactors=FALSE) 30 | 31 | for(j in 1:length(post$likes$from_id)) 32 | { 33 | likes <- post$likes$from_id 34 | user_id <- likes[j] 35 | 36 | user <- getUsers(user_id,token=token) 37 | 38 | gender <- user$gender 39 | 40 | gender_frame[nrow(gender_frame)+1,] <- gender 41 | 42 | } 43 | 44 | number_males <- nrow(subset(gender_frame, gender=="male")) 45 | number_females <- nrow(subset(gender_frame, gender=="female")) 46 | number_etc <- data_frame_gender[i,5] - (number_males+number_females) 47 | 48 | data_frame_gender[i,2] <- number_males 49 | data_frame_gender[i,3] <- number_females 50 | data_frame_gender[i,4] <- number_etc 51 | 52 | } 53 | 54 | 55 | slices <- c(sum(data_frame_gender$male),sum(data_frame_gender$female),sum(data_frame_gender$etc)) 56 | 57 | pct <- round(slices/sum(slices)*100) 58 | lbls <- names(data_frame_gender[2:4]) 59 | lbls <- paste(lbls, pct) # add percents to labels 60 | lbls <- paste(lbls,"%",sep="") # add % to labels 61 | 62 | pie(slices, labels = lbls, main="Gender Distribution of all analyzed posts") 63 | 64 | -------------------------------------------------------------------------------- /r_foursquare_map.r: 
-------------------------------------------------------------------------------- 1 | library(rjson) 2 | library(RCurl) 3 | library(httr) 4 | 5 | #Authentication: 6 | require(devtools) #install if necessary: install.packages("devtools") 7 | 8 | dev_mode(on=T) 9 | 10 | install_github("ThinkToStartR",username="JulianHill") 11 | 12 | 13 | require(ThinkToStartR) 14 | library(rjson) 15 | require(RCurl) 16 | 17 | 18 | token <- ThinkToStart("Foursquare_auth",app_name="R_Test",app_id="XXX",app_secret="XXX") 19 | 20 | 21 | ####Get the Data 22 | 23 | data <- fromJSON(getURL(paste('https://api.foursquare.com/v2/users/self/venuehistory?oauth_token=',token,'&v=',format(Sys.time(), "%Y%m%d"),sep=""))) 24 | 25 | response <- data$response 26 | venues <- response$venues$items 27 | 28 | 29 | no_venues = length(data$response$venues$items) 30 | 31 | df = data.frame(no = 1:no_venues) 32 | 33 | 34 | for (i in 1:nrow(df)){ 35 | 36 | #Add Name and the location of the Venue 37 | df$venue_name[i] <- venues[[i]]$venue$name 38 | df$venue_lat[i] <- venues[[i]]$venue$location$lat 39 | df$venue_lng[i] <- venues[[i]]$venue$location$lng 40 | 41 | ########################## 42 | #Add the address of the location 43 | if(length(venues[[i]]$venue$location$address)>0) 44 | { 45 | df$venue_address[i] <- venues[[i]]$venue$location$address 46 | } 47 | else{ 48 | df$venue_address[i] <- "No Address Available" 49 | 50 | } 51 | 52 | ########################## 53 | #Add the city of the location 54 | if(length(venues[[i]]$venue$location$city)>0) 55 | { 56 | df$venue_city[i] <- venues[[i]]$venue$location$city 57 | } 58 | else{ 59 | df$venue_city[i] <- "No City Available" 60 | 61 | } 62 | 63 | ########################## 64 | #Add the number of check-ins of the venue 65 | df$venue_checkinsCount[i] <- venues[[i]]$venue$stats[[1]] 66 | 67 | ########################## 68 | #Add the URL of the venue if defined 69 | if(length(venues[[i]]$venue$url)>0) 70 | { 71 | df$url[i] <- venues[[i]]$venue$url 72 | } 73 | else{ 74 | df$url[i] <- NA 75 | 76 | } 77 | 78 | } 79 | 80 | 81 | mean_lat <- mean(df$venue_lat) # outcome: 50.90956 82 | mean_lon <- mean(df$venue_lng) # outcome: 7.576119 83 | 84 | require(rCharts) 85 | 86 | map <- Leaflet$new() 87 | map$setView(c(mean_lat, mean_lon), zoom = 5) 88 | 89 | 90 | for (i in 1:no_venues){ 91 | 92 | #Get the name and the number of check-ins of the current venue 93 | name <- df$venue_name[i] 94 | checkins <- df$venue_checkinsCount[i] 95 | 96 | #Add the marker to the map but just add a website link if we have a URL for the venue 97 | 98 | #if no URL is available 99 | if(is.na(df$url[i])) 100 | { map$marker(c(df$venue_lat[i], df$venue_lng[i]), bindPopup = paste(name,'<br>Checkins: ',checkins,sep="")) 101 | } 102 | else 103 | { 104 | map$marker(c(df$venue_lat[i], df$venue_lng[i]), bindPopup = paste(name,'<br>Checkins: ',checkins,'<br><a href="',df$url[i],'">Website</a>',sep="")) 105 | } 106 | 107 | } 108 | 109 | map 110 | 111 | 112 | 113 | -------------------------------------------------------------------------------- /r_googleplus.r: -------------------------------------------------------------------------------- 1 | library(RCurl); 2 | library(RJSONIO); 3 | api_key<-"XXX" 4 | 5 | user_id <- "105616015219357887822" 6 | # Still need to add ssl.verifypeer = FALSE to get a connection :( 7 | # Add a max results parameter in the URL structure to get 100 results (maximum allowed by the API : https://developers.google.com/+/api/) 8 | data <- getURL(paste("https://www.googleapis.com/plus/v1/people/",user_id,"/activities/public?maxResults=100&key=", api_key, sep=""),ssl.verifypeer = FALSE) 9 | js <- fromJSON(data, asText=TRUE); 10 | 11 | df = data.frame(no = 1:length(js$items)) 12 | 13 | for (i in 1:nrow(df)){ 14 | df$kind[i] = js$items[[i]]$verb 15 | df$title[i] = js$items[[i]]$title 16 | df$published[i] = js$items[[i]]$published # add publish date to the df 17 | df$replies[i] = js$items[[i]]$object$replies$totalItems 18 | df$plusones[i] = js$items[[i]]$object$plusoners$totalItems 19 | df$reshares[i] = js$items[[i]]$object$resharers$totalItems 20 | df$url[i] = js$items[[i]]$object$url 21 | 22 | } 23 | 24 | # Export to .csv 25 | filename <- paste("gplus_data_", user_id, sep="") # in case we have more user_ids 26 | write.table(df, file = paste0(filename,".csv"), sep = ",", col.names = NA, 27 | qmethod = "double") 28 | 29 | df_graph = df[,c(1,5,6,7)] 30 | 31 | 32 | 33 | require(ggplot2) 34 | require(reshape2) 35 | 36 | melted=melt(df_graph,id.vars='no') 37 | 38 | ggplot(melted,aes(x=factor(no),y=value,color=factor(variable),group=factor(variable)))+ 39 | geom_line()+xlab('no')+guides(color=guide_legend("metrics"))+ 40 | labs(title="Google+") 41 | -------------------------------------------------------------------------------- /r_instagram.r: -------------------------------------------------------------------------------- 1 | #Analyze Instagram with R 2 | #Author: Julian Hillebrand 3 | 4 | #packages 5 | require(httr) 6 | require(rjson) 7 | require(RCurl) 8 | 9 | 10 | #Authentication 11 | 12 | ## getting callback URL 13 | full_url <- oauth_callback() 14 | full_url <- gsub("(.*localhost:[0-9]{1,5}/).*", x=full_url, replacement="\\1") 15 | message <- paste("Copy and paste into Site URL on Instagram App Settings:", 16 | full_url, "\nWhen done, press any key to continue...") 17 | 18 | invisible(readline(message)) 19 | 20 | app_name <- "ThinkToStartTest" 21 | client_id <- "CLIENT_ID" 22 | client_secret <- "CLIENT_SECRET" 23 | scope = "public_content" 24 | 25 | 26 | 27 | instagram <- oauth_endpoint( 28 | authorize = "https://api.instagram.com/oauth/authorize", 29 | access = "https://api.instagram.com/oauth/access_token") 30 | myapp <- oauth_app(app_name, client_id, client_secret) 31 | 32 | #scope <- NULL 33 | ig_oauth <- oauth2.0_token(instagram, myapp,scope=scope, type = "application/x-www-form-urlencoded",cache=FALSE) 34 | tmp <- strsplit(toString(names(ig_oauth$credentials)), '"') 35 | token <- tmp[[1]][30] 36 | 37 | ######################################################## 38 | 39 | 40 | user_info <- fromJSON(getURL(paste('https://api.instagram.com/v1/users/self/?access_token=',token,sep=""))) 41 | 42 | 43 | 44 | received_profile <- user_info$data$id 45 | 46 | 47 | 48 | 49 | 50 | #Get recent media (20 pictures) 51 | media <- fromJSON(getURL(paste('https://api.instagram.com/v1/users/self/media/recent/?access_token=',token,sep=""))) 52 | 53 | 54 | 55 | df =
data.frame(no = 1:length(media$data)) 56 | 57 | for(i in 1:length(media$data)) 58 | { 59 | #comments 60 | df$comments[i] <-media$data[[i]]$comments$count 61 | 62 | #likes: 63 | df$likes[i] <- media$data[[i]]$likes$count 64 | 65 | #date 66 | df$date[i] <- toString(as.POSIXct(as.numeric(media$data[[i]]$created_time), origin="1970-01-01")) 67 | } 68 | 69 | #Visualization 70 | 71 | require(rCharts) 72 | 73 | m1 <- mPlot(x = "date", y = c("likes", "comments"), type = "Line", data = df) 74 | 75 | -------------------------------------------------------------------------------- /r_mongoDB.r: -------------------------------------------------------------------------------- 1 | # install package to connect through monodb 2 | install.packages("rmongodb") 3 | library(rmongodb) 4 | # connect to MongoDB 5 | mongo = mongo.create(host = "localhost") 6 | mongo.is.connected(mongo) 7 | 8 | mongo.get.databases(mongo) 9 | 10 | mongo.get.database.collections(mongo, db = "tweetDB2") #”tweetDB” is where twitter data is stored 11 | 12 | library(plyr) 13 | ## create the empty data frame 14 | df1 = data.frame(stringsAsFactors = FALSE) 15 | 16 | ## create the namespace 17 | DBNS = "tweetDB2.#analytic" 18 | 19 | ## create the cursor we will iterate over, basically a select * in SQL 20 | cursor = mongo.find(mongo, DBNS) 21 | 22 | ## create the counter 23 | i = 1 24 | 25 | ## iterate over the cursor 26 | while (mongo.cursor.next(cursor)) { 27 | # iterate and grab the next record 28 | tmp = mongo.bson.to.list(mongo.cursor.value(cursor)) 29 | # make it a dataframe 30 | tmp.df = as.data.frame(t(unlist(tmp)), stringsAsFactors = F) 31 | # bind to the master dataframe 32 | df1 = rbind.fill(df1, tmp.df) 33 | } 34 | 35 | dim(df1) -------------------------------------------------------------------------------- /r_mongoDB_REST.r: -------------------------------------------------------------------------------- 1 | library(RCurl) 2 | library(rjson) 3 | 4 | database = "tweetDB" 5 | collection = "Apple" 6 | limit = "100" 7 | db <- paste("http://localhost:28017/",database,"/",collection,"/?limit=",limit,sep = "") 8 | 9 | tweets <- fromJSON(getURL(db)) 10 | 11 | tweet_df = data.frame(text=1:limit) 12 | for (i in 1:limit){ 13 | tweet_df$text[i] = tweets$rows[[i]]$tweet_text} 14 | tweet_df 15 | -------------------------------------------------------------------------------- /r_pinterest_machine_learning.R: -------------------------------------------------------------------------------- 1 | # Pinterest 2 | 3 | #Analyze Pinterest with R 4 | #Author: Julian Hillebrand 5 | 6 | #packages 7 | require(httr) 8 | require(rjson) 9 | require(RCurl) 10 | 11 | 12 | #Authentication 13 | 14 | ## getting callback URL 15 | full_url <- oauth_callback() 16 | full_url <- gsub("(.*localhost:[0-9]{1,5}/).*", x=full_url, replacement="\\1") 17 | #message <- paste("Copy and paste into Site URL on Instagram App Settings:", 18 | # full_url, "\nWhen done, press any key to continue...") 19 | 20 | invisible(readline(message)) 21 | 22 | app_name <- "R Test" 23 | client_id <- "XXX" 24 | client_secret <- "XXX" 25 | scope = "read_public" 26 | redirect_uri="https://mywebsite.com/connect/pinterest/" 27 | 28 | https://api.pinterest.com/oauth/? 
29 | response_type=code& 30 | redirect_uri=https://mywebsite.com/connect/pinterest/& 31 | client_id=12345& 32 | scope=read_public,write_public& 33 | state=768uyFys 34 | 35 | user_info <- fromJSON(getURL(paste("https://api.pinterest.com/oauth?response_type=code&client_id=",client_id,"&scope=",scope,sep="")),unexpected.escape ="keep") 36 | 37 | paste("https://api.pinterest.com/oauth?response_type=code&client_id=",client_id,"&scope=",scope,sep="") 38 | 39 | pinterest <- oauth_endpoint( 40 | authorize = "https://api.pinterest.com/oauth", 41 | access = "https://api.pinterest.com/v1/oauth/token") 42 | 43 | myapp <- oauth_app(app_name, client_id, client_secret) 44 | 45 | 46 | 47 | #scope <- NULL 48 | pi_oauth <- oauth2.0_token(pinterest, myapp,scope=scope,use_oob = TRUE, as_header = TRUE) 49 | 50 | 51 | fb_ep = oauth_endpoint(token_url, auth_url, access_url) 52 | pi_oauth <- oauth1.0_token(pinterest, myapp) 53 | 54 | 55 | 56 | 57 | tmp <- strsplit(toString(names(ig_oauth$credentials)), '"') 58 | token <- tmp[[1]][4] 59 | 60 | ######################################################## 61 | 62 | username <- "therock" 63 | 64 | #search for the username 65 | user_info <- fromJSON(getURL(paste('https://api.instagram.com/v1/users/search?q=',username,'&access_token=',token,sep="")),unexpected.escape = "keep") 66 | 67 | received_profile <- user_info$data[[1]] 68 | 69 | if(grepl(received_profile$username,username)) 70 | { 71 | user_id <- received_profile$id 72 | #Get recent media (20 pictures) 73 | media <- fromJSON(getURL(paste('https://api.instagram.com/v1/users/',user_id,'/media/recent/?access_token=',token,sep=""))) 74 | 75 | 76 | df = data.frame(no = 1:length(media$data)) 77 | 78 | for(i in 1:length(media$data)) 79 | { 80 | #comments 81 | df$comments[i] <-media$data[[i]]$comments$count 82 | 83 | #likes: 84 | df$likes[i] <- media$data[[i]]$likes$count 85 | 86 | #date 87 | df$date[i] <- toString(as.POSIXct(as.numeric(media$data[[i]]$created_time), origin="1970-01-01")) 88 | } 89 | 90 | #Visualization 91 | 92 | require(rCharts) 93 | 94 | m1 <- mPlot(x = "date", y = c("likes", "comments"), type = "Line", data = df) 95 | 96 | 97 | }else 98 | { 99 | print("Error: User not found!") 100 | } 101 | -------------------------------------------------------------------------------- /r_rfacebook.r: -------------------------------------------------------------------------------- 1 | install.packages("devtools") 2 | library(devtools) 3 | 4 | install_github("Rfacebook", "pablobarbera", subdir="Rfacebook") 5 | 6 | 7 | require("Rfacebook") 8 | 9 | 10 | 11 | 12 | 13 | ######Using the App Authentication: 14 | 15 | fb_oauth <- fbOAuth(app_id="123456789", app_secret="1A2B3C4D",extended_permissions = TRUE) 16 | 17 | 18 | #now we have our fb_oauth connection 19 | #we will just save them to be able to use them later 20 | save(fb_oauth, file="fb_oauth") 21 | 22 | #so if you want to connect to Facebook again you just have to call 23 | load("fb_oauth") 24 | 25 | me <- getUsers("me",token=fb_oauth) 26 | 27 | my_likes <- getLikes(user="me", token=fb_oauth) 28 | 29 | 30 | ######Using the Token Authentication: 31 | 32 | token <- 'YOUR AUTHENTICATION TOKEN' 33 | me <- getUsers("me", token, private_info=TRUE) 34 | 35 | getUsers(c("barackobama", "donaldtrump"), token) 36 | 37 | 38 | getFriends(token, simplify = FALSE) 39 | 40 | my_friends <- getFriends(token=fb_oauth, simplify=TRUE) 41 | 42 | head(my_friends, n=10) 43 | 44 | getUser() 45 | 46 | my_friends_info <- getUsers(my_friends$id, token=fb_oauth, private_info=TRUE) 47 | 48 | #create 
a table with the relationship statuses 49 | 50 | table(my_friends_info$relationship_status) 51 | -------------------------------------------------------------------------------- /r_twitter_cluster.r: -------------------------------------------------------------------------------- 1 | # run this if you don't have one or more of the next packages: 2 | # install.packages(c("devtools", "RCurl", "rjson", "bit64","httr","ROAuth")) 3 | # library(devtools) 4 | # install_github("twitteR", username="geoffjentry") 5 | # install_github('rCharts','ramnathv') 6 | 7 | 8 | 9 | library(RCurl) 10 | # Set SSL certs globally 11 | options(RCurlOptions = list(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl"))) 12 | 13 | library(twitteR) 14 | 15 | #Authentication 16 | #http://thinktostart.wordpress.com/2013/05/22/twitter-authentification-with-r/ 17 | 18 | 19 | library(rCharts) 20 | 21 | user <- getUser("ACCOUNT-TO-BE-ANALYZED") 22 | userFriends <- user$getFriends(n=5000) #put () if you want to get all friends and followers 23 | userFollowers <- user$getFollowers(n=5000) 24 | userNeighbors <- union(userFollowers, userFriends) 25 | userNeighbors.df = twListToDF(userNeighbors) 26 | 27 | userNeighbors.df[userNeighbors.df=="0"]<-1 28 | 29 | userNeighbors.df$logFollowersCount <-log(userNeighbors.df$followersCount) 30 | 31 | userNeighbors.df$logFriendsCount <-log(userNeighbors.df$friendsCount) 32 | 33 | kObject.log <- data.frame(userNeighbors.df$logFriendsCount,userNeighbors.df$logFollowersCount) 34 | 35 | ###elbow 36 | mydata <- kObject.log 37 | wss <- (nrow(mydata)-1)*sum(apply(mydata,2,var)) 38 | for (i in 2:15) wss[i] <- sum(kmeans(mydata, 39 | centers=i)$withinss) 40 | plot(1:15, wss, type="b", xlab="Number of Clusters", 41 | ylab="Within groups sum of squares") 42 | 43 | 44 | ###k-means 45 | 46 | ##Run the K Means algorithm, remember to specify centers from 'elbow plot' 47 | userMeans.log <- kmeans(kObject.log, centers=4, iter.max=10, nstart=100) 48 | 49 | ##Add the vector of specified clusters back to the original vector as a factor 50 | kObject.log$cluster=factor(userMeans.log$cluster) 51 | userNeighbors.df$cluster <- kObject.log$cluster 52 | 53 | 54 | p2 <- nPlot(logFollowersCount ~ logFriendsCount, group = 'cluster', data = userNeighbors.df, type = 'scatterChart') 55 | p2$xAxis(axisLabel = 'Followers Count') 56 | p2$yAxis(axisLabel = 'Friends Count') 57 | p2$chart(tooltipContent = "#! function(key, x, y, e){ 58 | return e.point.screenName + ' Followers: ' + e.point.followersCount +' Friends: ' + e.point.friendsCount 59 | } !#") 60 | p2 61 | -------------------------------------------------------------------------------- /sentiment_cloud.r: -------------------------------------------------------------------------------- 1 | library(twitteR) 2 | library(RCurl) 3 | library(RJSONIO) 4 | library(stringr) 5 | library(tm) 6 | library(wordcloud) 7 | 8 | 9 | #################################################################### 10 | 11 | getSentiment <- function (text, key){ 12 | 13 | text <- URLencode(text); 14 | 15 | #save all the spaces, then get rid of the weird characters that break the API, then convert back the URL-encoded spaces. 
16 | text <- str_replace_all(text, "%20", " "); 17 | text <- str_replace_all(text, "%\\d\\d", ""); 18 | text <- str_replace_all(text, " ", "%20"); 19 | 20 | 21 | if (str_length(text) > 360){ 22 | text <- substr(text, 0, 359); 23 | } 24 | ########################################## 25 | 26 | data <- getURL(paste("http://api.datumbox.com/1.0/TwitterSentimentAnalysis.json?api_key=", key, "&text=",text, sep="")) 27 | 28 | js <- fromJSON(data, asText=TRUE); 29 | 30 | # get mood probability 31 | sentiment = js$output$result 32 | 33 | ################################### 34 | 35 | 36 | return(list(sentiment=sentiment)) 37 | } 38 | 39 | clean.text <- function(some_txt) 40 | { 41 | some_txt = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", some_txt) 42 | some_txt = gsub("@\\w+", "", some_txt) 43 | some_txt = gsub("[[:punct:]]", "", some_txt) 44 | some_txt = gsub("[[:digit:]]", "", some_txt) 45 | some_txt = gsub("http\\w+", "", some_txt) 46 | some_txt = gsub("[ \t]{2,}", "", some_txt) 47 | some_txt = gsub("^\\s+|\\s+$", "", some_txt) 48 | some_txt = gsub("amp", "", some_txt) 49 | # define "tolower error handling" function 50 | try.tolower = function(x) 51 | { 52 | y = NA 53 | try_error = tryCatch(tolower(x), error=function(e) e) 54 | if (!inherits(try_error, "error")) 55 | y = tolower(x) 56 | return(y) 57 | } 58 | 59 | some_txt = sapply(some_txt, try.tolower) 60 | some_txt = some_txt[some_txt != ""] 61 | names(some_txt) = NULL 62 | return(some_txt) 63 | } 64 | 65 | 66 | 67 | ########################################################### 68 | 69 | 70 | 71 | print("Getting tweets...") 72 | # get some tweets 73 | tweets = searchTwitter(keyword, n, lang="en") 74 | # get text 75 | tweet_txt = sapply(tweets, function(x) x$getText()) 76 | 77 | # clean text 78 | tweet_clean = clean.text(tweet_txt) 79 | tweet_num = length(tweet_clean) 80 | # data frame (text, sentiment) 81 | tweet_df = data.frame(text=tweet_clean, sentiment=rep("", tweet_num),stringsAsFactors=FALSE) 82 | 83 | print("Getting sentiments...") 84 | # apply function getSentiment 85 | sentiment = rep(0, tweet_num) 86 | for (i in 1:tweet_num) 87 | { 88 | tmp = getSentiment(tweet_clean[i], db_key) 89 | 90 | tweet_df$sentiment[i] = tmp$sentiment 91 | 92 | print(paste(i," of ", tweet_num)) 93 | 94 | 95 | } 96 | 97 | # delete rows with no sentiment 98 | tweet_df <- tweet_df[tweet_df$sentiment!="",] 99 | 100 | 101 | #separate text by sentiment 102 | sents = levels(factor(tweet_df$sentiment)) 103 | #emos_label <- emos 104 | 105 | 106 | # get the labels and percents 107 | 108 | labels <- lapply(sents, function(x) paste(x,format(round((length((tweet_df[tweet_df$sentiment ==x,])$text)/length(tweet_df$sentiment)*100),2),nsmall=2),"%")) 109 | 110 | 111 | 112 | nemo = length(sents) 113 | emo.docs = rep("", nemo) 114 | for (i in 1:nemo) 115 | { 116 | tmp = tweet_df[tweet_df$sentiment == sents[i],]$text 117 | 118 | emo.docs[i] = paste(tmp,collapse=" ") 119 | } 120 | 121 | 122 | 123 | # remove stopwords 124 | emo.docs = removeWords(emo.docs, stopwords("german")) 125 | emo.docs = removeWords(emo.docs, stopwords("english")) 126 | corpus = Corpus(VectorSource(emo.docs)) 127 | tdm = TermDocumentMatrix(corpus) 128 | tdm = as.matrix(tdm) 129 | colnames(tdm) = labels 130 | 131 | 132 | 133 | 134 | # comparison word cloud 135 | comparison.cloud(tdm, colors = brewer.pal(nemo, "Dark2"), 136 | scale = c(3,.5), random.order = FALSE, title.size = 1.5) 137 | 138 | 139 | -------------------------------------------------------------------------------- /sentiment_datumbox.r: 
-------------------------------------------------------------------------------- 1 | # load packages 2 | library(twitteR) 3 | library(RCurl) 4 | library(RJSONIO) 5 | library(stringr) 6 | 7 | 8 | 9 | getSentiment <- function (text, key){ 10 | 11 | 12 | 13 | text <- URLencode(text); 14 | 15 | #save all the spaces, then get rid of the weird characters that break the API, then convert back the URL-encoded spaces. 16 | text <- str_replace_all(text, "%20", " "); 17 | text <- str_replace_all(text, "%\\d\\d", ""); 18 | text <- str_replace_all(text, " ", "%20"); 19 | 20 | 21 | if (str_length(text) > 360){ 22 | text <- substr(text, 0, 359); 23 | } 24 | ########################################## 25 | 26 | data <- getURL(paste("http://api.datumbox.com/1.0/TwitterSentimentAnalysis.json?api_key=", key, "&text=",text, sep="")) 27 | 28 | js <- fromJSON(data, asText=TRUE); 29 | 30 | # get sentiment result 31 | sentiment = js$output$result 32 | 33 | ################################### 34 | 35 | data <- getURL(paste("http://api.datumbox.com/1.0/SubjectivityAnalysis.json?api_key=", key, "&text=",text, sep="")) 36 | 37 | js <- fromJSON(data, asText=TRUE); 38 | 39 | # get subjectivity result 40 | subject = js$output$result 41 | 42 | ################################## 43 | 44 | data <- getURL(paste("http://api.datumbox.com/1.0/TopicClassification.json?api_key=", key, "&text=",text, sep="")) 45 | 46 | js <- fromJSON(data, asText=TRUE); 47 | 48 | # get topic classification result 49 | topic = js$output$result 50 | 51 | ################################## 52 | data <- getURL(paste("http://api.datumbox.com/1.0/GenderDetection.json?api_key=", key, "&text=",text, sep="")) 53 | 54 | js <- fromJSON(data, asText=TRUE); 55 | 56 | # get gender detection result 57 | gender = js$output$result 58 | 59 | return(list(sentiment=sentiment,subject=subject,topic=topic,gender=gender)) 60 | } 61 | 62 | 63 | 64 | ################# 65 | clean.text <- function(some_txt) 66 | { 67 | some_txt = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", some_txt) 68 | some_txt = gsub("@\\w+", "", some_txt) 69 | some_txt = gsub("[[:punct:]]", "", some_txt) 70 | some_txt = gsub("[[:digit:]]", "", some_txt) 71 | some_txt = gsub("http\\w+", "", some_txt) 72 | some_txt = gsub("[ \t]{2,}", "", some_txt) 73 | some_txt = gsub("^\\s+|\\s+$", "", some_txt) 74 | 75 | # define "tolower error handling" function 76 | try.tolower = function(x) 77 | { 78 | y = NA 79 | try_error = tryCatch(tolower(x), error=function(e) e) 80 | if (!inherits(try_error, "error")) 81 | y = tolower(x) 82 | return(y) 83 | } 84 | 85 | some_txt = sapply(some_txt, try.tolower) 86 | some_txt = some_txt[some_txt != ""] 87 | names(some_txt) = NULL 88 | return(some_txt) 89 | } 90 | 91 | 92 | # harvest tweets 93 | tweets = searchTwitter("iPhone", n=200, lang="en") 94 | 95 | 96 | # get text 97 | tweet_txt = sapply(tweets, function(x) x$getText()) 98 | 99 | 100 | # clean text 101 | tweet_clean = clean.text(tweet_txt) 102 | 103 | ##################################### 104 | # how many tweets 105 | tweet_num = length(tweet_clean) 106 | 107 | # data frame (text, sentiment, score) 108 | tweet_df = data.frame(text=tweet_clean, sentiment=rep("", tweet_num), 109 | subject=1:tweet_num, topic=1:tweet_num, gender=1:tweet_num, stringsAsFactors=FALSE) 110 | 111 | # apply function getSentiment 112 | sentiment = rep(0, tweet_num) 113 | for (i in 1:tweet_num) 114 | { 115 | tmp = getSentiment(tweet_clean[i], "API_KEY") 116 | 117 | tweet_df$sentiment[i] = tmp$sentiment 118 | 119 | tweet_df$subject[i] = tmp$subject 120 | tweet_df$topic[i] = tmp$topic 121 | tweet_df$gender[i] = tmp$gender 122 | } 123 | 124 | -------------------------------------------------------------------------------- /sentiment_viralheat.r: -------------------------------------------------------------------------------- 1 | library(twitteR) 2 | library(RCurl) 3 | library(RJSONIO) 4 | library(stringr) 5 | 6 | getSentiment <- function (text, key){ 7 | library(RCurl); 8 | library(RJSONIO); 9 | 10 | text <- URLencode(text); 11 | 12 | #save all the spaces, then get rid of the weird characters that break the API, then convert back the URL-encoded spaces. 13 | text <- str_replace_all(text, "%20", " "); 14 | text <- str_replace_all(text, "%\\d\\d", ""); 15 | text <- str_replace_all(text, " ", "%20"); 16 | 17 | if (str_length(text) > 360){ 18 | text <- substr(text, 0, 359); 19 | } 20 | 21 | data <- getURL(paste("https://www.viralheat.com/api/sentiment/review.json?api_key=", key, "&text=",text, sep="")) 22 | 23 | js <- fromJSON(data, asText=TRUE); 24 | 25 | # get mood probability 26 | score = js$prob 27 | 28 | # positive, negative or neutral? 29 | if (js$mood != "positive") 30 | { 31 | if (js$mood == "negative") { 32 | score = -1 * score 33 | } else { 34 | # neutral 35 | score = 0 36 | } 37 | } 38 | 39 | return(list(mood=js$mood, score=score)) 40 | } 41 | 42 | clean.text <- function(some_txt) 43 | { 44 | some_txt = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", some_txt) 45 | some_txt = gsub("@\\w+", "", some_txt) 46 | some_txt = gsub("[[:punct:]]", "", some_txt) 47 | some_txt = gsub("[[:digit:]]", "", some_txt) 48 | some_txt = gsub("http\\w+", "", some_txt) 49 | some_txt = gsub("[ \t]{2,}", "", some_txt) 50 | some_txt = gsub("^\\s+|\\s+$", "", some_txt) 51 | 52 | # define "tolower error handling" function 53 | try.tolower = function(x) 54 | { 55 | y = NA 56 | try_error = tryCatch(tolower(x), error=function(e) e) 57 | if (!inherits(try_error, "error")) 58 | y = tolower(x) 59 | return(y) 60 | } 61 | 62 | some_txt = sapply(some_txt, try.tolower) 63 | some_txt = some_txt[some_txt != ""] 64 | names(some_txt) = NULL 65 | return(some_txt) 66 | } 67 | 68 | # harvest tweets 69 | tweets = searchTwitter("iphone5", n=200, lang="en") 70 | 71 | tweet_txt = sapply(tweets, function(x) x$getText()) 72 | tweet_clean = clean.text(tweet_txt) 73 | mcnum = length(tweet_clean) 74 | tweet_df = data.frame(text=tweet_clean, sentiment=rep("", mcnum), score=1:mcnum, stringsAsFactors=FALSE) 75 | 76 | sentiment = rep(0, mcnum) 77 | for (i in 1:mcnum) 78 | { 79 | tmp = getSentiment(tweet_clean[i], "API-KEY") 80 | tweet_df$sentiment[i] = tmp$mood 81 | tweet_df$score[i] = tmp$score 82 | } 83 | 84 | tweet_df -------------------------------------------------------------------------------- /spam_class_r.r: -------------------------------------------------------------------------------- 1 | #Download Data Files: 2 | #data.csv: http://thinktostart.com/data/data.csv 3 | #names.csv: http://thinktostart.com/data/names.csv 4 | 5 | #Load the two files into R: 6 | dataset <- read.csv("data.csv",header=FALSE,sep=";") 7 | names <- read.csv("names.csv",header=FALSE,sep=";") 8 | 9 | #Set the names of the dataset dataframe: 10 | names(dataset) <- sapply((1:nrow(names)),function(i) toString(names[i,1])) 11 | 12 | #make column y a factor variable for binary classification (spam or non-spam) 13 | dataset$y <- as.factor(dataset$y) 14 | 15 | 16 | 
#get a sample of 1000 rows 17 | sample <- dataset[sample(nrow(dataset), 1000),] 18 | 19 | 20 | #Set up the packages: 21 | 22 | #install.packages(“caret”) 23 | 24 | require(caret) 25 | 26 | #install.packages(“kernlab”) 27 | 28 | require(kernlab) 29 | 30 | #install.packages(“doMC”) 31 | 32 | require(doMC) 33 | 34 | 35 | #Split the data in dataTrain and dataTest 36 | trainIndex <- createDataPartition(sample$y, p = .8, list = FALSE, times = 1) 37 | dataTrain <- sample[ trainIndex,] 38 | dataTest <- sample[-trainIndex,] 39 | 40 | #set up multicore environment 41 | registerDoMC(cores=5) 42 | 43 | 44 | #Create the SVM model: 45 | 46 | ### finding optimal value of a tuning parameter 47 | sigDist <- sigest(y ~ ., data = dataTrain, frac = 1) 48 | ### creating a grid of two tuning parameters, .sigma comes from the earlier line. we are trying to find best value of .C 49 | svmTuneGrid <- data.frame(.sigma = sigDist[1], .C = 2^(-2:7)) 50 | 51 | x <- train(y ~ ., 52 | data = dataTrain, 53 | method = "svmRadial", 54 | preProc = c("center", "scale"), 55 | tuneGrid = svmTuneGrid, 56 | trControl = trainControl(method = "repeatedcv", repeats = 5, 57 | classProbs = TRUE)) 58 | 59 | #Evaluate the model 60 | pred <- predict(x,dataTest[,1:57]) 61 | 62 | acc <- confusionMatrix(pred,dataTest$y) -------------------------------------------------------------------------------- /twitter_authentication.r: -------------------------------------------------------------------------------- 1 | # https://dev.twitter.com/ 2 | 3 | # Install the newest version of the twitteR package from GitHub 4 | install.packages(c("devtools", "rjson", "bit64", "httr")) 5 | 6 | #RESTART R session! 7 | 8 | library(devtools) 9 | install_github("twitteR", username="geoffjentry") 10 | library(twitteR) 11 | 12 | 13 | require(twitteR) 14 | 15 | 16 | #library(RCurl) 17 | # Set SSL certs globally 18 | #options(RCurlOptions = list(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl"))) 19 | 20 | 21 | reqURL <- "https://api.twitter.com/oauth/request_token" 22 | accessURL <- "https://api.twitter.com/oauth/access_token" 23 | authURL <- "https://api.twitter.com/oauth/authorize" 24 | apiKey <- "yourAPIkey" 25 | apiSecret <- "yourAPIsecret" 26 | 27 | 28 | 29 | setup_twitter_oauth(apiKey, apiSecret) 30 | 31 | 32 | --------------------------------------------------------------------------------
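
A minimal usage sketch (not part of the original repository): once twitter_authentication.r has run setup_twitter_oauth(), the cached credentials let the other scripts in this repo call the Twitter search API. "rstats" and n = 10 are placeholder values; twListToDF() is the same twitteR helper already used in r_twitter_cluster.r.

library(twitteR)
setup_twitter_oauth(apiKey, apiSecret)           # credentials from twitter_authentication.r
test_tweets <- searchTwitter("rstats", n = 10)   # any public search term works here
head(twListToDF(test_tweets))                    # flatten the result into a data frame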