├── .Rapp.history
├── README.md
├── Twitter_devices.r
├── r_facebook_gender.r
├── r_foursquare_map.r
├── r_googleplus.r
├── r_instagram.r
├── r_mongoDB.r
├── r_mongoDB_REST.r
├── r_pinterest_machine_learning.R
├── r_rfacebook.r
├── r_twitter_cluster.r
├── sentiment_cloud.r
├── sentiment_datumbox.r
├── sentiment_viralheat.r
├── spam_class_r.r
└── twitter_authentication.r
/.Rapp.history:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JulianHill/R-Tutorials/670374ca3c13fc9c656ccddee5a411fca10e01a9/.Rapp.history
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | R-Tutorials
2 | ===========
3 |
4 | Code from the R tutorials on my blog
5 |
--------------------------------------------------------------------------------
/Twitter_devices.r:
--------------------------------------------------------------------------------
1 | tweets = searchTwitter("Social Media", n=20, cainfo="cacert.pem") # requires library(twitteR) and a completed OAuth handshake (see twitter_authentication.r)
2 |
3 | devices <- sapply(tweets, function(x) x$getStatusSource())
4 |
5 | devices <- gsub("</a>", "", devices) # strip the closing anchor tag from the HTML source string
6 | devices <- strsplit(devices, ">")
7 |
8 | devices <- sapply(devices,function(x) ifelse(length(x) > 1, x[2], x[1]))
9 |
10 | pie(table(devices))
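11 |
12 | # Note: getStatusSource() returns an HTML snippet such as '<a href="...">Twitter for iPhone</a>';
13 | # stripping "</a>" and splitting on ">" leaves only the client name, which table() and pie() then summarize.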
--------------------------------------------------------------------------------
/r_facebook_gender.r:
--------------------------------------------------------------------------------
1 |
2 | require(Rfacebook)
3 |
4 | # Change these to fit your needs
5 | page_name <- "forbes"
6 | number_posts <- 2
7 | token <- "XXX"
8 |
9 | #Get the general page info
10 | page <- getPage(page_name, token, n = number_posts, feed = FALSE)
11 |
12 |
13 | #Extract the post ids
14 | posts <- page$id
15 |
16 |
17 | data_frame_gender <- data.frame(post=character(),male=numeric(),female=numeric(),etc=numeric(),likes=numeric(),type=character(),stringsAsFactors=FALSE)
18 | #process each post and analyze the gender distribution of the likes
19 | for(i in 1:length(posts))
20 | {
21 | temp <- posts[i]
22 |
23 | post <- getPost(temp,token)
24 |
25 | data_frame_gender[i,1] <- post$post$message
26 | data_frame_gender[i,5] <- post$post$likes
27 | data_frame_gender[i,6] <- post$post$type
28 |
29 | gender_frame <- data.frame(gender=character(),stringsAsFactors=FALSE)
30 |
31 | for(j in 1:length(post$likes$from_id))
32 | {
33 | likes <- post$likes$from_id
34 | user_id <- likes[j]
35 |
36 | user <- getUsers(user_id,token=token)
37 |
38 | gender <- user$gender
39 |
40 | gender_frame[nrow(gender_frame)+1,] <- gender
41 |
42 | }
43 |
44 | number_males <- nrow(subset(gender_frame, gender=="male"))
45 | number_females <- nrow(subset(gender_frame, gender=="female"))
46 | number_etc <- data_frame_gender[i,5] - (number_males+number_females)
47 |
48 | data_frame_gender[i,2] <- number_males
49 | data_frame_gender[i,3] <- number_females
50 | data_frame_gender[i,4] <- number_etc
51 |
52 | }
53 |
54 |
55 | slices <- c(sum(data_frame_gender$male),sum(data_frame_gender$female),sum(data_frame_gender$etc))
56 |
57 | pct <- round(slices/sum(slices)*100)
58 | lbls <- names(data_frame_gender[2:4])
59 | lbls <- paste(lbls, pct) # add percents to labels
60 | lbls <- paste(lbls,"%",sep="") # add % sign to labels
61 |
62 | pie(slices, labels = lbls, main="Gender Distribution of all analyzed posts")
63 |
64 |
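65 | # Sketch of a faster alternative (assumption: Rfacebook's getUsers accepts a vector of user ids,
66 | # as documented), fetching all likers of a post in one call instead of looping:
67 | # likers <- getUsers(post$likes$from_id, token = token)
68 | # table(likers$gender)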
--------------------------------------------------------------------------------
/r_foursquare_map.r:
--------------------------------------------------------------------------------
1 | library(rjson)
2 | library(RCurl)
3 | library(httr)
4 |
5 | #Authentication:
6 | require(devtools) # install if necessary: install.packages("devtools")
7 |
8 | dev_mode(on=T)
9 |
10 | install_github("JulianHill/ThinkToStartR") # current devtools syntax ("user/repo")
11 |
12 |
13 | require(ThinkToStartR)
14 | library(rjson)
15 | require(RCurl)
16 |
17 |
18 | token <- ThinkToStart("Foursquare_auth",app_name="R_Test",app_id="XXX",app_secret="XXX")
19 |
20 |
21 | ####Get the Data
22 |
23 | data <- fromJSON(getURL(paste('https://api.foursquare.com/v2/users/self/venuehistory?oauth_token=',token,'&v=',format(Sys.time(), "%Y%m%d"),sep="")))
24 |
25 | response <- data$response
26 | venues <- response$venues$items
27 |
28 |
29 | no_venues = length(data$response$venues$items)
30 |
31 | df = data.frame(no = 1:no_venues)
32 |
33 |
34 | for (i in 1:nrow(df)){
35 |
36 | #Add Name and the location of the Venue
37 | df$venue_name[i] <- venues[[i]]$venue$name
38 | df$venue_lat[i] <- venues[[i]]$venue$location$lat
39 | df$venue_lng[i] <- venues[[i]]$venue$location$lng
40 |
41 | ##########################
42 | #Add the address of the location
43 | if(length(venues[[i]]$venue$location$address)>0)
44 | {
45 | df$venue_address[i] <- venues[[i]]$venue$location$address
46 | }
47 | else{
48 | df$venue_address[i] <- "No Address Available"
49 |
50 | }
51 |
52 | ##########################
53 | #Add the city of the location
54 | if(length(venues[[i]]$venue$location$city)>0)
55 | {
56 | df$venue_city[i] <- venues[[i]]$venue$location$city
57 | }
58 | else{
59 | df$venue_city[i] <- "No City Available"
60 |
61 | }
62 |
63 | ##########################
64 | #Add the number of check-ins of the venue
65 | df$venue_checkinsCount[i] <- venues[[i]]$venue$stats[[1]]
66 |
67 | ##########################
68 | #Add the URL of the venue if defined
69 | if(length(venues[[i]]$venue$url)>0)
70 | {
71 | df$url[i] <- venues[[i]]$venue$url
72 | }
73 | else{
74 | df$url[i] <- NA
75 |
76 | }
77 |
78 | }
79 |
80 |
81 | mean_lat <- mean(df$venue_lat) # outcome: 50.90956
82 | mean_lon <- mean(df$venue_lng) # outcome: 7.576119
83 |
84 | require(rCharts)
85 |
86 | map <- Leaflet$new()
87 | map$setView(c(mean_lat, mean_lon), zoom = 5)
88 |
89 |
90 | for (i in 1:no_venues){
91 |
92 | #Get the name and the number of check-ins of the current venue
93 | name <- df$venue_name[i]
94 | checkins <- df$venue_checkinsCount[i]
95 |
96 | #Add the marker to the map but just add a website link if we have a URL for the venue
97 |
98 | #if no URL is available, add a plain popup; otherwise include a link to the venue website
99 | if(is.na(df$url[i]))
100 | { map$marker(c(df$venue_lat[i], df$venue_lng[i]), bindPopup = paste(name, '<br>Checkins: ', checkins, sep=""))
101 | }
102 | else
103 | {
104 | map$marker(c(df$venue_lat[i], df$venue_lng[i]), bindPopup = paste(name, '<br>Checkins: ', checkins, '<br><a href="', df$url[i], '">Website</a>', sep=""))
105 | }
106 |
107 | }
108 |
109 | map
110 |
111 |
112 |
113 |
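114 | # To keep the map as a standalone HTML file (assumption: this rCharts version exposes a save() method):
115 | # map$save("foursquare_map.html", cdn = TRUE)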
--------------------------------------------------------------------------------
/r_googleplus.r:
--------------------------------------------------------------------------------
1 | library(RCurl);
2 | library(RJSONIO);
3 | api_key<-"XXX"
4 |
5 | user_id <- "105616015219357887822"
6 | # Still need to add ssl.verifypeer = FALSE to get a connection :(
7 | # Add a max results parameter in the URL structure to get 100 results (maximum allowed by the API : https://developers.google.com/+/api/)
8 | data <- getURL(paste("https://www.googleapis.com/plus/v1/people/",user_id,"/activities/public?maxResults=100&key=", api_key, sep=""),ssl.verifypeer = FALSE)
9 | js <- fromJSON(data, asText=TRUE);
10 |
11 | df = data.frame(no = 1:length(js$items))
12 |
13 | for (i in 1:nrow(df)){
14 | df$kind[i] = js$items[[i]]$verb
15 | df$title[i] = js$items[[i]]$title
16 | df$published[i] = js$items[[i]]$published # add publish date to the df
17 | df$replies[i] = js$items[[i]]$object$replies$totalItems
18 | df$plusones[i] = js$items[[i]]$object$plusoners$totalItems
19 | df$reshares[i] = js$items[[i]]$object$resharers$totalItems
20 | df$url[i] = js$items[[i]]$object$url
21 |
22 | }
23 |
24 | # Export to .csv
25 | filename <- paste("gplus_data_", user_id, sep="") # in case we have more user_ids
26 | write.table(df, file = paste0(filename,".csv"), sep = ",", col.names = NA,
27 | qmethod = "double")
28 |
29 | df_graph = df[,c(1,5,6,7)]
30 |
31 |
32 |
33 | require(ggplot2)
34 | require(reshape2)
35 |
36 | melted=melt(df_graph,id.vars='no')
37 |
38 | ggplot(melted,aes(x=factor(no),y=value,color=factor(variable),group=factor(variable)))+
39 | geom_line()+xlab('no')+guides(color=guide_legend("metrics"))+
40 | labs(title="Google+")
41 |
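42 | # Save the chart to disk (ggsave writes the most recently displayed ggplot; the filename is just an example)
43 | ggsave("gplus_metrics.png", width = 10, height = 6)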
--------------------------------------------------------------------------------
/r_instagram.r:
--------------------------------------------------------------------------------
1 | #Analyze Instagram with R
2 | #Author: Julian Hillebrand
3 |
4 | #packages
5 | require(httr)
6 | require(rjson)
7 | require(RCurl)
8 |
9 |
10 | #Authentication
11 |
12 | ## getting callback URL
13 | full_url <- oauth_callback()
14 | full_url <- gsub("(.*localhost:[0-9]{1,5}/).*", x=full_url, replacement="\\1")
15 | message <- paste("Copy and paste into Site URL on Instagram App Settings:",
16 |                  full_url, "\nWhen done, press any key to continue...")
17 |
18 | invisible(readline(message))
19 |
20 | app_name <- "ThinkToStartTest"
21 | client_id <- "CLIENT_ID"
22 | client_secret <- "CLIENT_SECRET"
23 | scope = "public_content"
24 |
25 |
26 |
27 | instagram <- oauth_endpoint(
28 | authorize = "https://api.instagram.com/oauth/authorize",
29 | access = "https://api.instagram.com/oauth/access_token")
30 | myapp <- oauth_app(app_name, client_id, client_secret)
31 |
32 | #scope <- NULL
33 | ig_oauth <- oauth2.0_token(instagram, myapp,scope=scope, type = "application/x-www-form-urlencoded",cache=FALSE)
34 | tmp <- strsplit(toString(names(ig_oauth$credentials)), '"')
35 | token <- tmp[[1]][30]
36 |
37 | ########################################################
38 |
39 |
40 | user_info <- fromJSON(getURL(paste('https://api.instagram.com/v1/users/self/?access_token=',token,sep="")))
41 |
42 |
43 |
44 | received_profile <- user_info$data$id
45 |
46 |
47 |
48 |
49 |
50 | #Get recent media (20 pictures)
51 | media <- fromJSON(getURL(paste('https://api.instagram.com/v1/users/self/media/recent/?access_token=',token,sep="")))
52 |
53 |
54 |
55 | df = data.frame(no = 1:length(media$data))
56 |
57 | for(i in 1:length(media$data))
58 | {
59 | #comments
60 | df$comments[i] <-media$data[[i]]$comments$count
61 |
62 | #likes:
63 | df$likes[i] <- media$data[[i]]$likes$count
64 |
65 | #date
66 | df$date[i] <- toString(as.POSIXct(as.numeric(media$data[[i]]$created_time), origin="1970-01-01"))
67 | }
68 |
69 | #Visualization
70 |
71 | require(rCharts)
72 |
73 | m1 <- mPlot(x = "date", y = c("likes", "comments"), type = "Line", data = df)
74 |
75 |
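76 | m1  # print the rCharts object to render the chart (RStudio viewer or browser)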
--------------------------------------------------------------------------------
/r_mongoDB.r:
--------------------------------------------------------------------------------
1 | # install the package used to connect to MongoDB
2 | install.packages("rmongodb")
3 | library(rmongodb)
4 | # connect to MongoDB
5 | mongo = mongo.create(host = "localhost")
6 | mongo.is.connected(mongo)
7 |
8 | mongo.get.databases(mongo)
9 |
10 | mongo.get.database.collections(mongo, db = "tweetDB2") # "tweetDB2" is the database where the Twitter data is stored
11 |
12 | library(plyr)
13 | ## create the empty data frame
14 | df1 = data.frame(stringsAsFactors = FALSE)
15 |
16 | ## create the namespace
17 | DBNS = "tweetDB2.#analytic"
18 |
19 | ## create the cursor we will iterate over, basically a select * in SQL
20 | cursor = mongo.find(mongo, DBNS)
21 |
22 | ## create the counter
23 | i = 1
24 |
25 | ## iterate over the cursor
26 | while (mongo.cursor.next(cursor)) {
27 | # iterate and grab the next record
28 | tmp = mongo.bson.to.list(mongo.cursor.value(cursor))
29 | # make it a dataframe
30 | tmp.df = as.data.frame(t(unlist(tmp)), stringsAsFactors = F)
31 | # bind to the master dataframe
32 | df1 = rbind.fill(df1, tmp.df)
33 | }
34 |
35 | dim(df1)
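36 |
37 | # release the cursor and the connection when finished (rmongodb cleanup functions)
38 | mongo.cursor.destroy(cursor)
39 | mongo.destroy(mongo)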
--------------------------------------------------------------------------------
/r_mongoDB_REST.r:
--------------------------------------------------------------------------------
1 | library(RCurl)
2 | library(rjson)
3 |
4 | database = "tweetDB"
5 | collection = "Apple"
6 | limit = 100 # numeric; paste() coerces it to character when building the URL
7 | db <- paste("http://localhost:28017/",database,"/",collection,"/?limit=",limit,sep = "")
8 | # port 28017 is MongoDB's legacy HTTP/REST interface; enable it with the --rest option if the request fails
9 | tweets <- fromJSON(getURL(db))
10 |
11 | tweet_df = data.frame(text=1:limit)
12 | for (i in 1:limit){
13 | tweet_df$text[i] = tweets$rows[[i]]$tweet_text}
14 | tweet_df
15 |
--------------------------------------------------------------------------------
/r_pinterest_machine_learning.R:
--------------------------------------------------------------------------------
1 | # Pinterest
2 |
3 | #Analyze Pinterest with R
4 | #Author: Julian Hillebrand
5 |
6 | #packages
7 | require(httr)
8 | require(rjson)
9 | require(RCurl)
10 |
11 |
12 | #Authentication
13 |
14 | ## getting callback URL
15 | full_url <- oauth_callback()
16 | full_url <- gsub("(.*localhost:[0-9]{1,5}/).*", x=full_url, replacement="\\1")
17 | message <- paste("Copy and paste into Site URL on Pinterest App Settings:",
18 |                  full_url, "\nWhen done, press any key to continue...")
19 |
20 | invisible(readline(message))
21 |
22 | app_name <- "R Test"
23 | client_id <- "XXX"
24 | client_secret <- "XXX"
25 | scope = "read_public"
26 | redirect_uri="https://mywebsite.com/connect/pinterest/"
27 |
28 | # OAuth authorization URL structure (reference only, not valid R): https://api.pinterest.com/oauth/?
29 | #   response_type=code&
30 | #   redirect_uri=https://mywebsite.com/connect/pinterest/&
31 | #   client_id=12345&
32 | #   scope=read_public,write_public&
33 | #   state=768uyFys
34 |
35 | # user_info <- fromJSON(getURL(paste("https://api.pinterest.com/oauth?response_type=code&client_id=",client_id,"&scope=",scope,sep="")),unexpected.escape ="keep")  # leftover: the oauth endpoint returns an HTML page, not JSON
36 |
37 | paste("https://api.pinterest.com/oauth?response_type=code&client_id=",client_id,"&scope=",scope,sep="")  # the authorization URL to open in a browser
38 |
39 | pinterest <- oauth_endpoint(
40 | authorize = "https://api.pinterest.com/oauth",
41 | access = "https://api.pinterest.com/v1/oauth/token")
42 |
43 | myapp <- oauth_app(app_name, client_id, client_secret)
44 |
45 |
46 |
47 | #scope <- NULL
48 | pi_oauth <- oauth2.0_token(pinterest, myapp,scope=scope,use_oob = TRUE, as_header = TRUE)
49 |
50 |
51 | # fb_ep = oauth_endpoint(token_url, auth_url, access_url)   # leftover: token_url/auth_url/access_url are never defined
52 | # pi_oauth <- oauth1.0_token(pinterest, myapp)               # leftover: Pinterest uses OAuth 2.0 (pi_oauth is created above)
53 |
54 |
55 |
56 |
57 | tmp <- strsplit(toString(names(pi_oauth$credentials)), '"')
58 | token <- tmp[[1]][4]
59 |
60 | ########################################################
61 | # Note: the remainder of this script reuses the Instagram recent-media example; it does not call the Pinterest API.
62 | username <- "therock"
63 |
64 | #search for the username
65 | user_info <- fromJSON(getURL(paste('https://api.instagram.com/v1/users/search?q=',username,'&access_token=',token,sep="")),unexpected.escape = "keep")
66 |
67 | received_profile <- user_info$data[[1]]
68 |
69 | if(grepl(received_profile$username,username))
70 | {
71 | user_id <- received_profile$id
72 | #Get recent media (20 pictures)
73 | media <- fromJSON(getURL(paste('https://api.instagram.com/v1/users/',user_id,'/media/recent/?access_token=',token,sep="")))
74 |
75 |
76 | df = data.frame(no = 1:length(media$data))
77 |
78 | for(i in 1:length(media$data))
79 | {
80 | #comments
81 | df$comments[i] <-media$data[[i]]$comments$count
82 |
83 | #likes:
84 | df$likes[i] <- media$data[[i]]$likes$count
85 |
86 | #date
87 | df$date[i] <- toString(as.POSIXct(as.numeric(media$data[[i]]$created_time), origin="1970-01-01"))
88 | }
89 |
90 | #Visualization
91 |
92 | require(rCharts)
93 |
94 | m1 <- mPlot(x = "date", y = c("likes", "comments"), type = "Line", data = df)
95 |
96 |
97 | }else
98 | {
99 | print("Error: User not found!")
100 | }
101 |
--------------------------------------------------------------------------------
/r_rfacebook.r:
--------------------------------------------------------------------------------
1 | install.packages("devtools")
2 | library(devtools)
3 |
4 | install_github("pablobarbera/Rfacebook", subdir="Rfacebook") # current devtools syntax ("user/repo")
5 |
6 |
7 | require("Rfacebook")
8 |
9 |
10 |
11 |
12 |
13 | ######Using the App Authentication:
14 |
15 | fb_oauth <- fbOAuth(app_id="123456789", app_secret="1A2B3C4D",extended_permissions = TRUE)
16 |
17 |
18 | #now we have our fb_oauth connection
19 | #we will just save them to be able to use them later
20 | save(fb_oauth, file="fb_oauth")
21 |
22 | #so if you want to connect to Facebook again you just have to call
23 | load("fb_oauth")
24 |
25 | me <- getUsers("me",token=fb_oauth)
26 |
27 | my_likes <- getLikes(user="me", token=fb_oauth)
28 |
29 |
30 | ######Using the Token Authentication:
31 |
32 | token <- 'YOUR AUTHENTICATION TOKEN'
33 | me <- getUsers("me", token, private_info=TRUE)
34 |
35 | getUsers(c("barackobama", "donaldtrump"), token)
36 |
37 |
38 | getFriends(token, simplify = FALSE)
39 |
40 | my_friends <- getFriends(token=fb_oauth, simplify=TRUE)
41 |
42 | head(my_friends, n=10)
43 |
44 | # getUser()  # leftover call; the Rfacebook function is getUsers(), used below
45 |
46 | my_friends_info <- getUsers(my_friends$id, token=fb_oauth, private_info=TRUE)
47 |
48 | #create a table with the relationship statuses
49 |
50 | table(my_friends_info$relationship_status)
51 |
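52 | # quick visual of the relationship-status counts above (base R)
53 | barplot(table(my_friends_info$relationship_status), las = 2)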
--------------------------------------------------------------------------------
/r_twitter_cluster.r:
--------------------------------------------------------------------------------
1 | # run this if you don't have one or more of the next packages:
2 | # install.packages(c("devtools", "RCurl", "rjson", "bit64","httr","ROAuth"))
3 | # library(devtools)
4 | # install_github("geoffjentry/twitteR")
5 | # install_github("ramnathv/rCharts")
6 |
7 |
8 |
9 | library(RCurl)
10 | # Set SSL certs globally
11 | options(RCurlOptions = list(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl")))
12 |
13 | library(twitteR)
14 |
15 | #Authentication
16 | #http://thinktostart.wordpress.com/2013/05/22/twitter-authentification-with-r/
17 |
18 |
19 | library(rCharts)
20 |
21 | user <- getUser("ACCOUNT-TO-BE-ANALYZED")
22 | userFriends <- user$getFriends(n=5000) # call getFriends()/getFollowers() without n to fetch all friends and followers
23 | userFollowers <- user$getFollowers(n=5000)
24 | userNeighbors <- union(userFollowers, userFriends)
25 | userNeighbors.df = twListToDF(userNeighbors)
26 |
27 | userNeighbors.df[userNeighbors.df=="0"]<-1 # replace zero counts with 1 so the log transform below is defined
28 |
29 | userNeighbors.df$logFollowersCount <-log(userNeighbors.df$followersCount)
30 |
31 | userNeighbors.df$logFriendsCount <-log(userNeighbors.df$friendsCount)
32 |
33 | kObject.log <- data.frame(userNeighbors.df$logFriendsCount,userNeighbors.df$logFollowersCount)
34 |
35 | ###elbow
36 | mydata <- kObject.log
37 | wss <- (nrow(mydata)-1)*sum(apply(mydata,2,var))
38 | for (i in 2:15) wss[i] <- sum(kmeans(mydata,
39 | centers=i)$withinss)
40 | plot(1:15, wss, type="b", xlab="Number of Clusters",
41 | ylab="Within groups sum of squares")
42 |
43 |
44 | ###k-means
45 |
46 | ##Run the K Means algorithm, remember to specify centers from 'elbow plot'
47 | userMeans.log <- kmeans(kObject.log, centers=4, iter.max=10, nstart=100)
48 |
49 | ##Add the vector of specified clusters back to the original vector as a factor
50 | kObject.log$cluster=factor(userMeans.log$cluster)
51 | userNeighbors.df$cluster <- kObject.log$cluster
52 |
53 |
54 | p2 <- nPlot(logFollowersCount ~ logFriendsCount, group = 'cluster', data = userNeighbors.df, type = 'scatterChart')
55 | p2$xAxis(axisLabel = 'Friends Count (log)')
56 | p2$yAxis(axisLabel = 'Followers Count (log)')
57 | p2$chart(tooltipContent = "#! function(key, x, y, e){
58 | return e.point.screenName + ' Followers: ' + e.point.followersCount +' Friends: ' + e.point.friendsCount
59 | } !#")
60 | p2
61 |
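62 | # inspect the result: how many accounts ended up in each cluster
63 | table(userNeighbors.df$cluster)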
--------------------------------------------------------------------------------
/sentiment_cloud.r:
--------------------------------------------------------------------------------
1 | library(twitteR)
2 | library(RCurl)
3 | library(RJSONIO)
4 | library(stringr)
5 | library(tm)
6 | library(wordcloud)
7 |
8 |
9 | ####################################################################
10 |
11 | getSentiment <- function (text, key){
12 |
13 | text <- URLencode(text);
14 |
15 | #save all the spaces, then get rid of the weird characters that break the API, then convert back the URL-encoded spaces.
16 | text <- str_replace_all(text, "%20", " ");
17 | text <- str_replace_all(text, "%\\d\\d", "");
18 | text <- str_replace_all(text, " ", "%20");
19 |
20 |
21 | if (str_length(text) > 360){
22 | text <- substr(text, 0, 359);
23 | }
24 | ##########################################
25 |
26 | data <- getURL(paste("http://api.datumbox.com/1.0/TwitterSentimentAnalysis.json?api_key=", key, "&text=",text, sep=""))
27 |
28 | js <- fromJSON(data, asText=TRUE);
29 |
30 | # get the sentiment result from the API response
31 | sentiment = js$output$result
32 |
33 | ###################################
34 |
35 |
36 | return(list(sentiment=sentiment))
37 | }
38 |
39 | clean.text <- function(some_txt)
40 | {
41 | some_txt = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", some_txt)
42 | some_txt = gsub("@\\w+", "", some_txt)
43 | some_txt = gsub("[[:punct:]]", "", some_txt)
44 | some_txt = gsub("[[:digit:]]", "", some_txt)
45 | some_txt = gsub("http\\w+", "", some_txt)
46 | some_txt = gsub("[ \t]{2,}", "", some_txt)
47 | some_txt = gsub("^\\s+|\\s+$", "", some_txt)
48 | some_txt = gsub("amp", "", some_txt)
49 | # define "tolower error handling" function
50 | try.tolower = function(x)
51 | {
52 | y = NA
53 | try_error = tryCatch(tolower(x), error=function(e) e)
54 | if (!inherits(try_error, "error"))
55 | y = tolower(x)
56 | return(y)
57 | }
58 |
59 | some_txt = sapply(some_txt, try.tolower)
60 | some_txt = some_txt[some_txt != ""]
61 | names(some_txt) = NULL
62 | return(some_txt)
63 | }
64 |
65 |
66 |
67 | ###########################################################
68 | # parameters: set these before running (example values / placeholder key)
69 | keyword <- "iPhone"; n <- 200    # search term and number of tweets
70 | db_key <- "API_KEY"              # your Datumbox API key
71 | print("Getting tweets...")
72 | # get some tweets
73 | tweets = searchTwitter(keyword, n, lang="en")
74 | # get text
75 | tweet_txt = sapply(tweets, function(x) x$getText())
76 |
77 | # clean text
78 | tweet_clean = clean.text(tweet_txt)
79 | tweet_num = length(tweet_clean)
80 | # data frame (text, sentiment)
81 | tweet_df = data.frame(text=tweet_clean, sentiment=rep("", tweet_num),stringsAsFactors=FALSE)
82 |
83 | print("Getting sentiments...")
84 | # apply function getSentiment
85 | sentiment = rep(0, tweet_num)
86 | for (i in 1:tweet_num)
87 | {
88 | tmp = getSentiment(tweet_clean[i], db_key)
89 |
90 | tweet_df$sentiment[i] = tmp$sentiment
91 |
92 | print(paste(i," of ", tweet_num))
93 |
94 |
95 | }
96 |
97 | # delete rows with no sentiment
98 | tweet_df <- tweet_df[tweet_df$sentiment!="",]
99 |
100 |
101 | #separate text by sentiment
102 | sents = levels(factor(tweet_df$sentiment))
103 | #emos_label <- emos
104 |
105 |
106 | # get the labels and percents
107 |
108 | labels <- lapply(sents, function(x) paste(x,format(round((length((tweet_df[tweet_df$sentiment ==x,])$text)/length(tweet_df$sentiment)*100),2),nsmall=2),"%"))
109 |
110 |
111 |
112 | nemo = length(sents)
113 | emo.docs = rep("", nemo)
114 | for (i in 1:nemo)
115 | {
116 | tmp = tweet_df[tweet_df$sentiment == sents[i],]$text
117 |
118 | emo.docs[i] = paste(tmp,collapse=" ")
119 | }
120 |
121 |
122 |
123 | # remove stopwords
124 | emo.docs = removeWords(emo.docs, stopwords("german"))
125 | emo.docs = removeWords(emo.docs, stopwords("english"))
126 | corpus = Corpus(VectorSource(emo.docs))
127 | tdm = TermDocumentMatrix(corpus)
128 | tdm = as.matrix(tdm)
129 | colnames(tdm) = labels
130 |
131 |
132 |
133 |
134 | # comparison word cloud
135 | comparison.cloud(tdm, colors = brewer.pal(nemo, "Dark2"),
136 | scale = c(3,.5), random.order = FALSE, title.size = 1.5)
137 |
138 |
139 |
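140 | # To write the cloud to a file instead of the plotting window (base R graphics devices):
141 | # png("sentiment_cloud.png", width = 800, height = 800)
142 | # comparison.cloud(tdm, colors = brewer.pal(nemo, "Dark2"), scale = c(3, .5), random.order = FALSE, title.size = 1.5)
143 | # dev.off()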
--------------------------------------------------------------------------------
/sentiment_datumbox.r:
--------------------------------------------------------------------------------
1 | # load packages
2 | library(twitteR)
3 | library(RCurl)
4 | library(RJSONIO)
5 | library(stringr)
6 |
7 |
8 |
9 | getSentiment <- function (text, key){
10 |
11 |
12 |
13 | text <- URLencode(text);
14 |
15 | #save all the spaces, then get rid of the weird characters that break the API, then convert back the URL-encoded spaces.
16 | text <- str_replace_all(text, "%20", " ");
17 | text <- str_replace_all(text, "%\\d\\d", "");
18 | text <- str_replace_all(text, " ", "%20");
19 |
20 |
21 | if (str_length(text) > 360){
22 | text <- substr(text, 0, 359);
23 | }
24 | ##########################################
25 |
26 | data <- getURL(paste("http://api.datumbox.com/1.0/TwitterSentimentAnalysis.json?api_key=", key, "&text=",text, sep=""))
27 |
28 | js <- fromJSON(data, asText=TRUE);
29 |
30 | # get the sentiment result
31 | sentiment = js$output$result
32 |
33 | ###################################
34 |
35 | data <- getURL(paste("http://api.datumbox.com/1.0/SubjectivityAnalysis.json?api_key=", key, "&text=",text, sep=""))
36 |
37 | js <- fromJSON(data, asText=TRUE);
38 |
39 | # get the subjectivity result
40 | subject = js$output$result
41 |
42 | ##################################
43 |
44 | data <- getURL(paste("http://api.datumbox.com/1.0/TopicClassification.json?api_key=", key, "&text=",text, sep=""))
45 |
46 | js <- fromJSON(data, asText=TRUE);
47 |
48 | # get the topic classification result
49 | topic = js$output$result
50 |
51 | ##################################
52 | data <- getURL(paste("http://api.datumbox.com/1.0/GenderDetection.json?api_key=", key, "&text=",text, sep=""))
53 |
54 | js <- fromJSON(data, asText=TRUE);
55 |
56 | # get the gender detection result
57 | gender = js$output$result
58 |
59 | return(list(sentiment=sentiment,subject=subject,topic=topic,gender=gender))
60 | }
61 |
62 |
63 |
64 | #################
65 | clean.text <- function(some_txt)
66 | {
67 | some_txt = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", some_txt)
68 | some_txt = gsub("@\\w+", "", some_txt)
69 | some_txt = gsub("[[:punct:]]", "", some_txt)
70 | some_txt = gsub("[[:digit:]]", "", some_txt)
71 | some_txt = gsub("http\\w+", "", some_txt)
72 | some_txt = gsub("[ \t]{2,}", "", some_txt)
73 | some_txt = gsub("^\\s+|\\s+$", "", some_txt)
74 |
75 | # define "tolower error handling" function
76 | try.tolower = function(x)
77 | {
78 | y = NA
79 | try_error = tryCatch(tolower(x), error=function(e) e)
80 | if (!inherits(try_error, "error"))
81 | y = tolower(x)
82 | return(y)
83 | }
84 |
85 | some_txt = sapply(some_txt, try.tolower)
86 | some_txt = some_txt[some_txt != ""]
87 | names(some_txt) = NULL
88 | return(some_txt)
89 | }
90 |
91 |
92 | # harvest tweets
93 | tweets = searchTwitter("iPhone", n=200, lang="en")
94 |
95 |
96 | # get text
97 | tweet_txt = sapply(tweets, function(x) x$getText())
98 |
99 |
100 | # clean text
101 | tweet_clean = clean.text(tweet_txt)
102 |
103 | #####################################
104 | # how many tweets
105 | tweet_num = length(tweet_clean)
106 |
107 | # data frame (text, sentiment, score)
108 | tweet_df = data.frame(text=tweet_clean, sentiment=rep("", tweet_num),
109 | subject=1:tweet_num, topic=1:tweet_num, gender=1:tweet_num, stringsAsFactors=FALSE)
110 |
111 | # apply function getSentiment
112 | sentiment = rep(0, tweet_num)
113 | for (i in 1:tweet_num)
114 | {
115 | tmp = getSentiment(tweet_clean[i], "API_KEY")
116 |
117 | tweet_df$sentiment[i] = tmp$sentiment
118 |
119 | tweet_df$subject[i] = tmp$subject
120 | tweet_df$topic[i] = tmp$topic
121 | tweet_df$gender[i] = tmp$gender
122 | }
123 |
124 |
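125 | # quick overview of the classified tweets
126 | table(tweet_df$sentiment)
127 | table(tweet_df$topic)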
--------------------------------------------------------------------------------
/sentiment_viralheat.r:
--------------------------------------------------------------------------------
1 | library(twitteR)
2 | library(RCurl)
3 | library(RJSONIO)
4 | library(stringr)
5 |
6 | getSentiment <- function (text, key){
7 | library(RCurl);
8 | library(RJSONIO);
9 |
10 | text <- URLencode(text);
11 |
12 | #save all the spaces, then get rid of the weird characters that break the API, then convert back the URL-encoded spaces.
13 | text <- str_replace_all(text, "%20", " ");
14 | text <- str_replace_all(text, "%\\d\\d", "");
15 | text <- str_replace_all(text, " ", "%20");
16 |
17 | if (str_length(text) > 360){
18 | text <- substr(text, 0, 359);
19 | }
20 |
21 | data <- getURL(paste("https://www.viralheat.com/api/sentiment/review.json?api_key=", key, "&text=",text, sep=""))
22 |
23 | js <- fromJSON(data, asText=TRUE);
24 |
25 | # get mood probability
26 | score = js$prob
27 |
28 | # positive, negative or neutral?
29 | if (js$mood != "positive")
30 | {
31 | if (js$mood == "negative") {
32 | score = -1 * score
33 | } else {
34 | # neutral
35 | score = 0
36 | }
37 | }
38 |
39 | return(list(mood=js$mood, score=score))
40 | }
41 |
42 | clean.text <- function(some_txt)
43 | {
44 | some_txt = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", some_txt)
45 | some_txt = gsub("@\\w+", "", some_txt)
46 | some_txt = gsub("[[:punct:]]", "", some_txt)
47 | some_txt = gsub("[[:digit:]]", "", some_txt)
48 | some_txt = gsub("http\\w+", "", some_txt)
49 | some_txt = gsub("[ \t]{2,}", "", some_txt)
50 | some_txt = gsub("^\\s+|\\s+$", "", some_txt)
51 |
52 | # define "tolower error handling" function
53 | try.tolower = function(x)
54 | {
55 | y = NA
56 | try_error = tryCatch(tolower(x), error=function(e) e)
57 | if (!inherits(try_error, "error"))
58 | y = tolower(x)
59 | return(y)
60 | }
61 |
62 | some_txt = sapply(some_txt, try.tolower)
63 | some_txt = some_txt[some_txt != ""]
64 | names(some_txt) = NULL
65 | return(some_txt)
66 | }
67 |
68 | # harvest tweets
69 | tweets = searchTwitter("iphone5", n=200, lang="en")
70 |
71 | tweet_txt = sapply(tweets, function(x) x$getText())
72 | tweet_clean = clean.text(tweet_txt)
73 | mcnum = length(tweet_clean)
74 | tweet_df = data.frame(text=tweet_clean, sentiment=rep("", mcnum), score=1:mcnum, stringsAsFactors=FALSE)
75 |
76 | sentiment = rep(0, mcnum)
77 | for (i in 1:mcnum)
78 | {
79 | tmp = getSentiment(tweet_clean[i], "API-KEY")
80 | tweet_df$sentiment[i] = tmp$mood
81 | tweet_df$score[i] = tmp$score
82 | }
83 |
84 | tweet_df
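85 |
86 | # average sentiment score across the sample (positive > 0, negative < 0)
87 | mean(tweet_df$score)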
--------------------------------------------------------------------------------
/spam_class_r.r:
--------------------------------------------------------------------------------
1 | #Download Data Files:
2 | #data.csv: http://thinktostart.com/data/data.csv
3 | #names.csv: http://thinktostart.com/data/names.csv
4 |
5 | #Load the two files into R:
6 | dataset <- read.csv("data.csv",header=FALSE,sep=";")
7 | names <- read.csv("names.csv",header=FALSE,sep=";")
8 |
9 | #Set the names of the dataset dataframe:
10 | names(dataset) <- sapply((1:nrow(names)),function(i) toString(names[i,1]))
11 |
12 | #make column y a factor variable for binary classification (spam or non-spam)
13 | dataset$y <- as.factor(dataset$y)
14 |
15 |
16 | #get a sample of 1000 rows
17 | sample <- dataset[sample(nrow(dataset), 1000),]
18 |
19 |
20 | #Set up the packages:
21 |
22 | #install.packages("caret")
23 |
24 | require(caret)
25 |
26 | #install.packages("kernlab")
27 |
28 | require(kernlab)
29 |
30 | #install.packages("doMC")
31 |
32 | require(doMC)
33 |
34 |
35 | #Split the data in dataTrain and dataTest
36 | trainIndex <- createDataPartition(sample$y, p = .8, list = FALSE, times = 1)
37 | dataTrain <- sample[ trainIndex,]
38 | dataTest <- sample[-trainIndex,]
39 |
40 | #set up multicore environment
41 | registerDoMC(cores=5)
42 |
43 |
44 | #Create the SVM model:
45 |
46 | ### finding optimal value of a tuning parameter
47 | sigDist <- sigest(y ~ ., data = dataTrain, frac = 1)
48 | ### creating a grid of two tuning parameters, .sigma comes from the earlier line. we are trying to find best value of .C
49 | svmTuneGrid <- data.frame(.sigma = sigDist[1], .C = 2^(-2:7))
50 |
51 | x <- train(y ~ .,
52 | data = dataTrain,
53 | method = "svmRadial",
54 | preProc = c("center", "scale"),
55 | tuneGrid = svmTuneGrid,
56 | trControl = trainControl(method = "repeatedcv", repeats = 5,
57 | classProbs = TRUE))
58 |
59 | #Evaluate the model
60 | pred <- predict(x,dataTest[,1:57])
61 |
62 | acc <- confusionMatrix(pred,dataTest$y)
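63 |
64 | # overall accuracy and detailed per-class statistics on the held-out test set
65 | acc$overall["Accuracy"]
66 | acc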
--------------------------------------------------------------------------------
/twitter_authentication.r:
--------------------------------------------------------------------------------
1 | # https://dev.twitter.com/
2 |
3 | # Install the newest version of the twitteR package from GitHub
4 | install.packages(c("devtools", "rjson", "bit64", "httr"))
5 |
6 | #RESTART R session!
7 |
8 | library(devtools)
9 | install_github("geoffjentry/twitteR") # current devtools syntax ("user/repo")
10 | library(twitteR)
11 |
12 |
13 | require(twitteR)
14 |
15 |
16 | #library(RCurl)
17 | # Set SSL certs globally
18 | #options(RCurlOptions = list(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl")))
19 |
20 | # The URLs below were used by the older ROAuth-based flow; setup_twitter_oauth() only needs the API key and secret.
21 | reqURL <- "https://api.twitter.com/oauth/request_token"
22 | accessURL <- "https://api.twitter.com/oauth/access_token"
23 | authURL <- "https://api.twitter.com/oauth/authorize"
24 | apiKey <- "yourAPIkey"
25 | apiSecret <- "yourAPIsecret"
26 |
27 |
28 |
29 | setup_twitter_oauth(apiKey, apiSecret)
30 |
31 |
32 |
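33 | # sanity check: if the OAuth handshake succeeded, a small search should return tweets
34 | searchTwitter("rstats", n = 5)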
--------------------------------------------------------------------------------