├── .Rapp.history
├── README.md
├── Twitter_devices.r
├── r_facebook_gender.r
├── r_foursquare_map.r
├── r_googleplus.r
├── r_instagram.r
├── r_mongoDB.r
├── r_mongoDB_REST.r
├── r_pinterest_machine_learning.R
├── r_rfacebook.r
├── r_twitter_cluster.r
├── sentiment_cloud.r
├── sentiment_datumbox.r
├── sentiment_viralheat.r
├── spam_class_r.r
└── twitter_authentication.r
/.Rapp.history:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JulianHill/R-Tutorials/670374ca3c13fc9c656ccddee5a411fca10e01a9/.Rapp.history
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | R-Tutorials
2 | ===========
3 |
4 | Code from the R tutorials on my blog
5 |
--------------------------------------------------------------------------------
/Twitter_devices.r:
--------------------------------------------------------------------------------
1 | tweets = searchTwitter("Social Media", n=20, cainfo="cacert.pem") # requires library(twitteR) and a completed OAuth handshake (see twitter_authentication.r)
2 |
3 | devices <- sapply(tweets, function(x) x$getStatusSource())
4 |
5 | devices <- gsub("</a>", "", devices) # strip the closing anchor tag from the HTML source string
6 | devices <- strsplit(devices, ">")
7 |
8 | devices <- sapply(devices,function(x) ifelse(length(x) > 1, x[2], x[1]))
9 |
10 | pie(table(devices))
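11 |
12 | # Note: getStatusSource() returns an HTML snippet such as '<a href="...">Twitter for iPhone</a>';
13 | # stripping "</a>" and splitting on ">" leaves only the client name, which table() and pie() then summarize.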
--------------------------------------------------------------------------------
/r_facebook_gender.r:
--------------------------------------------------------------------------------
1 |
2 | require(Rfacebook)
3 |
4 | # Change these to fit your needs
5 | page_name <- "forbes"
6 | number_posts <- 2
7 | token <- "XXX"
8 |
9 | #Get the general page info
10 | page <- getPage(page_name, token, n = number_posts, feed = FALSE)
11 |
12 |
13 | #Extract the post ids
14 | posts <- page$id
15 |
16 |
17 | data_frame_gender <- data.frame(post=character(),male=numeric(),female=numeric(),etc=numeric(),likes=numeric(),type=character(),stringsAsFactors=FALSE)
18 | #process each post and analyze the gender distribution of the likes
19 | for(i in 1:length(posts))
20 | {
21 | temp <- posts[i]
22 |
23 | post <- getPost(temp,token)
24 |
25 | data_frame_gender[i,1] <- post$post$message
26 | data_frame_gender[i,5] <- post$post$likes
27 | data_frame_gender[i,6] <- post$post$type
28 |
29 | gender_frame <- data.frame(gender=character(),stringsAsFactors=FALSE)
30 |
31 | for(j in 1:length(post$likes$from_id))
32 | {
33 | likes <- post$likes$from_id
34 | user_id <- likes[j]
35 |
36 | user <- getUsers(user_id,token=token)
37 |
38 | gender <- user$gender
39 |
40 | gender_frame[nrow(gender_frame)+1,] <- gender
41 |
42 | }
43 |
44 | number_males <- nrow(subset(gender_frame, gender=="male"))
45 | number_females <- nrow(subset(gender_frame, gender=="female"))
46 | number_etc <- data_frame_gender[i,5] - (number_males+number_females)
47 |
48 | data_frame_gender[i,2] <- number_males
49 | data_frame_gender[i,3] <- number_females
50 | data_frame_gender[i,4] <- number_etc
51 |
52 | }
53 |
54 |
55 | slices <- c(sum(data_frame_gender$male),sum(data_frame_gender$female),sum(data_frame_gender$etc))
56 |
57 | pct <- round(slices/sum(slices)*100)
58 | lbls <- names(data_frame_gender[2:4])
59 | lbls <- paste(lbls, pct) # add percents to labels
60 | lbls <- paste(lbls,"%",sep="") # add % sign to labels
61 |
62 | pie(slices, labels = lbls, main="Gender Distribution of all analyzed posts")
63 |
64 |
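65 | # Sketch of a faster alternative (assumption: Rfacebook's getUsers accepts a vector of user ids,
66 | # as documented), fetching all likers of a post in one call instead of looping:
67 | # likers <- getUsers(post$likes$from_id, token = token)
68 | # table(likers$gender)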
--------------------------------------------------------------------------------
/r_foursquare_map.r:
--------------------------------------------------------------------------------
1 | library(rjson)
2 | library(RCurl)
3 | library(httr)
4 |
5 | #Authentication:
6 | require(devtools) # install if necessary: install.packages("devtools")
7 |
8 | dev_mode(on=T)
9 |
10 | install_github("JulianHill/ThinkToStartR") # current devtools syntax ("user/repo")
11 |
12 |
13 | require(ThinkToStartR)
14 | library(rjson)
15 | require(RCurl)
16 |
17 |
18 | token <- ThinkToStart("Foursquare_auth",app_name="R_Test",app_id="XXX",app_secret="XXX")
19 |
20 |
21 | ####Get the Data
22 |
23 | data <- fromJSON(getURL(paste('https://api.foursquare.com/v2/users/self/venuehistory?oauth_token=',token,'&v=',format(Sys.time(), "%Y%m%d"),sep="")))
24 |
25 | response <- data$response
26 | venues <- response$venues$items
27 |
28 |
29 | no_venues = length(data$response$venues$items)
30 |
31 | df = data.frame(no = 1:no_venues)
32 |
33 |
34 | for (i in 1:nrow(df)){
35 |
36 | #Add Name and the location of the Venue
37 | df$venue_name[i] <- venues[[i]]$venue$name
38 | df$venue_lat[i] <- venues[[i]]$venue$location$lat
39 | df$venue_lng[i] <- venues[[i]]$venue$location$lng
40 |
41 | ##########################
42 | #Add the address of the location
43 | if(length(venues[[i]]$venue$location$address)>0)
44 | {
45 | df$venue_address[i] <- venues[[i]]$venue$location$address
46 | }
47 | else{
48 | df$venue_address[i] <- "No Address Available"
49 |
50 | }
51 |
52 | ##########################
53 | #Add the city of the location
54 | if(length(venues[[i]]$venue$location$city)>0)
55 | {
56 | df$venue_city[i] <- venues[[i]]$venue$location$city
57 | }
58 | else{
59 | df$venue_city[i] <- "No City Available"
60 |
61 | }
62 |
63 | ##########################
64 | #Add the number of check-ins of the venue
65 | df$venue_checkinsCount[i] <- venues[[i]]$venue$stats[[1]]
66 |
67 | ##########################
68 | #Add the URL of the venue if defined
69 | if(length(venues[[i]]$venue$url)>0)
70 | {
71 | df$url[i] <- venues[[i]]$venue$url
72 | }
73 | else{
74 | df$url[i] <- NA
75 |
76 | }
77 |
78 | }
79 |
80 |
81 | mean_lat <- mean(df$venue_lat) # outcome: 50.90956
82 | mean_lon <- mean(df$venue_lng) # outcome: 7.576119
83 |
84 | require(rCharts)
85 |
86 | map <- Leaflet$new()
87 | map$setView(c(mean_lat, mean_lon), zoom = 5)
88 |
89 |
90 | for (i in 1:no_venues){
91 |
92 | #Get the name and the number of check-ins of the current venue
93 | name <- df$venue_name[i]
94 | checkins <- df$venue_checkinsCount[i]
95 |
96 | #Add the marker to the map but just add a website link if we have a URL for the venue
97 |
98 | #if no URL is available, add a plain popup; otherwise include a link to the venue website
99 | if(is.na(df$url[i]))
100 | { map$marker(c(df$venue_lat[i], df$venue_lng[i]), bindPopup = paste(name, '<br>Checkins: ', checkins, sep=""))
101 | }
102 | else
103 | {
104 | map$marker(c(df$venue_lat[i], df$venue_lng[i]), bindPopup = paste(name, '<br>Checkins: ', checkins, '<br><a href="', df$url[i], '">Website</a>', sep=""))
105 | }
106 |
107 | }
108 |
109 | map
110 |
111 |
112 |
113 |
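114 | # To keep the map as a standalone HTML file (assumption: this rCharts version exposes a save() method):
115 | # map$save("foursquare_map.html", cdn = TRUE)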
--------------------------------------------------------------------------------
/r_googleplus.r:
--------------------------------------------------------------------------------
1 | library(RCurl);
2 | library(RJSONIO);
3 | api_key<-"XXX"
4 |
5 | user_id <- "105616015219357887822"
6 | # Still need to add ssl.verifypeer = FALSE to get a connection :(
7 | # Add a max results parameter in the URL structure to get 100 results (maximum allowed by the API : https://developers.google.com/+/api/)
8 | data <- getURL(paste("https://www.googleapis.com/plus/v1/people/",user_id,"/activities/public?maxResults=100&key=", api_key, sep=""),ssl.verifypeer = FALSE)
9 | js <- fromJSON(data, asText=TRUE);
10 |
11 | df = data.frame(no = 1:length(js$items))
12 |
13 | for (i in 1:nrow(df)){
14 | df$kind[i] = js$items[[i]]$verb
15 | df$title[i] = js$items[[i]]$title
16 | df$published[i] = js$items[[i]]$published # add publish date to the df
17 | df$replies[i] = js$items[[i]]$object$replies$totalItems
18 | df$plusones[i] = js$items[[i]]$object$plusoners$totalItems
19 | df$reshares[i] = js$items[[i]]$object$resharers$totalItems
20 | df$url[i] = js$items[[i]]$object$url
21 |
22 | }
23 |
24 | # Export to .csv
25 | filename <- paste("gplus_data_", user_id, sep="") # in case we have more user_ids
26 | write.table(df, file = paste0(filename,".csv"), sep = ",", col.names = NA,
27 | qmethod = "double")
28 |
29 | df_graph = df[,c(1,5,6,7)]
30 |
31 |
32 |
33 | require(ggplot2)
34 | require(reshape2)
35 |
36 | melted=melt(df_graph,id.vars='no')
37 |
38 | ggplot(melted,aes(x=factor(no),y=value,color=factor(variable),group=factor(variable)))+
39 | geom_line()+xlab('no')+guides(color=guide_legend("metrics"))+
40 | labs(title="Google+")
41 |
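42 | # Save the chart to disk (ggsave writes the most recently displayed ggplot; the filename is just an example)
43 | ggsave("gplus_metrics.png", width = 10, height = 6)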
--------------------------------------------------------------------------------
/r_instagram.r:
--------------------------------------------------------------------------------
1 | #Analyze Instagram with R
2 | #Author: Julian Hillebrand
3 |
4 | #packages
5 | require(httr)
6 | require(rjson)
7 | require(RCurl)
8 |
9 |
10 | #Authentication
11 |
12 | ## getting callback URL
13 | full_url <- oauth_callback()
14 | full_url <- gsub("(.*localhost:[0-9]{1,5}/).*", x=full_url, replacement="\\1")
15 | message <- paste("Copy and paste into Site URL on Instagram App Settings:",
16 |                  full_url, "\nWhen done, press any key to continue...")
17 |
18 | invisible(readline(message))
19 |
20 | app_name <- "ThinkToStartTest"
21 | client_id <- "CLIENT_ID"
22 | client_secret <- "CLIENT_SECRET"
23 | scope = "public_content"
24 |
25 |
26 |
27 | instagram <- oauth_endpoint(
28 | authorize = "https://api.instagram.com/oauth/authorize",
29 | access = "https://api.instagram.com/oauth/access_token")
30 | myapp <- oauth_app(app_name, client_id, client_secret)
31 |
32 | #scope <- NULL
33 | ig_oauth <- oauth2.0_token(instagram, myapp,scope=scope, type = "application/x-www-form-urlencoded",cache=FALSE)
34 | tmp <- strsplit(toString(names(ig_oauth$credentials)), '"')
35 | token <- tmp[[1]][30]
36 |
37 | ########################################################
38 |
39 |
40 | user_info <- fromJSON(getURL(paste('https://api.instagram.com/v1/users/self/?access_token=',token,sep="")))
41 |
42 |
43 |
44 | received_profile <- user_info$data$id
45 |
46 |
47 |
48 |
49 |
50 | #Get recent media (20 pictures)
51 | media <- fromJSON(getURL(paste('https://api.instagram.com/v1/users/self/media/recent/?access_token=',token,sep="")))
52 |
53 |
54 |
55 | df = data.frame(no = 1:length(media$data))
56 |
57 | for(i in 1:length(media$data))
58 | {
59 | #comments
60 | df$comments[i] <-media$data[[i]]$comments$count
61 |
62 | #likes:
63 | df$likes[i] <- media$data[[i]]$likes$count
64 |
65 | #date
66 | df$date[i] <- toString(as.POSIXct(as.numeric(media$data[[i]]$created_time), origin="1970-01-01"))
67 | }
68 |
69 | #Visualization
70 |
71 | require(rCharts)
72 |
73 | m1 <- mPlot(x = "date", y = c("likes", "comments"), type = "Line", data = df)
74 |
75 |
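76 | m1  # print the rCharts object to render the chart (RStudio viewer or browser)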
--------------------------------------------------------------------------------
/r_mongoDB.r:
--------------------------------------------------------------------------------
1 | # install the package used to connect to MongoDB
2 | install.packages("rmongodb")
3 | library(rmongodb)
4 | # connect to MongoDB
5 | mongo = mongo.create(host = "localhost")
6 | mongo.is.connected(mongo)
7 |
8 | mongo.get.databases(mongo)
9 |
10 | mongo.get.database.collections(mongo, db = "tweetDB2") # "tweetDB2" is the database where the Twitter data is stored
11 |
12 | library(plyr)
13 | ## create the empty data frame
14 | df1 = data.frame(stringsAsFactors = FALSE)
15 |
16 | ## create the namespace
17 | DBNS = "tweetDB2.#analytic"
18 |
19 | ## create the cursor we will iterate over, basically a select * in SQL
20 | cursor = mongo.find(mongo, DBNS)
21 |
22 | ## create the counter
23 | i = 1
24 |
25 | ## iterate over the cursor
26 | while (mongo.cursor.next(cursor)) {
27 | # iterate and grab the next record
28 | tmp = mongo.bson.to.list(mongo.cursor.value(cursor))
29 | # make it a dataframe
30 | tmp.df = as.data.frame(t(unlist(tmp)), stringsAsFactors = F)
31 | # bind to the master dataframe
32 | df1 = rbind.fill(df1, tmp.df)
33 | }
34 |
35 | dim(df1)
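36 |
37 | # release the cursor and the connection when finished (rmongodb cleanup functions)
38 | mongo.cursor.destroy(cursor)
39 | mongo.destroy(mongo)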
--------------------------------------------------------------------------------
/r_mongoDB_REST.r:
--------------------------------------------------------------------------------
1 | library(RCurl)
2 | library(rjson)
3 |
4 | database = "tweetDB"
5 | collection = "Apple"
6 | limit = 100 # numeric; paste() coerces it to character when building the URL
7 | db <- paste("http://localhost:28017/",database,"/",collection,"/?limit=",limit,sep = "")
8 | # port 28017 is MongoDB's legacy HTTP/REST interface; enable it with the --rest option if the request fails
9 | tweets <- fromJSON(getURL(db))
10 |
11 | tweet_df = data.frame(text=1:limit)
12 | for (i in 1:limit){
13 | tweet_df$text[i] = tweets$rows[[i]]$tweet_text}
14 | tweet_df
15 |
--------------------------------------------------------------------------------
/r_pinterest_machine_learning.R:
--------------------------------------------------------------------------------
1 | # Pinterest
2 |
3 | #Analyze Pinterest with R
4 | #Author: Julian Hillebrand
5 |
6 | #packages
7 | require(httr)
8 | require(rjson)
9 | require(RCurl)
10 |
11 |
12 | #Authentication
13 |
14 | ## getting callback URL
15 | full_url <- oauth_callback()
16 | full_url <- gsub("(.*localhost:[0-9]{1,5}/).*", x=full_url, replacement="\\1")
17 | message <- paste("Copy and paste into Site URL on Pinterest App Settings:",
18 |                  full_url, "\nWhen done, press any key to continue...")
19 |
20 | invisible(readline(message))
21 |
22 | app_name <- "R Test"
23 | client_id <- "XXX"
24 | client_secret <- "XXX"
25 | scope = "read_public"
26 | redirect_uri="https://mywebsite.com/connect/pinterest/"
27 |
28 | # OAuth authorization URL structure (reference only, not valid R): https://api.pinterest.com/oauth/?
29 | #   response_type=code&
30 | #   redirect_uri=https://mywebsite.com/connect/pinterest/&
31 | #   client_id=12345&
32 | #   scope=read_public,write_public&
33 | #   state=768uyFys
34 |
35 | # user_info <- fromJSON(getURL(paste("https://api.pinterest.com/oauth?response_type=code&client_id=",client_id,"&scope=",scope,sep="")),unexpected.escape ="keep")  # leftover: the oauth endpoint returns an HTML page, not JSON
36 |
37 | paste("https://api.pinterest.com/oauth?response_type=code&client_id=",client_id,"&scope=",scope,sep="")  # the authorization URL to open in a browser
38 |
39 | pinterest <- oauth_endpoint(
40 | authorize = "https://api.pinterest.com/oauth",
41 | access = "https://api.pinterest.com/v1/oauth/token")
42 |
43 | myapp <- oauth_app(app_name, client_id, client_secret)
44 |
45 |
46 |
47 | #scope <- NULL
48 | pi_oauth <- oauth2.0_token(pinterest, myapp,scope=scope,use_oob = TRUE, as_header = TRUE)
49 |
50 |
51 | # fb_ep = oauth_endpoint(token_url, auth_url, access_url)   # leftover: token_url/auth_url/access_url are never defined
52 | # pi_oauth <- oauth1.0_token(pinterest, myapp)               # leftover: Pinterest uses OAuth 2.0 (pi_oauth is created above)
53 |
54 |
55 |
56 |
57 | tmp <- strsplit(toString(names(pi_oauth$credentials)), '"')
58 | token <- tmp[[1]][4]
59 |
60 | ########################################################
61 | # Note: the remainder of this script reuses the Instagram recent-media example; it does not call the Pinterest API.
62 | username <- "therock"
63 |
64 | #search for the username
65 | user_info <- fromJSON(getURL(paste('https://api.instagram.com/v1/users/search?q=',username,'&access_token=',token,sep="")),unexpected.escape = "keep")
66 |
67 | received_profile <- user_info$data[[1]]
68 |
69 | if(grepl(received_profile$username,username))
70 | {
71 | user_id <- received_profile$id
72 | #Get recent media (20 pictures)
73 | media <- fromJSON(getURL(paste('https://api.instagram.com/v1/users/',user_id,'/media/recent/?access_token=',token,sep="")))
74 |
75 |
76 | df = data.frame(no = 1:length(media$data))
77 |
78 | for(i in 1:length(media$data))
79 | {
80 | #comments
81 | df$comments[i] <-media$data[[i]]$comments$count
82 |
83 | #likes:
84 | df$likes[i] <- media$data[[i]]$likes$count
85 |
86 | #date
87 | df$date[i] <- toString(as.POSIXct(as.numeric(media$data[[i]]$created_time), origin="1970-01-01"))
88 | }
89 |
90 | #Visualization
91 |
92 | require(rCharts)
93 |
94 | m1 <- mPlot(x = "date", y = c("likes", "comments"), type = "Line", data = df)
95 |
96 |
97 | }else
98 | {
99 | print("Error: User not found!")
100 | }
101 |
--------------------------------------------------------------------------------
/r_rfacebook.r:
--------------------------------------------------------------------------------
1 | install.packages("devtools")
2 | library(devtools)
3 |
4 | install_github("pablobarbera/Rfacebook", subdir="Rfacebook") # current devtools syntax ("user/repo")
5 |
6 |
7 | require("Rfacebook")
8 |
9 |
10 |
11 |
12 |
13 | ######Using the App Authentication:
14 |
15 | fb_oauth <- fbOAuth(app_id="123456789", app_secret="1A2B3C4D",extended_permissions = TRUE)
16 |
17 |
18 | #now we have our fb_oauth connection
19 | #we will just save them to be able to use them later
20 | save(fb_oauth, file="fb_oauth")
21 |
22 | #so if you want to connect to Facebook again you just have to call
23 | load("fb_oauth")
24 |
25 | me <- getUsers("me",token=fb_oauth)
26 |
27 | my_likes <- getLikes(user="me", token=fb_oauth)
28 |
29 |
30 | ######Using the Token Authentication:
31 |
32 | token <- 'YOUR AUTHENTICATION TOKEN'
33 | me <- getUsers("me", token, private_info=TRUE)
34 |
35 | getUsers(c("barackobama", "donaldtrump"), token)
36 |
37 |
38 | getFriends(token, simplify = FALSE)
39 |
40 | my_friends <- getFriends(token=fb_oauth, simplify=TRUE)
41 |
42 | head(my_friends, n=10)
43 |
44 | # getUser()  # leftover call; the Rfacebook function is getUsers(), used below
45 |
46 | my_friends_info <- getUsers(my_friends$id, token=fb_oauth, private_info=TRUE)
47 |
48 | #create a table with the relationship statuses
49 |
50 | table(my_friends_info$relationship_status)
51 |
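52 | # quick visual of the relationship-status counts above (base R)
53 | barplot(table(my_friends_info$relationship_status), las = 2)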
--------------------------------------------------------------------------------
/r_twitter_cluster.r:
--------------------------------------------------------------------------------
1 | # run this if you don't have one or more of the next packages:
2 | # install.packages(c("devtools", "RCurl", "rjson", "bit64","httr","ROAuth"))
3 | # library(devtools)
4 | # install_github("geoffjentry/twitteR")
5 | # install_github("ramnathv/rCharts")
6 |
7 |
8 |
9 | library(RCurl)
10 | # Set SSL certs globally
11 | options(RCurlOptions = list(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl")))
12 |
13 | library(twitteR)
14 |
15 | #Authentication
16 | #http://thinktostart.wordpress.com/2013/05/22/twitter-authentification-with-r/
17 |
18 |
19 | library(rCharts)
20 |
21 | user <- getUser("ACCOUNT-TO-BE-ANALYZED")
22 | userFriends <- user$getFriends(n=5000) # call getFriends()/getFollowers() without n to fetch all friends and followers
23 | userFollowers <- user$getFollowers(n=5000)
24 | userNeighbors <- union(userFollowers, userFriends)
25 | userNeighbors.df = twListToDF(userNeighbors)
26 |
27 | userNeighbors.df[userNeighbors.df=="0"]<-1 # replace zero counts with 1 so the log transform below is defined
28 |
29 | userNeighbors.df$logFollowersCount <-log(userNeighbors.df$followersCount)
30 |
31 | userNeighbors.df$logFriendsCount <-log(userNeighbors.df$friendsCount)
32 |
33 | kObject.log <- data.frame(userNeighbors.df$logFriendsCount,userNeighbors.df$logFollowersCount)
34 |
35 | ###elbow
36 | mydata <- kObject.log
37 | wss <- (nrow(mydata)-1)*sum(apply(mydata,2,var))
38 | for (i in 2:15) wss[i] <- sum(kmeans(mydata,
39 | centers=i)$withinss)
40 | plot(1:15, wss, type="b", xlab="Number of Clusters",
41 | ylab="Within groups sum of squares")
42 |
43 |
44 | ###k-means
45 |
46 | ##Run the K Means algorithm, remember to specify centers from 'elbow plot'
47 | userMeans.log <- kmeans(kObject.log, centers=4, iter.max=10, nstart=100)
48 |
49 | ##Add the vector of specified clusters back to the original vector as a factor
50 | kObject.log$cluster=factor(userMeans.log$cluster)
51 | userNeighbors.df$cluster <- kObject.log$cluster
52 |
53 |
54 | p2 <- nPlot(logFollowersCount ~ logFriendsCount, group = 'cluster', data = userNeighbors.df, type = 'scatterChart')
55 | p2$xAxis(axisLabel = 'Friends Count (log)')
56 | p2$yAxis(axisLabel = 'Followers Count (log)')
57 | p2$chart(tooltipContent = "#! function(key, x, y, e){
58 | return e.point.screenName + ' Followers: ' + e.point.followersCount +' Friends: ' + e.point.friendsCount
59 | } !#")
60 | p2
61 |
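62 | # inspect the result: how many accounts ended up in each cluster
63 | table(userNeighbors.df$cluster)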
--------------------------------------------------------------------------------
/sentiment_cloud.r:
--------------------------------------------------------------------------------
1 | library(twitteR)
2 | library(RCurl)
3 | library(RJSONIO)
4 | library(stringr)
5 | library(tm)
6 | library(wordcloud)
7 |
8 |
9 | ####################################################################
10 |
11 | getSentiment <- function (text, key){
12 |
13 | text <- URLencode(text);
14 |
15 | #save all the spaces, then get rid of the weird characters that break the API, then convert back the URL-encoded spaces.
16 | text <- str_replace_all(text, "%20", " ");
17 | text <- str_replace_all(text, "%\\d\\d", "");
18 | text <- str_replace_all(text, " ", "%20");
19 |
20 |
21 | if (str_length(text) > 360){
22 | text <- substr(text, 0, 359);
23 | }
24 | ##########################################
25 |
26 | data <- getURL(paste("http://api.datumbox.com/1.0/TwitterSentimentAnalysis.json?api_key=", key, "&text=",text, sep=""))
27 |
28 | js <- fromJSON(data, asText=TRUE);
29 |
30 | # get the sentiment result from the API response
31 | sentiment = js$output$result
32 |
33 | ###################################
34 |
35 |
36 | return(list(sentiment=sentiment))
37 | }
38 |
39 | clean.text <- function(some_txt)
40 | {
41 | some_txt = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", some_txt)
42 | some_txt = gsub("@\\w+", "", some_txt)
43 | some_txt = gsub("[[:punct:]]", "", some_txt)
44 | some_txt = gsub("[[:digit:]]", "", some_txt)
45 | some_txt = gsub("http\\w+", "", some_txt)
46 | some_txt = gsub("[ \t]{2,}", "", some_txt)
47 | some_txt = gsub("^\\s+|\\s+$", "", some_txt)
48 | some_txt = gsub("amp", "", some_txt)
49 | # define "tolower error handling" function
50 | try.tolower = function(x)
51 | {
52 | y = NA
53 | try_error = tryCatch(tolower(x), error=function(e) e)
54 | if (!inherits(try_error, "error"))
55 | y = tolower(x)
56 | return(y)
57 | }
58 |
59 | some_txt = sapply(some_txt, try.tolower)
60 | some_txt = some_txt[some_txt != ""]
61 | names(some_txt) = NULL
62 | return(some_txt)
63 | }
64 |
65 |
66 |
67 | ###########################################################
68 | # parameters: set these before running (example values / placeholder key)
69 | keyword <- "iPhone"; n <- 200    # search term and number of tweets
70 | db_key <- "API_KEY"              # your Datumbox API key
71 | print("Getting tweets...")
72 | # get some tweets
73 | tweets = searchTwitter(keyword, n, lang="en")
74 | # get text
75 | tweet_txt = sapply(tweets, function(x) x$getText())
76 |
77 | # clean text
78 | tweet_clean = clean.text(tweet_txt)
79 | tweet_num = length(tweet_clean)
80 | # data frame (text, sentiment)
81 | tweet_df = data.frame(text=tweet_clean, sentiment=rep("", tweet_num),stringsAsFactors=FALSE)
82 |
83 | print("Getting sentiments...")
84 | # apply function getSentiment
85 | sentiment = rep(0, tweet_num)
86 | for (i in 1:tweet_num)
87 | {
88 | tmp = getSentiment(tweet_clean[i], db_key)
89 |
90 | tweet_df$sentiment[i] = tmp$sentiment
91 |
92 | print(paste(i," of ", tweet_num))
93 |
94 |
95 | }
96 |
97 | # delete rows with no sentiment
98 | tweet_df <- tweet_df[tweet_df$sentiment!="",]
99 |
100 |
101 | #separate text by sentiment
102 | sents = levels(factor(tweet_df$sentiment))
103 | #emos_label <- emos
104 |
105 |
106 | # get the labels and percents
107 |
108 | labels <- lapply(sents, function(x) paste(x,format(round((length((tweet_df[tweet_df$sentiment ==x,])$text)/length(tweet_df$sentiment)*100),2),nsmall=2),"%"))
109 |
110 |
111 |
112 | nemo = length(sents)
113 | emo.docs = rep("", nemo)
114 | for (i in 1:nemo)
115 | {
116 | tmp = tweet_df[tweet_df$sentiment == sents[i],]$text
117 |
118 | emo.docs[i] = paste(tmp,collapse=" ")
119 | }
120 |
121 |
122 |
123 | # remove stopwords
124 | emo.docs = removeWords(emo.docs, stopwords("german"))
125 | emo.docs = removeWords(emo.docs, stopwords("english"))
126 | corpus = Corpus(VectorSource(emo.docs))
127 | tdm = TermDocumentMatrix(corpus)
128 | tdm = as.matrix(tdm)
129 | colnames(tdm) = labels
130 |
131 |
132 |
133 |
134 | # comparison word cloud
135 | comparison.cloud(tdm, colors = brewer.pal(nemo, "Dark2"),
136 | scale = c(3,.5), random.order = FALSE, title.size = 1.5)
137 |
138 |
139 |
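140 | # To write the cloud to a file instead of the plotting window (base R graphics devices):
141 | # png("sentiment_cloud.png", width = 800, height = 800)
142 | # comparison.cloud(tdm, colors = brewer.pal(nemo, "Dark2"), scale = c(3, .5), random.order = FALSE, title.size = 1.5)
143 | # dev.off()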
--------------------------------------------------------------------------------
/sentiment_datumbox.r:
--------------------------------------------------------------------------------
1 | # load packages
2 | library(twitteR)
3 | library(RCurl)
4 | library(RJSONIO)
5 | library(stringr)
6 |
7 |
8 |
9 | getSentiment <- function (text, key){
10 |
11 |
12 |
13 | text <- URLencode(text);
14 |
15 | #save all the spaces, then get rid of the weird characters that break the API, then convert back the URL-encoded spaces.
16 | text <- str_replace_all(text, "%20", " ");
17 | text <- str_replace_all(text, "%\\d\\d", "");
18 | text <- str_replace_all(text, " ", "%20");
19 |
20 |
21 | if (str_length(text) > 360){
22 | text <- substr(text, 0, 359);
23 | }
24 | ##########################################
25 |
26 | data <- getURL(paste("http://api.datumbox.com/1.0/TwitterSentimentAnalysis.json?api_key=", key, "&text=",text, sep=""))
27 |
28 | js <- fromJSON(data, asText=TRUE);
29 |
30 | # get the sentiment result
31 | sentiment = js$output$result
32 |
33 | ###################################
34 |
35 | data <- getURL(paste("http://api.datumbox.com/1.0/SubjectivityAnalysis.json?api_key=", key, "&text=",text, sep=""))
36 |
37 | js <- fromJSON(data, asText=TRUE);
38 |
39 | # get the subjectivity result
40 | subject = js$output$result
41 |
42 | ##################################
43 |
44 | data <- getURL(paste("http://api.datumbox.com/1.0/TopicClassification.json?api_key=", key, "&text=",text, sep=""))
45 |
46 | js <- fromJSON(data, asText=TRUE);
47 |
48 | # get the topic classification result
49 | topic = js$output$result
50 |
51 | ##################################
52 | data <- getURL(paste("http://api.datumbox.com/1.0/GenderDetection.json?api_key=", key, "&text=",text, sep=""))
53 |
54 | js <- fromJSON(data, asText=TRUE);
55 |
56 | # get the gender detection result
57 | gender = js$output$result
58 |
59 | return(list(sentiment=sentiment,subject=subject,topic=topic,gender=gender))
60 | }
61 |
62 |
63 |
64 | #################
65 | clean.text <- function(some_txt)
66 | {
67 | some_txt = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", some_txt)
68 | some_txt = gsub("@\\w+", "", some_txt)
69 | some_txt = gsub("[[:punct:]]", "", some_txt)
70 | some_txt = gsub("[[:digit:]]", "", some_txt)
71 | some_txt = gsub("http\\w+", "", some_txt)
72 | some_txt = gsub("[ \t]{2,}", "", some_txt)
73 | some_txt = gsub("^\\s+|\\s+$", "", some_txt)
74 |
75 | # define "tolower error handling" function
76 | try.tolower = function(x)
77 | {
78 | y = NA
79 | try_error = tryCatch(tolower(x), error=function(e) e)
80 | if (!inherits(try_error, "error"))
81 | y = tolower(x)
82 | return(y)
83 | }
84 |
85 | some_txt = sapply(some_txt, try.tolower)
86 | some_txt = some_txt[some_txt != ""]
87 | names(some_txt) = NULL
88 | return(some_txt)
89 | }
90 |
91 |
92 | # harvest tweets
93 | tweets = searchTwitter("iPhone", n=200, lang="en")
94 |
95 |
96 | # get text
97 | tweet_txt = sapply(tweets, function(x) x$getText())
98 |
99 |
100 | # clean text
101 | tweet_clean = clean.text(tweet_txt)
102 |
103 | #####################################
104 | # how many tweets
105 | tweet_num = length(tweet_clean)
106 |
107 | # data frame (text, sentiment, score)
108 | tweet_df = data.frame(text=tweet_clean, sentiment=rep("", tweet_num),
109 | subject=1:tweet_num, topic=1:tweet_num, gender=1:tweet_num, stringsAsFactors=FALSE)
110 |
111 | # apply function getSentiment
112 | sentiment = rep(0, tweet_num)
113 | for (i in 1:tweet_num)
114 | {
115 | tmp = getSentiment(tweet_clean[i], "API_KEY")
116 |
117 | tweet_df$sentiment[i] = tmp$sentiment
118 |
119 | tweet_df$subject[i] = tmp$subject
120 | tweet_df$topic[i] = tmp$topic
121 | tweet_df$gender[i] = tmp$gender
122 | }
123 |
124 |
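125 | # quick overview of the classified tweets
126 | table(tweet_df$sentiment)
127 | table(tweet_df$topic)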
--------------------------------------------------------------------------------
/sentiment_viralheat.r:
--------------------------------------------------------------------------------
1 | library(twitteR)
2 | library(RCurl)
3 | library(RJSONIO)
4 | library(stringr)
5 |
6 | getSentiment <- function (text, key){
7 | library(RCurl);
8 | library(RJSONIO);
9 |
10 | text <- URLencode(text);
11 |
12 | #save all the spaces, then get rid of the weird characters that break the API, then convert back the URL-encoded spaces.
13 | text <- str_replace_all(text, "%20", " ");
14 | text <- str_replace_all(text, "%\\d\\d", "");
15 | text <- str_replace_all(text, " ", "%20");
16 |
17 | if (str_length(text) > 360){
18 | text <- substr(text, 0, 359);
19 | }
20 |
21 | data <- getURL(paste("https://www.viralheat.com/api/sentiment/review.json?api_key=", key, "&text=",text, sep=""))
22 |
23 | js <- fromJSON(data, asText=TRUE);
24 |
25 | # get mood probability
26 | score = js$prob
27 |
28 | # positive, negative or neutral?
29 | if (js$mood != "positive")
30 | {
31 | if (js$mood == "negative") {
32 | score = -1 * score
33 | } else {
34 | # neutral
35 | score = 0
36 | }
37 | }
38 |
39 | return(list(mood=js$mood, score=score))
40 | }
41 |
42 | clean.text <- function(some_txt)
43 | {
44 | some_txt = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", some_txt)
45 | some_txt = gsub("@\\w+", "", some_txt)
46 | some_txt = gsub("[[:punct:]]", "", some_txt)
47 | some_txt = gsub("[[:digit:]]", "", some_txt)
48 | some_txt = gsub("http\\w+", "", some_txt)
49 | some_txt = gsub("[ \t]{2,}", "", some_txt)
50 | some_txt = gsub("^\\s+|\\s+$", "", some_txt)
51 |
52 | # define "tolower error handling" function
53 | try.tolower = function(x)
54 | {
55 | y = NA
56 | try_error = tryCatch(tolower(x), error=function(e) e)
57 | if (!inherits(try_error, "error"))
58 | y = tolower(x)
59 | return(y)
60 | }
61 |
62 | some_txt = sapply(some_txt, try.tolower)
63 | some_txt = some_txt[some_txt != ""]
64 | names(some_txt) = NULL
65 | return(some_txt)
66 | }
67 |
68 | # harvest tweets
69 | tweets = searchTwitter("iphone5", n=200, lang="en")
70 |
71 | tweet_txt = sapply(tweets, function(x) x$getText())
72 | tweet_clean = clean.text(tweet_txt)
73 | mcnum = length(tweet_clean)
74 | tweet_df = data.frame(text=tweet_clean, sentiment=rep("", mcnum), score=1:mcnum, stringsAsFactors=FALSE)
75 |
76 | sentiment = rep(0, mcnum)
77 | for (i in 1:mcnum)
78 | {
79 | tmp = getSentiment(tweet_clean[i], "API-KEY")
80 | tweet_df$sentiment[i] = tmp$mood
81 | tweet_df$score[i] = tmp$score
82 | }
83 |
84 | tweet_df
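85 |
86 | # average sentiment score across the sample (positive > 0, negative < 0)
87 | mean(tweet_df$score)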
--------------------------------------------------------------------------------
/spam_class_r.r:
--------------------------------------------------------------------------------
1 | #Download Data Files:
2 | #data.csv: http://thinktostart.com/data/data.csv
3 | #names.csv: http://thinktostart.com/data/names.csv
4 |
5 | #Load the two files into R:
6 | dataset <- read.csv("data.csv",header=FALSE,sep=";")
7 | names <- read.csv("names.csv",header=FALSE,sep=";")
8 |
9 | #Set the names of the dataset dataframe:
10 | names(dataset) <- sapply((1:nrow(names)),function(i) toString(names[i,1]))
11 |
12 | #make column y a factor variable for binary classification (spam or non-spam)
13 | dataset$y <- as.factor(dataset$y)
14 |
15 |
16 | #get a sample of 1000 rows
17 | sample <- dataset[sample(nrow(dataset), 1000),]
18 |
19 |
20 | #Set up the packages:
21 |
22 | #install.packages("caret")
23 |
24 | require(caret)
25 |
26 | #install.packages("kernlab")
27 |
28 | require(kernlab)
29 |
30 | #install.packages("doMC")
31 |
32 | require(doMC)
33 |
34 |
35 | #Split the data in dataTrain and dataTest
36 | trainIndex <- createDataPartition(sample$y, p = .8, list = FALSE, times = 1)
37 | dataTrain <- sample[ trainIndex,]
38 | dataTest <- sample[-trainIndex,]
39 |
40 | #set up multicore environment
41 | registerDoMC(cores=5)
42 |
43 |
44 | #Create the SVM model:
45 |
46 | ### finding optimal value of a tuning parameter
47 | sigDist <- sigest(y ~ ., data = dataTrain, frac = 1)
48 | ### creating a grid of two tuning parameters, .sigma comes from the earlier line. we are trying to find best value of .C
49 | svmTuneGrid <- data.frame(.sigma = sigDist[1], .C = 2^(-2:7))
50 |
51 | x <- train(y ~ .,
52 | data = dataTrain,
53 | method = "svmRadial",
54 | preProc = c("center", "scale"),
55 | tuneGrid = svmTuneGrid,
56 | trControl = trainControl(method = "repeatedcv", repeats = 5,
57 | classProbs = TRUE))
58 |
59 | #Evaluate the model
60 | pred <- predict(x,dataTest[,1:57])
61 |
62 | acc <- confusionMatrix(pred,dataTest$y)
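63 |
64 | # overall accuracy and detailed per-class statistics on the held-out test set
65 | acc$overall["Accuracy"]
66 | acc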
--------------------------------------------------------------------------------
/twitter_authentication.r:
--------------------------------------------------------------------------------
1 | # https://dev.twitter.com/
2 |
3 | # Install the newest version of the twitteR package from GitHub
4 | install.packages(c("devtools", "rjson", "bit64", "httr"))
5 |
6 | #RESTART R session!
7 |
8 | library(devtools)
9 | install_github("geoffjentry/twitteR") # current devtools syntax ("user/repo")
10 | library(twitteR)
11 |
12 |
13 | require(twitteR)
14 |
15 |
16 | #library(RCurl)
17 | # Set SSL certs globally
18 | #options(RCurlOptions = list(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl")))
19 |
20 | # The URLs below were used by the older ROAuth-based flow; setup_twitter_oauth() only needs the API key and secret.
21 | reqURL <- "https://api.twitter.com/oauth/request_token"
22 | accessURL <- "https://api.twitter.com/oauth/access_token"
23 | authURL <- "https://api.twitter.com/oauth/authorize"
24 | apiKey <- "yourAPIkey"
25 | apiSecret <- "yourAPIsecret"
26 |
27 |
28 |
29 | setup_twitter_oauth(apiKey, apiSecret)
30 |
31 |
32 |
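33 | # sanity check: if the OAuth handshake succeeded, a small search should return tweets
34 | searchTwitter("rstats", n = 5)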
--------------------------------------------------------------------------------