├── data └── stopwords.rda ├── inst └── images │ ├── big.png │ ├── fan.jpg │ ├── daiyu.jpg │ ├── dream.jpg │ ├── peach.jpg │ ├── peony.jpg │ ├── plum.jpg │ ├── bamboo.jpg │ └── orchid.jpg ├── NAMESPACE ├── DESCRIPTION ├── man ├── get.book.info.Rd ├── get_movie_comments.Rd ├── get_movie_info.Rd ├── get_book_info.Rd ├── get_movie_discussions.Rd ├── get_music_discussions.Rd ├── get_book_discussions.Rd ├── get_music_reviews.Rd ├── get_book_notes.Rd ├── get_music_info.Rd ├── get_book_reviews.Rd ├── get_movie_reviews.Rd ├── get.movie.info.Rd ├── get.movie.comment.Rd ├── get.music.info.Rd ├── user_movie_viz.Rd ├── get.book.review.Rd ├── user_status.Rd ├── get.movie.review.Rd ├── user_book_viz.Rd ├── user_note_status.Rd ├── user_book_status.Rd └── user_movie_status.Rd ├── R ├── get_online_tags.R ├── get.movie.info.R ├── get.book.info.R ├── get.music.info.R ├── user_status.R ├── get.movie.comment.R ├── user_note_status.R ├── get_music_discussions.R ├── utils.R ├── get_movie_comments.R ├── get_music_info.R ├── get_book_reviews.R ├── get_book_notes.R ├── get_book_discussions.R ├── get_movie_info.R ├── get_movie_discussions.R ├── get.book.review.R ├── get_music_reviews.R ├── get_movie_reviews.R ├── get.movie.review.R ├── get_book_info.R ├── user_movie_status.R ├── user_book_status.R ├── user_movie_viz.R └── user_book_viz.R └── README.md /data/stopwords.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qxde01/Rdouban/HEAD/data/stopwords.rda -------------------------------------------------------------------------------- /inst/images/big.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qxde01/Rdouban/HEAD/inst/images/big.png -------------------------------------------------------------------------------- /inst/images/fan.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/qxde01/Rdouban/HEAD/inst/images/fan.jpg -------------------------------------------------------------------------------- /inst/images/daiyu.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qxde01/Rdouban/HEAD/inst/images/daiyu.jpg -------------------------------------------------------------------------------- /inst/images/dream.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qxde01/Rdouban/HEAD/inst/images/dream.jpg -------------------------------------------------------------------------------- /inst/images/peach.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qxde01/Rdouban/HEAD/inst/images/peach.jpg -------------------------------------------------------------------------------- /inst/images/peony.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qxde01/Rdouban/HEAD/inst/images/peony.jpg -------------------------------------------------------------------------------- /inst/images/plum.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qxde01/Rdouban/HEAD/inst/images/plum.jpg -------------------------------------------------------------------------------- /inst/images/bamboo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qxde01/Rdouban/HEAD/inst/images/bamboo.jpg -------------------------------------------------------------------------------- /inst/images/orchid.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qxde01/Rdouban/HEAD/inst/images/orchid.jpg -------------------------------------------------------------------------------- 
/NAMESPACE: -------------------------------------------------------------------------------- 1 | # Default NAMESPACE created by R 2 | # Remove the previous line if you edit this file 3 | 4 | # Export all names 5 | exportPattern(".") 6 | 7 | # Import all packages listed as Imports or Depends 8 | import( 9 | RCurl, 10 | XML, 11 | RJSONIO 12 | ) 13 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: Rdouban 2 | Version: 0.2-1 3 | Date: 2013-12-13 4 | Title: Provide an interface to the douban comment data 5 | Author: qxde01@gmail.com 6 | Depends: R (>= 2.12.0), RCurl,XML,RJSONIO 7 | Description: douban 8 | Maintainer: qxde01 9 | License: GPL 10 | Encoding: UTF-8 11 | Packaged: 2013-12-13 22:49:05 UTC; sz 12 | Repository: https://github.com/qxde01/Rdouban 13 | Date/Publication: 2013-12-13 22:57:03 -------------------------------------------------------------------------------- /man/get.book.info.Rd: -------------------------------------------------------------------------------- 1 | \encoding{UTF-8} 2 | \name{get.book.info} 3 | \alias{get.book.info} 4 | %- Also NEED an '\alias' for EACH other topic documented here. 5 | \title{豆瓣图书基本信息官方API} 6 | \description{ 获取豆瓣图书基本信息,包括作者、简介、豆瓣评分等.} 7 | \usage{ 8 | get.book.info(bookid) 9 | } 10 | %- maybe also 'usage' for other objects documented here. 
11 | \arguments{ 12 | \item{bookid}{豆瓣书籍的id号,一串数字,如20326626,可通过图书的豆瓣主页查询} 13 | } 14 | 15 | \value{ 16 | \item{title}{书名} 17 | \item{author }{作者} 18 | \item{rating }{评分:最小值、评分人数、平均值、最大值} 19 | \item{summary }{内容简介} 20 | \item{tags }{一个data.frame,常用标签,两列分别是标签名称和标签频数} 21 | \item{href}{书籍主页网址} 22 | \item{image }{封面图片网址} 23 | \item{attribute }{书籍其他信息,比如出本社、价格、页数等} 24 | } 25 | \references{} 26 | \author{ 27 | <\email{qxde01@gmail.com}> 28 | } 29 | 30 | \examples{ 31 | \dontrun{x<-get.book.info(bookid='2567698')} 32 | } 33 | \keyword{book} 34 | \keyword{douban} -------------------------------------------------------------------------------- /R/get_online_tags.R: -------------------------------------------------------------------------------- 1 | get_online_tags<-function(results=1000,count=200){ 2 | pages<-ceiling(results/count) 3 | tags<-c() 4 | for(pg in 1:pages){ 5 | u=paste0("http://www.douban.com/online/tag/?start=",(pg-1)*count) 6 | cat(pg,", Getting tags from u:",u,"......\n") 7 | p=.refreshURL(u) 8 | n1<-getNodeSet(p,'//div[@class="article"]//div[@class="indent"]//span') 9 | tag<-sapply(n1,xmlValue) 10 | tag<-gsub("[\n ]|  ","",tag) 11 | #cat(tag,"\n") 12 | tag<-unlist(strsplit(tag,"\\(|\\)")) 13 | word<-tag[seq(1,length(tag),2)] 14 | freq<-tag[seq(1,length(tag),2)+1] 15 | url=sapply(getNodeSet(p,'//div[@class="article"]//div[@class="indent"]//a'), 16 | function(x) xmlGetAttr(x, "href")) 17 | #cat(length(word)," ",length(freq),"\n") 18 | tags0<-cbind(tag=word,url=url,freq=freq) 19 | tags<-rbind(tags,tags0) 20 | } 21 | row.names(tags)<-NULL 22 | return(as.data.frame(tags,stringsAsFactors=F)) 23 | } -------------------------------------------------------------------------------- /man/get_movie_comments.Rd: -------------------------------------------------------------------------------- 1 | \encoding{UTF-8} 2 | \name{get_movie_comments} 3 | \alias{get_movie_comments} 4 | \title{获取豆瓣电影的短评} 5 | \description{获取豆瓣电影的短评.} 6 | 
\usage{get_movie_comments(movieid,results=100,fresh=10,verbose=TRUE,...)} 7 | \arguments{ 8 | \item{movieid}{ 豆瓣电影的id号} 9 | \item{results}{获取评论的数量,默认100} 10 | \item{fresh}{页面异常时最大刷新次数,默认20} 11 | \item{verbose}{若\code{verbose=TRUE则显示详细过程信息,否则不显示,默认\code{TRUE}}} 12 | } 13 | 14 | \value{一个data.frame,各列为: 15 | \item{author}{文章作者昵称} 16 | \item{author_uri }{作者豆瓣URI} 17 | \item{published }{短评发表时间} 18 | \item{votes }{短评被赞为有用的次数} 19 | \item{comment}{短评内容} 20 | \item{rating}{作者对电影的评分} 21 | 22 | } 23 | \note{每页获取短评个数为20,如果results不是20的倍数,则获取的实际影评数量是\code{ceiling(results/20)*20}.} 24 | \author{ 25 | qxde01<\email{qxde01@gmail.com}> 26 | } 27 | \seealso{ 28 | \code{\link{get_movie_reviews}} 29 | } 30 | \examples{ 31 | ## http://movie.douban.com/subject/5308265/comments 32 | \dontrun{x=get_movie_comments(movieid=5308265,results=100)} 33 | } 34 | \keyword{comment} 35 | \keyword{movie} -------------------------------------------------------------------------------- /man/get_movie_info.Rd: -------------------------------------------------------------------------------- 1 | \encoding{UTF-8} 2 | \name{get_movie_info} 3 | \alias{get_movie_info} 4 | \title{获取豆瓣影视的基本信息} 5 | \description{获取豆瓣影视剧的基本信息,包括电影名称、导演、演员、豆瓣评分等信息. 6 | } 7 | \usage{ 8 | get_movie_info(movieid,...) 
9 | } 10 | \arguments{ 11 | \item{movieid}{豆瓣电影和电视剧的ID,由数字组成的字符串,可通过每部电影或电视剧的豆瓣主页查询.} 12 | } 13 | \value{一个列表(list), 包括: 14 | \item{title}{电影名称} 15 | \item{author}{导演} 16 | \item{rating}{用户评分信息,\code{average}为总分,\code{votes}指参与的评分人数,\code{starsx}指对应评分用户数所占的比例} 17 | \item{summary}{电影或电视剧简介} 18 | \item{tags}{\code{tag_label}标签名称及对应的使用频率\code{tag_freq}} 19 | \item{href}{电影主页URL} 20 | \item{image}{电影海报图片URL} 21 | \item{reviews_total}{长影评数量} 22 | \item{comments_total}{短影评数量} 23 | \item{audience}{观众信息,\code{doings}指正在观看的用户数量,\code{collections}指已观看的用户数量,\code{wishes}指想看的用户数量} 24 | \item{attribute}{一个字符串,豆瓣影视剧的基本信息,包括电影名称、导演、演员} 25 | } 26 | \author{<\email{qxde01@gmail.com}> 27 | } 28 | 29 | \examples{ 30 | ## http://movie.douban.com/subject/5308265/ 31 | \dontrun{m<-get_movie_info(movieid='5308265')} 32 | } 33 | \keyword{movie} 34 | -------------------------------------------------------------------------------- /man/get_book_info.Rd: -------------------------------------------------------------------------------- 1 | \encoding{UTF-8} 2 | \name{get_book_info} 3 | \alias{get_book_info} 4 | \title{获取豆瓣图书基本信息} 5 | \description{ 获取豆瓣图书基本信息,包括作者、简介、豆瓣评分等.} 6 | \usage{get_book_info(bookid,...) 
7 | } 8 | \arguments{ 9 | \item{bookid}{豆瓣书籍的id号,一串数字,如20326626,可通过图书的豆瓣主页查询}} 10 | \value{一个列表list,包括: 11 | \item{title}{书名} 12 | \item{author}{作者} 13 | \item{rating}{长度为7的向量,用户评分信息,\code{avergae}为总分,\code{votes}指参与的评分人数,\code{starsx}指对应评分用户的所占的比例} 14 | \item{href}{书籍主页URL} 15 | \item{image}{书籍封面图片URL} 16 | \item{summary}{书籍内容简介} 17 | \item{author_intro}{作者简介} 18 | \item{tags}{\code{tag_label}标签名称及对应的使用频率\code{tag_freq}} 19 | \item{comments_total}{评论数量} 20 | \item{notes_total}{为笔记数量} 21 | \item{readers}{\code{doings}指正在阅读的用户数量,\code{collections}指已读的用户数量,\code{wishes}指想读的用户数量} 22 | \item{attribute}{书籍的基本信息,如作者、出版社、ISBN等} 23 | } 24 | 25 | \author{ 26 | <\email{qxde01@gmail.com}> 27 | } 28 | \examples{ 29 | ## http://book.douban.com/subject/1291204/ 30 | \dontrun{book<-get_book_info(bookid='1291204')} 31 | } 32 | \keyword{book} 33 | \keyword{douban} 34 | -------------------------------------------------------------------------------- /man/get_movie_discussions.Rd: -------------------------------------------------------------------------------- 1 | \encoding{UTF-8} 2 | \name{get_movie_discussions} 3 | \alias{get_movie_discussions} 4 | \title{获取豆瓣电影的话题讨论内容 5 | } 6 | \description{获取某个豆瓣电影的话题讨论内容. 7 | } 8 | \usage{ 9 | get_movie_discussions(movieid,results = 100, fresh = 10,count=20, verbose = TRUE,...) 
10 | } 11 | \arguments{ 12 | \item{movieid}{豆瓣电影ID} 13 | \item{results}{获取话题讨论的数量,默认100} 14 | \item{fresh}{页面异常时最大刷新次数,默认10} 15 | \item{count}{每页discussion_uri数量,默认20,最大20} 16 | \item{verbose}{若\code{verbose=TRUE则显示详细过程信息,否则不显示,默认\code{TRUE}}} 17 | } 18 | \value{一个data.frame,各列为: 19 | \item{discussion_uri }{话题主页URL} 20 | \item{title }{话题标题题} 21 | \item{published }{话题发表的时间} 22 | \item{author}{作者昵称} 23 | \item{author_uri }{作者豆瓣URI} 24 | \item{discussion }{话题详细内容} 25 | \item{useful }{被赞为有用的次数} 26 | \item{unuseful}{被评为无用的次数} 27 | } 28 | 29 | \author{ 30 | qxde01<\email{qxde01@gmail.com}> 31 | } 32 | 33 | \examples{ 34 | ## http://movie.douban.com/subject/5308265/discussion/ 35 | \dontrun{get_movie_discussions(movieid=5308265,n=100)} 36 | } 37 | \keyword{discussion} 38 | -------------------------------------------------------------------------------- /man/get_music_discussions.Rd: -------------------------------------------------------------------------------- 1 | \encoding{UTF-8} 2 | \name{get_music_discussions} 3 | \alias{get_music_discussions} 4 | \title{获取豆瓣音乐专辑的话题讨论内容 5 | } 6 | \description{获取豆瓣音乐专辑的话题讨论内容. 7 | } 8 | \usage{ 9 | get_music_discussions(musicid,results = 100, fresh = 10,count=20, verbose = TRUE,...) 
10 | } 11 | \arguments{ 12 | \item{musicid}{豆瓣音乐专辑的ID} 13 | \item{results}{获取话题讨论的数量,默认100} 14 | \item{fresh}{页面异常时最大刷新次数,默认10} 15 | \item{count}{每页discussion_uri数量,默认20,最大20} 16 | \item{verbose}{若\code{verbose=TRUE则显示详细过程信息,否则不显示,默认\code{TRUE}}} 17 | } 18 | \value{一个data.frame,各列为: 19 | \item{discussion_uri }{话题主页URL} 20 | \item{title }{话题标题题} 21 | \item{published }{话题发表的时间} 22 | \item{author}{作者昵称} 23 | \item{author_uri }{作者豆瓣URI} 24 | \item{discussion }{话题详细内容} 25 | \item{useful }{被赞为有用的次数} 26 | \item{unuseful}{被评为无用的次数} 27 | } 28 | 29 | \author{ 30 | qxde01<\email{qxde01@gmail.com}> 31 | } 32 | 33 | \examples{ 34 | ## http://music.douban.com/subject/3843530/discussion/ 35 | \dontrun{a<-get_music_discussions(musicid=3843530,results = 100)} 36 | } 37 | \keyword{discussion} 38 | -------------------------------------------------------------------------------- /R/get.movie.info.R: -------------------------------------------------------------------------------- 1 | get.movie.info<-function(movieid){ 2 | u=paste0("https://api.douban.com/v2/movie/",movieid) 3 | #p=getURL(u,ssl.verifypeer = FALSE) 4 | p<-.refreshURL(u,ssl.verifypeer = FALSE) 5 | reslist <- fromJSON(p) 6 | title<-reslist[["title"]] 7 | author<-unlist(reslist[["author"]]);names(author)<-NULL 8 | rating<-unlist(reslist[["rating"]]) 9 | tags<-reslist[["tags"]] 10 | summary<-reslist[["summary"]] 11 | tags<-data.frame(tag_label=sapply(tags,function(x)x[["name"]]), 12 | tag_freq=sapply(tags,function(x)x[["count"]]), 13 | stringsAsFactors=F) 14 | image<-reslist[['image']] 15 | href<-reslist[["alt"]] 16 | reslist$title<-NULL;reslist$author<-NULL; 17 | reslist$rating<-NULL;reslist$tags<-NULL; 18 | reslist$summary<-NULL;reslist$alt<-NULL 19 | reslist$image<-NULL 20 | attribute=reslist$attrs 21 | 22 | list(title=title, 23 | author=author, 24 | rating=as.double(rating), 25 | summary=summary, 26 | tags=tags, 27 | href=href, 28 | image=image, 29 | attribute= attribute) 30 | } 
-------------------------------------------------------------------------------- /man/get_book_discussions.Rd: -------------------------------------------------------------------------------- 1 | \encoding{UTF-8} 2 | \name{get_book_discussions} 3 | \alias{get_book_discussions} 4 | \title{获取豆瓣图书的话题讨论内容 5 | } 6 | \description{获取豆瓣图书话题讨论内容. 7 | } 8 | \usage{ 9 | get_book_discussions(bookid,results=100,fresh=10,count=20,verbose=TRUE) 10 | } 11 | \arguments{ 12 | \item{bookid}{ 豆瓣书籍的id号,一串数字,如2567698} 13 | \item{results}{获取话题讨论的数量,默认100} 14 | \item{fresh}{页面异常时最大刷新次数,默认10} 15 | \item{count}{每页discussion_uri数量,默认20,最大20} 16 | \item{verbose}{若\code{verbose=TRUE则显示详细过程信息,否则不显示,默认\code{TRUE}}} 17 | } 18 | \value{一个data.frame,各列为: 19 | \item{discussion_uri }{话题主页URL} 20 | \item{title }{话题标题题} 21 | \item{published }{话题发表的时间} 22 | \item{author}{作者昵称} 23 | \item{author_uri }{作者豆瓣URI} 24 | \item{discussion }{话题详细内容} 25 | \item{useful }{被赞为有用的次数} 26 | \item{unuseful}{被评为无用的次数} 27 | } 28 | \note{} 29 | \author{ 30 | qxde01<\email{qxde01@gmail.com}> 31 | } 32 | \seealso{ 33 | 34 | } 35 | \examples{ 36 | ## http://book.douban.com/subject/1291204/discussion/ 37 | \dontrun{b<-get_book_discussions(bookid=1291204,results=100)} 38 | } 39 | \keyword{discussion} 40 | -------------------------------------------------------------------------------- /man/get_music_reviews.Rd: -------------------------------------------------------------------------------- 1 | \encoding{UTF-8} 2 | \name{get_music_reviews} 3 | \alias{get_music_reviews} 4 | \title{获取豆瓣音乐评论信息.} 5 | \description{获取豆瓣音乐评论信息.} 6 | \usage{ 7 | get_music_reviews(musicid,results = 100, fresh = 10,count=25, verbose = TRUE,...) 8 | } 9 | %- maybe also 'usage' for other objects documented here. 
10 | \arguments{ 11 | \item{musicid}{豆瓣音乐ID} 12 | \item{results}{获取评论的数量,默认100} 13 | \item{fresh}{页面异常时最大刷新次数,默认10} 14 | \item{count}{每页review_uri数量,默认25,最大25} 15 | \item{verbose}{若\code{verbose=TRUE则显示详细过程信息,否则不显示,默认\code{TRUE}}} 16 | } 17 | \value{一个data.frame,各列为: 18 | \item{review_uri }{评论主页URL} 19 | \item{title }{评论文章名称} 20 | \item{published }{文章发表时间} 21 | \item{author}{文章作者昵称} 22 | \item{author_uri }{作者豆瓣URI} 23 | \item{review }{评论文章详细内容} 24 | \item{rating }{作者对电影的评分} 25 | \item{useful}{被评为useful的次数} 26 | \item{unuseful}{被评为unuseful的次数} 27 | } 28 | \author{ 29 | qxde01<\email{qxde01@gmail.com}> 30 | } 31 | 32 | \examples{ 33 | ## http://music.douban.com/subject/3843530/reviews 34 | \dontrun{x<-get_music_reviews(musicid=3843530,results = 100)} 35 | } 36 | \keyword{music} 37 | \keyword{review} 38 | -------------------------------------------------------------------------------- /R/get.book.info.R: -------------------------------------------------------------------------------- 1 | ##'@douban api of book information 2 | ##'@bookid 3 | ##'@version 4 | get.book.info <- function(bookid) { 5 | u = paste0("https://api.douban.com/v2/book/", bookid) 6 | # p=getURL(u,ssl.verifypeer = FALSE) 7 | p <- .refreshURL(u, ssl.verifypeer = FALSE) 8 | reslist <- fromJSON(p) 9 | title <- reslist[["title"]] 10 | author <- reslist[["author"]] 11 | rating <- unlist(reslist[["rating"]]) 12 | tags <- reslist[["tags"]] 13 | summary <- reslist[["summary"]] 14 | tags <- data.frame(tag_label = sapply(tags, function(x) x[["name"]]), 15 | tag_freq = sapply(tags, function(x) x[["count"]]), 16 | stringsAsFactors = F) 17 | image <- reslist[["images"]] 18 | href <- reslist[["alt"]] 19 | reslist$title <- NULL 20 | reslist$author <- NULL 21 | reslist$rating <- NULL 22 | reslist$tags <- NULL 23 | reslist$summary <- NULL 24 | reslist$alt <- NULL 25 | reslist$images <- NULL 26 | attribute = reslist 27 | 28 | list(title = title, author = author, rating = as.double(rating), summary = summary, 29 | tags 
= tags, href = href, image = image, attribute = attribute) 30 | } 31 | 32 | -------------------------------------------------------------------------------- /man/get_book_notes.Rd: -------------------------------------------------------------------------------- 1 | \encoding{UTF-8} 2 | \name{get_book_notes} 3 | \alias{get_book_notes} 4 | \title{获取豆瓣图书的读书笔记} 5 | \description{获取豆瓣图书的读书笔记.} 6 | \usage{get_book_notes(bookid,results=100,fresh=10,count=10,verbose=TRUE,...)} 7 | \arguments{ 8 | \item{bookid}{ 豆瓣书籍的id号,一串数字,如2567698} 9 | \item{results}{获取评论的数量,默认100} 10 | \item{fresh}{页面异常时最大刷新次数,默认10} 11 | \item{count}{每页note_uri数量,默认10,最大10} 12 | \item{verbose}{若\code{verbose=TRUE则显示详细过程信息,否则不显示,默认\code{TRUE}}} 13 | } 14 | \value{一个data.frame,各列为: 15 | \item{note_uri }{笔记主页URL} 16 | \item{title }{笔记标题} 17 | \item{published }{笔记发表时间} 18 | \item{author}{作者昵称} 19 | \item{author_uri }{作者豆瓣URI} 20 | \item{note }{笔记详细内容} 21 | \item{rating }{作者对书籍的评分} 22 | \item{readers}{被阅读的次数} 23 | \item{collectors}{被收藏的次数} 24 | 25 | } 26 | \note{若results不是count的倍数,则获取到的数量是\code{ceiling(results/count)*count}.} 27 | \author{ 28 | <\email{qxde01@gmail.com}> 29 | } 30 | 31 | \seealso{\code{\link{get_book_reviews}} 32 | } 33 | \examples{ 34 | ## http://book.douban.com/subject/1291204/annotation 35 | \dontrun{out<-get_book_notes(bookid='1291204',results=50)} 36 | } 37 | \keyword{note} 38 | \keyword{douban} 39 | -------------------------------------------------------------------------------- /man/get_music_info.Rd: -------------------------------------------------------------------------------- 1 | \encoding{UTF-8} 2 | \name{get_music_info} 3 | \alias{get_music_info} 4 | \title{获取豆瓣音乐的专辑信息 5 | } 6 | \description{ 7 | 获取豆瓣音乐的专辑信息,包括歌手、专辑名称等。 8 | } 9 | \usage{get_music_info(musicid) 10 | } 11 | %- maybe also 'usage' for other objects documented here. 
12 | \arguments{ 13 | \item{musicid}{豆瓣音乐专辑ID号 } 14 | } 15 | \value{ 16 | \item{title}{音乐专辑名称} 17 | \item{author }{作者} 18 | \item{rating}{评分信息,\code{average}为总评分,\code{votes}为评分听众总数,\code{starx}为对应的评分听众所占总评分的比例 } 19 | \item{summary }{内容简介} 20 | \item{tags }{一个data.frame,常用标签,两列分别是标签名称和标签频数} 21 | \item{songs}{专辑所包括的歌曲名称} 22 | \item{href}{音乐专辑主页网址} 23 | \item{image }{封面图片网址} 24 | \item{listeners}{听众信息, 25 | \code{doings}正在听的听众数量,\code{collections}已听过的听众数量, 26 | \code{wishes}指想听的用户数量} 27 | \item{comments_total}{发表评论的听众数} 28 | \item{attribute }{专辑的其他信息,比如出版公司、出版时间等} 29 | } 30 | \references{ 31 | %% ~put references to the literature/web site here ~ 32 | } 33 | \author{ 34 | <\email{qxde01@gmail.com}> 35 | } 36 | \examples{ 37 | ## http://music.douban.com/subject/3843530/ 38 | \dontrun{music<-get_music_info(musicid='3843530')} 39 | } 40 | \keyword{music} 41 | \keyword{douban} 42 | -------------------------------------------------------------------------------- /man/get_book_reviews.Rd: -------------------------------------------------------------------------------- 1 | \encoding{UTF-8} 2 | \name{get_book_reviews} 3 | \alias{get_book_reviews} 4 | \title{获取豆瓣图书的评论文章} 5 | \description{获取豆瓣图书的评论文章.} 6 | \usage{get_book_reviews(bookid,results=100,fresh=10,count=10,verbose=TRUE,...) 
7 | } 8 | \arguments{ 9 | \item{bookid}{ 豆瓣书籍的id号,一串数字,如2567698} 10 | \item{results}{获取评论的数量,默认100} 11 | \item{fresh}{页面异常时最大刷新次数,默认10} 12 | \item{count}{每页review_uri数量,默认25,最大25} 13 | \item{verbose}{若\code{verbose=TRUE则显示详细过程信息,否则不显示,默认\code{TRUE}}} 14 | } 15 | \value{一个data.frame,各列为: 16 | \item{review_uri }{评论主页URL} 17 | \item{title }{评论文章名称} 18 | \item{published }{文章发表时间} 19 | \item{author}{文章作者昵称} 20 | \item{author_uri }{作者豆瓣URI} 21 | \item{review }{评论文章详细内容} 22 | \item{rating }{作者对书籍的评分} 23 | \item{useful}{被评为useful的次数} 24 | \item{unuseful}{被评为unuseful的次数} 25 | } 26 | 27 | \note{若results不是count的倍数,则获取到的数量是\code{ceiling(results/count)*count}.} 28 | \author{qxde01<\email{qxde01@gmail.com}> 29 | } 30 | \seealso{ 31 | \code{\link{get_book_notes}} 32 | } 33 | \examples{ 34 | ## http://book.douban.com/subject/2567698/reviews 35 | \dontrun{out<-get_book_reviews(bookid=2567698,results=500)} 36 | } 37 | \keyword{comment} 38 | \keyword{review} 39 | \keyword{douban} 40 | -------------------------------------------------------------------------------- /man/get_movie_reviews.Rd: -------------------------------------------------------------------------------- 1 | \encoding{UTF-8} 2 | \name{get_movie_reviews} 3 | \alias{get_movie_reviews} 4 | \title{获取豆瓣电影的长篇影评 5 | } 6 | \description{获取豆瓣电影和电视剧的长篇影评. 7 | } 8 | \usage{get_movie_reviews(movieid,results=100,fresh=10,count=10,verbose=TRUE,...) 
9 | } 10 | \arguments{ 11 | \item{movieid}{豆瓣电影的id号} 12 | \item{results}{获取评论的数量,默认100} 13 | \item{fresh}{页面异常时最大刷新次数,默认10} 14 | \item{count}{每页review_uri数量,默认10,最大20} 15 | \item{verbose}{若\code{verbose=TRUE则显示详细过程信息,否则不显示,默认\code{TRUE}}} 16 | 17 | } 18 | \value{一个data.frame,各列为: 19 | \item{review_uri }{影评主页URL} 20 | \item{title }{评论文章名称} 21 | \item{published }{文章发表时间} 22 | \item{author}{文章作者昵称} 23 | \item{author_uri }{作者豆瓣URI} 24 | \item{review }{评论文章详细内容} 25 | \item{rating }{作者对电影的评分} 26 | \item{useful}{被评为useful的次数} 27 | \item{unuseful}{被评为unuseful的次数} 28 | } 29 | \author{ 30 | qxde01<\email{qxde01@gmail.com}> 31 | } 32 | \note{一个值得注意的情况是不同页之间可能有相同的review_uri,最后获得评论数目可能和results的值不同。} 33 | \seealso{ 34 | \code{\link{get_movie_comments}} 35 | } 36 | \examples{ 37 | ## http://movie.douban.com/subject/5308265/reviews 38 | \dontrun{x<-get_movie_reviews(movieid=5308265,results=50)} 39 | } 40 | \keyword{review} 41 | \keyword{comment} 42 | -------------------------------------------------------------------------------- /man/get.movie.info.Rd: -------------------------------------------------------------------------------- 1 | \encoding{UTF-8} 2 | \name{get.movie.info} 3 | \alias{get.movie.info} 4 | %- Also NEED an '\alias' for EACH other topic documented here. 5 | \title{ 6 | 获取豆瓣电影的基本信息 7 | } 8 | \description{ 9 | 获取豆瓣电影的基本信息,API V2接口。 10 | } 11 | \usage{ 12 | get.movie.info(movieid) 13 | } 14 | %- maybe also 'usage' for other objects documented here. 
15 | \arguments{ 16 | \item{movieid}{豆瓣电影的ID,由数字组成,可通过每部电影的豆瓣主页查询.} 17 | } 18 | 19 | \value{ 20 | \item{title}{电影名称} 21 | \item{author }{作者} 22 | \item{rating }{评分:最小值、评分人数、平均值、最大值} 23 | \item{summary }{内容简介} 24 | \item{tags }{一个data.frame,常用标签,两列分别是标签名称和标签频数} 25 | \item{href}{影视主页网址} 26 | \item{image }{封面图片网址} 27 | \item{attribute }{一个list,电影其他信息,比如演员、上映时间等} 28 | } 29 | \references{ 30 | %% ~put references to the literature/web site here ~ 31 | } 32 | \author{ 33 | <\email{qxde01@gmail.com}> 34 | } 35 | \note{ 36 | %% ~~further notes~~ 37 | } 38 | 39 | %% ~Make other sections like Warning with \section{Warning }{....} ~ 40 | 41 | \seealso{ 42 | %% ~~objects to See Also as \code{\link{help}}, ~~~ 43 | } 44 | \examples{ 45 | ## http://movie.douban.com/subject/5308265/ 46 | \dontrun{get.movie.info(movieid='5308265')} 47 | } 48 | \keyword{movie} 49 | 50 | -------------------------------------------------------------------------------- /man/get.movie.comment.Rd: -------------------------------------------------------------------------------- 1 | \encoding{UTF-8} 2 | \name{get.movie.comment} 3 | \alias{get.movie.comment} 4 | %- Also NEED an '\alias' for EACH other topic documented here. 5 | \title{ 6 | 通过移动网页获取豆瓣电影的短评. 7 | } 8 | \description{ 9 | 通过移动网页m.douban.com获取豆瓣电影的短评。 10 | } 11 | \usage{ 12 | get.movie.comment(movieid,results=100,fresh=20,verbose=TRUE) 13 | } 14 | %- maybe also 'usage' for other objects documented here. 
15 | \arguments{ 16 | \item{movieid}{ 豆瓣电影的id号} 17 | \item{results}{获取评论的数量,默认100} 18 | \item{fresh}{页面异常时最大刷新次数,默认20} 19 | \item{verbose}{若\code{verbose=TRUE则显示详细过程信息,否则不显示,默认\code{TRUE}}} 20 | } 21 | 22 | \details{ 23 | %% ~~ If necessary, more details than the description above ~~ 24 | } 25 | \value{ 26 | \item{author}{作者昵称} 27 | \item{author_id }{作者豆瓣ID} 28 | \item{comment }{短评内容} 29 | \item{rating }{作者对电影的评分} 30 | } 31 | \references{ 32 | %% ~put references to the literature/web site here ~ 33 | } 34 | \author{ 35 | qxde01<\email{qxde01@gmail.com}> 36 | } 37 | 38 | \examples{ 39 | 40 | ## http://m.douban.com/movie/subject/11627047/comments 41 | \dontrun{cmt<-get.movie.comment(movieid=11627047,fresh=20,results=100,verbose=TRUE)} 42 | } 43 | 44 | \keyword{movie} 45 | \keyword{comment} 46 | \keyword{douban} 47 | 48 | -------------------------------------------------------------------------------- /man/get.music.info.Rd: -------------------------------------------------------------------------------- 1 | \encoding{UTF-8} 2 | \name{get.music.info} 3 | \alias{get.music.info} 4 | %- Also NEED an '\alias' for EACH other topic documented here. 5 | \title{ 6 | 获取豆瓣音乐的专辑信息 7 | } 8 | \description{ 9 | 获取豆瓣音乐的专辑信息,API V2接口。 10 | } 11 | \usage{ 12 | get.music.info(musicid) 13 | } 14 | %- maybe also 'usage' for other objects documented here. 
15 | \arguments{ 16 | \item{musicid}{豆瓣音乐专辑ID号 } 17 | } 18 | 19 | \value{ 20 | \item{title}{音乐专辑名称} 21 | \item{author }{作者} 22 | \item{rating }{评分:最小值、评分人数、平均值、最大值} 23 | \item{summary }{内容简介} 24 | \item{tags }{一个data.frame,常用标签,两列分别是标签名称和标签频数} 25 | \item{songs}{专辑所包括的歌曲名称} 26 | \item{href}{音乐专辑主页网址} 27 | \item{image }{封面图片网址} 28 | \item{attribute }{一个list,专辑的其他信息,比如出版公司、出版时间等} 29 | } 30 | \references{ 31 | %% ~put references to the literature/web site here ~ 32 | } 33 | \author{ 34 | <\email{qxde01@gmail.com}> 35 | } 36 | \note{ 37 | %% ~~further notes~~ 38 | } 39 | 40 | %% ~Make other sections like Warning with \section{Warning }{....} ~ 41 | 42 | \seealso{ 43 | %% ~~objects to See Also as \code{\link{help}}, ~~~ 44 | } 45 | 46 | \examples{ 47 | ## http://music.douban.com/subject/3843530/ 48 | \dontrun{get.music.info(musicid='3843530')} 49 | } 50 | \keyword{music} 51 | \keyword{douban} 52 | -------------------------------------------------------------------------------- /man/user_movie_viz.Rd: -------------------------------------------------------------------------------- 1 | \encoding{UTF-8} 2 | \name{user_movie_viz} 3 | \alias{user_movie_viz} 4 | %- Also NEED an '\alias' for EACH other topic documented here. 5 | \title{ 6 | 豆瓣用户观影统计及可视化函数 7 | } 8 | \description{ 9 | 豆瓣用户观影统计及可视化函数,观影综合统计、标签云图、海报拼图等。 10 | } 11 | \usage{ 12 | user_movie_viz(x,YEAR="2013",stopwords=stopwords) 13 | } 14 | %- maybe also 'usage' for other objects documented here. 
15 | \arguments{ 16 | \item{x}{有函数\code{user_book_status}产生的数据} 17 | \item{YEAR}{年份,比如"2013",如果为NULL,则统计所有数据} 18 | \item{stopwords}{\code{Package:Rwordseg}分词时使用的中文停止词} 19 | } 20 | \details{ 21 | %% ~~ If necessary, more details than the description above ~~ 22 | } 23 | \value{ 24 | 产生以下图形: 25 | \item{1.}{观影综合统计} 26 | \item{2.}{每月统计(部,时长)} 27 | \item{3.}{根据电影标签归类} 28 | \item{4.}{电影标签云图} 29 | \item{5.}{电影名称与标签之间的关系图} 30 | \item{6.}{短评云图} 31 | \item{7.}{海报拼图} 32 | \item{8.}{导演和演员合作关系图} 33 | 34 | } 35 | \references{ 36 | %% ~put references to the literature/web site here ~ 37 | } 38 | \author{ 39 | qxde01<\email{qxde01@gmail.com}> 40 | } 41 | 42 | \examples{ 43 | ## http://movie.douban.com/people/qxde01 44 | \dontrun{ 45 | qxde<-user_movie_status(userid="qxde01") 46 | data(stopwords) 47 | user_movie_viz(x=qxde,YEAR=NULL,stopwords=stopwords) 48 | } 49 | } 50 | 51 | \keyword{ douban } 52 | \keyword{ movie } -------------------------------------------------------------------------------- /man/get.book.review.Rd: -------------------------------------------------------------------------------- 1 | \encoding{UTF-8} 2 | \name{get.book.review} 3 | \alias{get.book.review} 4 | %- Also NEED an '\alias' for EACH other topic documented here. 5 | \title{ 6 | 获取豆瓣图书的评论文章. 7 | } 8 | \description{ 9 | 获取豆瓣图书的评论文章,部分使用了官方API V1. 10 | } 11 | \usage{ 12 | get.book.review(bookid,results=100,verbose=TRUE) 13 | } 14 | %- maybe also 'usage' for other objects documented here. 
15 | \arguments{ 16 | \item{bookid}{ 豆瓣书籍的id号,一串数字,如20326626} 17 | \item{results}{获取评论的数量,默认results=100} 18 | \item{verbose}{若\code{verbose=TRUE则显示详细过程信息,否则不显示,默认\code{TRUE}}} 19 | } 20 | 21 | \details{ 22 | %% ~~ If necessary, more details than the description above ~~ 23 | } 24 | \value{一个data.frame,各列为: 25 | \item{title }{评论文章名称} 26 | \item{review_id }{评论文章的豆瓣ID} 27 | \item{review }{评论文章详细内容} 28 | \item{author}{文章作者昵称} 29 | \item{author_id }{作者豆瓣ID} 30 | \item{published }{文章发表时间} 31 | \item{ updated}{文章最后一次修改时间} 32 | \item{comments }{文章被评论的数量} 33 | \item{ useless}{文章被赞的次数} 34 | \item{votes }{文章被推荐次数} 35 | \item{rating }{作者对书籍的评分} 36 | } 37 | 38 | \author{ 39 | qxde01<\email{qxde01@gmail.com}> 40 | } 41 | 42 | \examples{ 43 | 44 | ## http://book.douban.com/subject/2567698/reviews 45 | \dontrun{x<-get.book.review(bookid=2567698,results=55)} 46 | } 47 | 48 | \keyword{comment} 49 | \keyword{review} 50 | \keyword{douban} 51 | -------------------------------------------------------------------------------- /man/user_status.Rd: -------------------------------------------------------------------------------- 1 | \encoding{UTF-8} 2 | \name{user_status} 3 | \alias{user_status} 4 | %- Also NEED an '\alias' for EACH other topic documented here. 5 | \title{ 6 | 获取豆瓣用户的说说 7 | } 8 | \description{ 9 | 获取豆瓣用户的说说,最多能获取最近的200条 10 | } 11 | \usage{ 12 | user_status(usrid,results=200) 13 | } 14 | %- maybe also 'usage' for other objects documented here. 
15 | \arguments{ 16 | \item{usrid}{豆瓣用户ID} 17 | \item{results}{获取用户说说的数量,最多可获取最近200条} 18 | } 19 | \details{ 20 | %% ~~ If necessary, more details than the description above ~~ 21 | } 22 | \value{ 23 | 24 | 一个数据框(data.frame),各列为: 25 | \item{data_sid }{数据ID} 26 | \item{data_object_kind }{说说类别} 27 | \item{data_target_type}{说说来源类别} 28 | \item{txt }{说说、分享内容} 29 | \item{created_at}{说说发表时间} 30 | \item{saying }{说说内容} 31 | \item{reply }{回应的次数} 32 | } 33 | \references{ 34 | %% ~put references to the literature/web site here ~ 35 | } 36 | \author{ 37 | qxde01<\email{qxde01@gmail.com}> 38 | } 39 | \note{ 40 | %% ~~further notes~~ 41 | } 42 | 43 | %% ~Make other sections like Warning with \section{Warning }{....} ~ 44 | 45 | \seealso{ 46 | %% ~~objects to See Also as \code{\link{help}}, ~~~ 47 | } 48 | 49 | \examples{ 50 | ## http://www.douban.com/people/qxde01 51 | \dontrun{qxde<-user_status(userid='qxde01',results=100)} 52 | } 53 | 54 | \keyword{ douban } 55 | 56 | 57 | -------------------------------------------------------------------------------- /R/get.music.info.R: -------------------------------------------------------------------------------- 1 | get.music.info<-function(musicid){ 2 | u=paste0("https://api.douban.com/v2/music/",musicid) 3 | #p=getURL(u,ssl.verifypeer = FALSE) 4 | p<-.refreshURL(u,ssl.verifypeer = FALSE) 5 | reslist <- fromJSON(p) 6 | title<-reslist[["title"]] 7 | author<-unlist(reslist[["author"]]);names(author)<-NULL 8 | rating<-unlist(reslist[["rating"]]) 9 | summary<-reslist[["summary"]] 10 | tags<-reslist[["tags"]] 11 | 12 | tags<-data.frame(tag_label=sapply(tags,function(x)x[["name"]]), 13 | tag_freq=sapply(tags,function(x)x[["count"]]), 14 | stringsAsFactors=F) 15 | image<-reslist[['image']] 16 | href<-reslist[["alt"]] 17 | reslist$title<-NULL;reslist$author<-NULL; 18 | reslist$rating<-NULL;reslist$tags<-NULL; 19 | reslist$summary<-NULL;reslist$alt<-NULL 20 | reslist$image<-NULL 21 | attribute=reslist$attrs 22 | songs<-t(sapply(attribute$songs, 
23 | function(x)c(x[["index"]],x[["title"]],x[["name"]]))) 24 | attribute$songs<-NULL 25 | colnames(songs)<-c("index","title","name") 26 | list(title=title, 27 | author=author, 28 | rating=as.double(rating), 29 | summary=summary, 30 | tags=tags, 31 | songs=as.data.frame(songs,stringsAsFactors=F), 32 | href=href, 33 | image=image, 34 | attribute= attribute) 35 | } 36 | -------------------------------------------------------------------------------- /man/get.movie.review.Rd: -------------------------------------------------------------------------------- 1 | \encoding{UTF-8} 2 | \name{get.movie.review} 3 | \alias{get.movie.review} 4 | %- Also NEED an '\alias' for EACH other topic documented here. 5 | \title{ 6 | 通过移动网页获取豆瓣电影的长篇评论文章. 7 | } 8 | \description{ 9 | 通过移动网页m.douban.com获取豆瓣电影的长篇评论文章. 10 | } 11 | \usage{ 12 | get.movie.review(movieid,results=100,fresh=20,verbose=TRUE) 13 | } 14 | 15 | \arguments{ 16 | \item{movieid}{ 豆瓣电影的id号} 17 | \item{results}{获取评论的数量,默认100} 18 | \item{fresh}{页面异常时最大刷新次数,默认20} 19 | \item{verbose}{若\code{verbose=TRUE则显示详细过程信息,否则不显示,默认\code{TRUE}}} 20 | } 21 | \value{一个data.frame,各列为: 22 | \item{review_id }{评论文章的豆瓣ID} 23 | \item{title }{评论文章名称} 24 | \item{review }{评论文章详细内容} 25 | \item{author}{文章作者昵称} 26 | \item{author_uri }{作者豆瓣URI} 27 | \item{published }{文章发表时间} 28 | \item{votes }{文章被推荐次数} 29 | \item{rating }{作者对电影的评分} 30 | } 31 | \references{ 32 | %% ~put references to the literature/web site here ~ 33 | } 34 | \author{ 35 | qxde01<\email{qxde01@gmail.com}> 36 | } 37 | \note{ 38 | 连续获取会被豆瓣阻止,出现 Forbidden 错误. 
39 | } 40 | 41 | \seealso{ 42 | %% ~~objects to See Also as \code{\link{help}}, ~~~ 43 | } 44 | \examples{ 45 | 46 | ## http://m.douban.com/movie/subject/11627047/reviews 47 | \dontrun{longmen<-get.movie.review(movieid=11627047,fresh=20,results=300)} 48 | } 49 | 50 | \keyword{movie} 51 | \keyword{review} 52 | \keyword{douban} 53 | -------------------------------------------------------------------------------- /man/user_book_viz.Rd: -------------------------------------------------------------------------------- 1 | \encoding{UTF-8} 2 | \name{user_book_viz} 3 | \alias{user_book_viz} 4 | %- Also NEED an '\alias' for EACH other topic documented here. 5 | \title{ 6 | 豆瓣用户阅读统计及可视化函数 7 | } 8 | \description{ 9 | 豆瓣用户阅读统计的可视化函数,统计阅读量、标签云图、封面拼图等。 10 | } 11 | \usage{ 12 | user_book_viz(x, YEAR = "2013", stopwords = stopwords, back = FALSE) 13 | } 14 | %- maybe also 'usage' for other objects documented here. 15 | \arguments{ 16 | \item{x}{由函数\code{user_book_status}产生的数据} 17 | \item{YEAR}{年份,比如"2013",如果为NULL,则统计所有数据} 18 | \item{stopwords}{\code{Package:Rwordseg}分词时使用的中文停止词} 19 | \item{back}{如果为TRUE,则为图片添加背景图} 20 | } 21 | \details{ 22 | %% ~~ If necessary, more details than the description above ~~ 23 | } 24 | \value{ 25 | 产生以下图形: 26 | \item{1.}{阅读综合统计} 27 | \item{2.}{每月统计(本,页数)} 28 | \item{3.}{根据书籍标签归类} 29 | \item{4.}{读书标签云图} 30 | \item{5.}{短评云图} 31 | \item{6.}{书籍名称与标签之间的关系图} 32 | \item{7.}{封面拼图} 33 | 34 | } 35 | \references{ 36 | %% ~put references to the literature/web site here ~ 37 | } 38 | 39 | \author{ 40 | qxde01<\email{qxde01@gmail.com}> 41 | } 42 | 43 | \examples{ 44 | ## http://book.douban.com/people/qxde01 45 | \dontrun{ 46 | qxde<-user_book_status(userid="qxde01") 47 | data(stopwords) 48 | user_book_viz(x=qxde,YEAR="2013",stopwords=stopwords,back=TRUE) 49 | } 50 | } 51 | 52 | \keyword{ douban } 53 | \keyword{ book } 54 | -------------------------------------------------------------------------------- /man/user_note_status.Rd:
-------------------------------------------------------------------------------- 1 | \encoding{UTF-8} 2 | \name{user_note_status} 3 | \alias{user_note_status} 4 | %- Also NEED an '\alias' for EACH other topic documented here. 5 | \title{ 6 | 获取豆瓣用户的日记 7 | } 8 | \description{ 9 | 获取豆瓣用户的日记. 10 | } 11 | \usage{ 12 | user_note_status(userid,count=10,verbose=TRUE) 13 | } 14 | %- maybe also 'usage' for other objects documented here. 15 | \arguments{ 16 | \item{userid}{豆瓣用户id} 17 | \item{count}{每页显示的日记篇数,默认10} 18 | \item{verbose}{若为\code{TRUE}则显示详细过程信息,否则不显示,默认\code{TRUE}} 19 | } 20 | \details{ 21 | %% ~~ If necessary, more details than the description above ~~ 22 | } 23 | \value{一个data.frame,各列为: 24 | 25 | \item{title }{日记的标题} 26 | \item{published}{日记发表时间} 27 | \item{note}{日记内容} 28 | \item{liked}{被喜欢的次数} 29 | \item{recommend}{被推荐的次数} 30 | \item{url}{日记的网址} 31 | } 32 | 33 | \references{ 34 | %% ~put references to the literature/web site here ~ 35 | } 36 | \author{ 37 | qxde01<\email{qxde01@gmail.com}> 38 | } 39 | \note{ 40 | %% ~~further notes~~ 41 | } 42 | 43 | %% ~Make other sections like Warning with \section{Warning }{....} ~ 44 | 45 | \seealso{ 46 | %% ~~objects to See Also as \code{\link{help}}, ~~~ 47 | } 48 | \examples{ 49 | ## http://www.douban.com/people/qxde01/notes 50 | ## 作者没发表过日记,没有任何内容 51 | \dontrun{qxde<-user_note_status(userid="qxde01")} 52 | } 53 | % Add one or more standard keywords, see file 'KEYWORDS' in the 54 | % R documentation directory. 55 | \keyword{ douban } 56 | \keyword{ note}% __ONLY ONE__ keyword per line 57 | -------------------------------------------------------------------------------- /man/user_book_status.Rd: -------------------------------------------------------------------------------- 1 | \encoding{UTF-8} 2 | \name{user_book_status} 3 | \alias{user_book_status} 4 | %- Also NEED an '\alias' for EACH other topic documented here. 
5 | \title{ 6 | 豆瓣用户的读书信息 7 | } 8 | \description{ 9 | 豆瓣用户的读书信息,包括已读书籍、发表书评、笔记等。 10 | } 11 | \usage{ 12 | user_book_status(userid,verbose=TRUE,front=TRUE) 13 | } 14 | %- maybe also 'usage' for other objects documented here. 15 | \arguments{ 16 | \item{userid}{豆瓣用户id} 17 | \item{verbose}{若为\code{TRUE}则显示详细过程信息,否则不显示,默认\code{TRUE}} 18 | \item{front}{若为TRUE,则下载已读书籍的封面图片,否则不下载} 19 | } 20 | \details{ 21 | %% ~~ If necessary, more details than the description above ~~ 22 | } 23 | \value{一个list,各项为: 24 | 25 | \item{collect_tags }{\code{data.frame},已读书籍的标签} 26 | \item{collect_df}{\code{data.frame},已读书籍的信息} 27 | \item{do_tags}{\code{data.frame},正在阅读书籍的标签} 28 | \item{do_df}{\code{data.frame},正在阅读书籍的信息} 29 | \item{wish_tags}{\code{data.frame},想读书籍的标签} 30 | \item{wish_df}{\code{data.frame},想读书籍的信息} 31 | \item{reviews}{\code{data.frame},发表的书评,各列同\code{get_book_reviews}} 32 | \item{notes}{\code{data.frame},发表的读书笔记,各列同\code{get_book_notes}} 33 | \item{collect_images}{\code{list},已读书籍的封面图片,分辨率为\code{60x80}} 34 | } 35 | 36 | \author{ 37 | qxde01<\email{qxde01@gmail.com}> 38 | } 39 | 40 | \examples{ 41 | ## http://book.douban.com/people/qxde01 42 | \dontrun{qxde<-user_book_status(userid="qxde01")} 43 | } 44 | % Add one or more standard keywords, see file 'KEYWORDS' in the 45 | % R documentation directory.
46 | \keyword{ douban } 47 | \keyword{ book }% __ONLY ONE__ keyword per line 48 | -------------------------------------------------------------------------------- /R/user_status.R: -------------------------------------------------------------------------------- 1 | .statuses<-function(p){ 2 | .extract<-function(x){ 3 | a<-unlist(xmlToList(x)) 4 | created_at<-a['div.div.div.span..attrs.title'] 5 | saying<-a['div.div.blockquote.p'] 6 | saying<-ifelse(is.na(saying),a['div.div.div.blockquote.p'],saying) 7 | reply<-gsub('[^0-9]','',a[grep('回应',a)][1]) 8 | out<-c(created_at,saying,reply) 9 | names(out)<-c('created_at','saying','reply') 10 | return(out) 11 | } 12 | 13 | n1<-getNodeSet(p,'//div[@class="status-item"]') 14 | data_sid<-sapply(n1,function(x) xmlGetAttr(x, "data-sid")) 15 | data_target_type<-sapply(n1,function(x) xmlGetAttr(x, "data-target-type")) 16 | data_object_kind<-sapply(n1,function(x) xmlGetAttr(x, "data-object-kind")) 17 | 18 | n2<-getNodeSet(p,'//div[@class="status-item"]//div[@class="text"]') 19 | txt<-gsub('\n| ','',sapply(n2,xmlValue)) 20 | n3<-getNodeSet(p,'//div[@class="status-item"]//div[@class="block block-subject"]') 21 | 22 | txt<-gsub('\n| |> 我来回应','',sapply(n1,xmlValue)) 23 | txt<-gsub(' ','',txt) 24 | tmp0<-t(sapply(n1,.extract)) 25 | tmp<-cbind(data_sid,data_object_kind,data_target_type,txt,tmp0) 26 | tmp<-as.data.frame(tmp,stringsAsFactors=F) 27 | return(tmp) 28 | } 29 | ################################ 30 | user_status<-function(userid,results=200){ 31 | pages<-results/20 32 | df<-c() 33 | for(pg in 1:pages){ 34 | u<-paste0('http://www.douban.com/people/',userid,'/statuses?p=',pg) 35 | cat(pg,' Getting saying from:',u,'...\n') 36 | p<-.refreshURL(u) 37 | if(!is.null(p)){ 38 | df0<-.statuses(p) 39 | df<-rbind(df,df0) 40 | } 41 | } 42 | return(df) 43 | } 44 | ##qxde<-user_status(userid='qxde01',results=200) 45 | -------------------------------------------------------------------------------- /man/user_movie_status.Rd: 
-------------------------------------------------------------------------------- 1 | \encoding{UTF-8} 2 | \name{user_movie_status} 3 | \alias{user_movie_status} 4 | %- Also NEED an '\alias' for EACH other topic documented here. 5 | \title{ 6 | 豆瓣用户的观影信息 7 | } 8 | \description{ 9 | 豆瓣用户的观影信息,包括已看影视记录、发表影评等。 10 | } 11 | \usage{ 12 | user_movie_status(userid,verbose=TRUE,front=TRUE) 13 | } 14 | %- maybe also 'usage' for other objects documented here. 15 | \arguments{ 16 | \item{userid}{豆瓣用户id} 17 | \item{verbose}{若为\code{TRUE}则显示详细过程信息,否则不显示,默认\code{TRUE}} 18 | \item{front}{\code{TRUE},则下载电影海报图片,否则不下载} 19 | } 20 | \details{ 21 | %% ~~ If necessary, more details than the description above ~~ 22 | } 23 | \value{一个list,各项为: 24 | 25 | \item{collect_tags }{\code{data.frame},已观看影视的标签} 26 | \item{collect_df}{\code{data.frame},已观看影视的信息} 27 | \item{do_tags}{\code{data.frame},正在观看影视的标签} 28 | \item{do_df}{\code{data.frame},正在观看影视的信息} 29 | \item{wish_tags}{\code{data.frame},想观看影视的标签} 30 | \item{wish_df}{\code{data.frame},想观看影视的信息} 31 | \item{reviews}{\code{data.frame},发表的影评,各列同\code{get_movie_reviews}} 32 | \item{collect_images}{\code{list},已观看影视的海报图片,分辨率为\code{60x80}} 33 | } 34 | \references{ 35 | %% ~put references to the literature/web site here ~ 36 | } 37 | \author{ 38 | qxde01<\email{qxde01@gmail.com}> 39 | } 40 | \note{ 41 | %% ~~further notes~~ 42 | } 43 | 44 | %% ~Make other sections like Warning with \section{Warning }{....} ~ 45 | 46 | \seealso{ 47 | %% ~~objects to See Also as \code{\link{help}}, ~~~ 48 | } 49 | \examples{ 50 | ## http://movie.douban.com/people/qxde01 51 | \dontrun{qxde<-user_movie_status(userid="qxde01")} 52 | } 53 | % Add one or more standard keywords, see file 'KEYWORDS' in the 54 | % R documentation directory.
55 | \keyword{ douban } 56 | \keyword{ movie }% __ONLY ONE__ keyword per line 57 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Rdouban 2 | ======= 3 | ### 获取豆瓣网评论数据的R接口,基于RCurl和XML。 4 | > #### 非官方接口 5 | * get_book_info 获取豆瓣图书基本信息 6 | * get_book_reviews 获取豆瓣图书的评论文章 7 | * get_book_discussions 获取豆瓣图书的话题讨论内容 8 | * get_book_notes 获取豆瓣图书的读书笔记 9 | * get_movie_info 获取豆瓣影视的基本信息 10 | * get_movie_reviews 获取豆瓣电影的长篇影评 11 | * get_movie_comments 获取豆瓣电影的短评 12 | * get_movie_discussions 获取关于某个豆瓣电影的话题讨论内容 13 | * get_music_info 获取豆瓣音乐的专辑信息 14 | * get_music_reviews 获取豆瓣音乐评论信息. 15 | * get_music_discussions 获取豆瓣音乐专辑的话题讨论内容 16 | * user_status 获取用户的说说 17 | * user_note_status 获取用户的日记 18 | * user_book_status 获取用户的读书信息 19 | * user_book_viz 对用户的读书信息进行统计和可视化 20 | * user_movie_status 获取用户的影视观看信息 21 | * user_movie_viz 对用户的观影信息进行统计和可视化 22 | 23 | > #### 下面的函数是上述部分函数的重写,部分采用了官方API 24 | * get.book.info 豆瓣图书信息API V2 25 | * get.movie.info 豆瓣电影信息API V2 26 | * get.music.info 豆瓣音乐信息API V2 27 | * get.book.review 豆瓣图书评论信息,部分采用API V1 28 | * get.movie.review 豆瓣电影长篇评论,通过移动网页m.douban.com,高频率访问会被豆瓣阻止 29 | * get.movie.comment 豆瓣电影短评,通过移动网页m.douban.com,高频率访问会被豆瓣阻止 30 | 31 | >### 安装 32 | ``` 33 | library("devtools") 34 | install_github("Rdouban","qxde01") 35 | ``` 36 | >### 例子 37 | 获取作者qxde01的阅读信息,并统计分析和可视化。注意,需要通过`biocLite()`安装`package:EBImage`。 38 | 两篇相关的博文见 豆瓣阅读统计及可视化[上](http://qxde01.blog.163.com/blog/static/6733574420132915952828/),[下](http://qxde01.blog.163.com/blog/static/673357442013355192638/)。博文与函数`user_book_viz`的具体实现是有差异的。 39 | 40 | ``` 41 | library(Rdouban) 42 | qxde<-user_book_status(userid="qxde01") 43 | data(stopwords) ## 中文停止词 44 | ## 生成用户qxde01的2013年阅读信息可视化图形 45 | user_book_viz(x=qxde,YEAR="2013",stopwords=stopwords,back=TRUE) 46 | ``` 47 | 48 | >### windows下安装 49 | 在windows下的中文编码至今没有搞定,**坑爹的windows,坑娘的微软**,安装编译时会出错,主要是因为帮助用中文编写,在R/etc/Rprofile.site 
添加`options(encoding="UTF-8")`,使用RStudio可以编译通过,查看帮助会有乱码,使用浏览器查看有部分是乱码(需要手动修改浏览器编码查看方式)。在Linux下没有问题。 50 | 51 | >### 拼接后的效果图 52 | ![拼接后的效果图](inst/images/big.png) -------------------------------------------------------------------------------- /R/get.movie.comment.R: -------------------------------------------------------------------------------- 1 | ## movieid:movie id of douban 2 | ## results:get the number of comments 3 | 4 | get.movie.comment<-function(movieid,fresh=20,results=100,verbose=TRUE){ 5 | u=paste0("http://m.douban.com/movie/subject/",movieid,"/comments") 6 | p<-htmlParse(postForm(u)) 7 | ## 评论总数 8 | total<-sapply(getNodeSet(p, '//div[@class="title"]//span'), xmlValue) 9 | total<-as.integer(gsub("[^0-9]","",total)) 10 | cat("----------There are a total of ",total," short comments.----------\n") 11 | results<-min(total,results) 12 | ## 评论屏数 13 | pages<-ceiling(results/20) 14 | out<-data.frame(matrix(nrow=pages*20,ncol=4),stringsAsFactors=F) 15 | colnames(out)<-c("author","author_id","comment","rating") 16 | nr=1 17 | for(pg in 1:pages){ 18 | #http://m.douban.com/movie/subject/11627047/comments?page=2&session=4c4f1371 19 | u=paste0("http://m.douban.com/movie/subject/",movieid,"/comments?page=",pg) 20 | if(verbose==TRUE){ 21 | cat("Retrieving comments of page ",pg, " from: ",u," .....\n") 22 | } 23 | p<- tryCatch(.refreshForm(u,fresh,verbose),error = function(e){NULL}) 24 | 25 | author<-sapply(getNodeSet(p,'//div[@class="item"]//a[@class="founder"]'), 26 | xmlValue) 27 | author_id<-sapply(getNodeSet(p,'//div[@class="item"]//a[@class="founder"]'), 28 | function(x) xmlGetAttr(x, "href")) 29 | author_id<-unlist(strsplit(author_id,"/movie/people/|/\\?session")) 30 | author_id<-author_id[-grep("=",author_id)] 31 | author_id<-author_id[nchar(author_id)>0] 32 | n1<-getNodeSet(p,'//div[@class="list"]//div[@class="item"]//span') 33 | tmp<-sapply(n1,xmlValue) 34 | tmp<-gsub("[\n ]","",tmp)[1:(length(tmp)-3)] 35 | cmt<-tmp[seq(1,length(tmp),2)] 36 | 
rating<-tmp[seq(2,length(tmp),2)] 37 | #cat(rating,'\n') 38 | rating<-gsub("[^\\(0-5星]","",rating) 39 | #gsub("[^0-5][^星]","",rating) 40 | rating<-gsub("[0-5]{1,10}\\(","",rating) 41 | rating<-gsub("[\\(星]","",rating) 42 | out0<-cbind(author=author,author_id=author_id,comment=cmt,rating=rating) 43 | nr2=nrow(out0) 44 | out[nr:(nr+nr2-1),]<-out0 45 | nr=nr+nr2 46 | } 47 | return(out) 48 | } -------------------------------------------------------------------------------- /R/user_note_status.R: -------------------------------------------------------------------------------- 1 | ################## 2 | #' @param u:日记地址 3 | #' 4 | .get_user_note<-function(u){ 5 | p<-.refreshURL(u) 6 | title<-sapply(getNodeSet(p, '//div[@class="note-header"]//h1'),xmlValue) 7 | published<-sapply(getNodeSet(p, '//div[@class="note-header"]//span'),xmlValue) 8 | liked<-sapply(getNodeSet(p, '//span[@class="fav-num"]//a'),xmlValue) 9 | liked<-gsub('[^0-9]','',liked) 10 | recommend<-sapply(getNodeSet(p, '//span[@class="rec-num"]'),xmlValue) 11 | recommend<-gsub('[^0-9]','',recommend) 12 | if(length(recommend)==0)recommend<-NA 13 | if(length(liked)==0)liked<-NA 14 | note<-sapply(getNodeSet(p, '//div[@class="note"]'),xmlValue) 15 | note<-note[nchar(note)>0] 16 | out<-c(title,published,note,liked,recommend,u) 17 | return(out) 18 | } 19 | ############################################################# 20 | ## 获取用户的日记 21 | #'@param userid 22 | #'@param count 23 | #'@param verbose 24 | 25 | user_note_status<-function(userid,count=10,verbose=TRUE){ 26 | u<-paste0("http://www.douban.com/people/",userid,"/notes") 27 | p<-.refreshURL(u) 28 | pages<-sapply(getNodeSet(p, '//body//div[@class="paginator"]//a'),xmlValue) 29 | if(length(pages)==0){ 30 | pages=1 31 | } 32 | else{ 33 | pages<-as.integer(pages[length(pages)-1]) 34 | } 35 | cat("\n--------There is a total of ",pages," pages about notes.--------\n") 36 | href<-c() 37 | for(pg in 1:pages){ 38 | u = paste0("http://www.douban.com/people/", userid, 39 | 
"/notes?start=", (pg - 1) * count) 40 | cat("Getting note URLs from page ", pg, ": \n ", u, " ...\n") 41 | p<-.refreshURL(u) 42 | href0<-sapply(getNodeSet(p, '//body//div//span[@class="wrap"]//a'),function(x) xmlGetAttr(x, "href")) 43 | href0<-unique(href0[grep('/note/',href0)]) 44 | href<-c(href,href0) 45 | } 46 | total<-length(href) 47 | cat("\n--------There is a total of ",total," notes.--------\n") 48 | df<-data.frame(matrix(nrow=total,ncol=6),stringsAsFactors=F) 49 | colnames(df)<-c("title","published","note","liked","recommend",'url') 50 | if(total>0){ 51 | for(i in 1:total){ 52 | u=href[i] 53 | if(verbose==TRUE){cat(" Getting note from URL: ",i,',', u, " ...\n")} 54 | df[i,]<-.get_user_note(u) 55 | } 56 | } 57 | return(df) 58 | } 59 | 60 | 61 | -------------------------------------------------------------------------------- /R/get_music_discussions.R: -------------------------------------------------------------------------------- 1 | ##http://music.douban.com/subject/3843530/discussion/ 2 | ##musicid=3843530 3 | .get_music_discussion0<-function(u,fresh,verbose){ 4 | .get_movie_discussion0(u,fresh,verbose) 5 | } 6 | ################################################################# 7 | get_music_discussions<-function(musicid,results = 100, fresh = 10,count=20, verbose = TRUE,...){ 8 | u = paste0("http://music.douban.com/subject/", musicid, "/discussion/") 9 | p <- .refreshURL(u, fresh, verbose) 10 | total<-gsub('[^0-9]','',sapply(getNodeSet(p, '//span[@class="count"]'),xmlValue)) 11 | if (length(total)==0) 12 | stop('There is no discussions about this music.') 13 | cat('-----There is a tatal of ',total,' music discussions.-----\n') 14 | pages<-ceiling(min(results,as.integer(total))/count) 15 | out <- data.frame(matrix(nrow = pages * count, ncol = 8), stringsAsFactors = F) 16 | colnames(out) <- c("dicussion_uri", "title", "published", "author", "author_uri", 17 | "dicussion", "useful", "unuseful") 18 | k=1 19 | if(pages>0){ 20 | for(pg in 1:pages){ 21 | 
u=paste0('http://music.douban.com/subject/',musicid, 22 | '/discussion/?start=',(pg-1)*count,'&sort=vote/') 23 | if(verbose==TRUE) { 24 | #cat('Getting',(pg-1)*20+1,'--',pg*20,'discussions...\n') 25 | cat("Getting music discussion URLS from page=",pg,": ",u,"...\n") 26 | } 27 | p <- .refreshURL(u, fresh, verbose) 28 | n1<-getNodeSet(p ,'//table[@class="olt"]//td/a') 29 | href<-unique(sapply(n1,function(x) xmlGetAttr(x, "href"))) 30 | href<-href[grep('/discussion/',href)] 31 | href <- href[!href %in% out$dicussion_uri] 32 | n=length(href) 33 | if(n>0){ 34 | for(i in 1:n){ 35 | u0<-href[i] 36 | if(verbose==TRUE){ 37 | cat(" Getting ", k, " movie discussion from URL: ", u0, " ...\n") 38 | } 39 | out0<-.get_music_discussion0(u=u0,fresh,verbose) 40 | if(length(out0)==8){ 41 | out[k,]<-out0 42 | k=k+1 43 | } 44 | else{ 45 | cat(" !!!! Getting failed at URL: ", u0, " \n") 46 | } 47 | } 48 | } 49 | } 50 | } 51 | out <- out[!is.na(out[, 1]), ] 52 | return(out) 53 | } 54 | 55 | #a<-get_music_discussions(musicid=3843530,results = 100) -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | ################################### 2 | ## @u:url 3 | ## @fresh:refresh number 4 | 5 | .refreshURL <- function(u, fresh = 15, ssl.verifypeer = TRUE,verbose = TRUE) { 6 | # p<-tryCatch(htmlParse(getURL(u)),error = function(e){NULL}) 7 | p <- tryCatch(getURL(u,ssl.verifypeer=ssl.verifypeer), error = function(e) { 8 | print(e) 9 | return("NULL") 10 | }) 11 | n = nchar(p) 12 | k = 1 13 | while (n < 500) { 14 | if (verbose == TRUE) { 15 | #cat("%%%%% 第 ", k, " 次重新请求URL:", u, ".....\n") 16 | cat("%%%%% \u7b2c ", k, " \u6b21\u91cd\u65b0\u8bf7\u6c42URL:", u, ".....\n") 17 | } 18 | p <- tryCatch(getURL(u,ssl.verifypeer=ssl.verifypeer), error = function(e) { 19 | print(e) 20 | return("NULL") 21 | }) 22 | n = nchar(p) 23 | if (k > 3) { 24 | Sys.sleep(rnorm(k, 3)) 25 | } 26 | if (k 
> fresh) { 27 | break 28 | } 29 | k = k + 1 30 | } 31 | if (verbose == TRUE) { 32 | if (!is.null(p) & k > 1) { 33 | ## cat("***** 第 ", k - 1, " 次请求成功!\n") 34 | cat("***** \u7b2c ", k - 1, " \u6b21\u8bf7\u6c42\u6210\u529f\uff01\n") 35 | } 36 | } 37 | if(ssl.verifypeer==FALSE){ 38 | return(p) 39 | } 40 | else{ 41 | p <- tryCatch(htmlParse(p), error = function(e) { 42 | print(e) 43 | return(NULL) 44 | }) 45 | return(p) 46 | } 47 | } 48 | ################################### 49 | ## @u:url 50 | ## @fresh:refresh number 51 | 52 | .refreshForm<-function(u,fresh=10,verbose=TRUE){ 53 | p<-tryCatch(htmlParse(postForm(u)),error = function(e){NULL}) 54 | #p<-tryCatch(postForm(u),error = function(e){print(e);return("NULL")}) 55 | n=nchar(p) 56 | k=1 57 | while(is.null(p)){ 58 | if(verbose==TRUE){ 59 | ## cat("%%%%% 第 ",k," 次重新请求URL:",u,".....\n") 60 | cat("%%%%% \u7b2c ",k," \u6b21\u91cd\u65b0\u8bf7\u6c42URL:",u,".....\n") 61 | } 62 | p<-tryCatch(htmlParse(postForm(u)),error = function(e){print(e);return(NULL)}) 63 | #saveXML(p,file=paste0(k,gsub("http:|/|\\?","_",u),".html")) 64 | if(k>3){Sys.sleep(rnorm(k,3))} 65 | if(k>fresh) {break} 66 | k=k+1 67 | } 68 | if(verbose==TRUE){ 69 | if(!is.null(p) & k>1){ 70 | ##cat("***** 第 ",k-1," 次请求成功!\n") 71 | cat("***** \u7b2c ",k-1," \u6b21\u8bf7\u6c42\u6210\u529f\uff01\n") 72 | } 73 | } 74 | #saveXML(p,file=paste0(k,gsub("http:|/|\\?","_",u),".html")) 75 | return(p) 76 | } -------------------------------------------------------------------------------- /R/get_movie_comments.R: -------------------------------------------------------------------------------- 1 | 2 | get_movie_comments<-function(movieid,results=100,fresh=10,verbose=TRUE,...){ 3 | 4 | u=paste0('http://movie.douban.com/subject/',movieid,'/comments') 5 | p<- .refreshURL(u,fresh,verbose) 6 | 7 | total<-sapply(getNodeSet(p, '//body//span[@class="total"]'),xmlValue) 8 | total<-as.integer(gsub("[^0-9]","",total)) 9 | cat('------------There is a total of ',total,' short 
comments.--------\n') 10 | 11 | results<-min(total,results) 12 | pages<-ceiling(results/20) 13 | out<-data.frame(matrix(nrow=pages*20,ncol=6),stringsAsFactors=F) 14 | colnames(out)<-c("author","author_uri","published" ,"comment" ,"votes", "rating") 15 | kind=1 16 | for(pg in 1:pages){ 17 | if(verbose==TRUE){ 18 | cat(' Getting short comments from ',(pg-1)*20+1,'--',pg*20,'...\n') 19 | } 20 | u=paste0('http://movie.douban.com/subject/',movieid,'/comments?start=',(pg-1)*20+1,'&limit=20&sort=new_score') 21 | p<- .refreshURL(u,fresh,verbose) 22 | ## 短评内容 23 | comment<-gsub('\n| ','',sapply(getNodeSet(p, '//div[@class="comment"]//p'),xmlValue)) 24 | ## 评论时间 25 | publised<-sapply(getNodeSet(p, '//div[@class="comment"]//span[@class=""]'),xmlValue) 26 | publised<-gsub("[\n ]","",publised) 27 | ##作者及其URI 28 | n1<-getNodeSet(p, '//div[@class="comment"]//span[@class="comment-info"]//a') 29 | author<-sapply(n1,xmlValue) 30 | author_uri<-sapply(n1,function(x) xmlGetAttr(x, "href")) 31 | ##有用的票数 32 | votes<-sapply(getNodeSet(p, '//div[@class="comment"]//span[@class="comment-vote"]//span'),xmlValue) 33 | #votes<-gsub("[^0-9]","",votes) 34 | ##评分 35 | #n2<-getNodeSet(p, '//div[@class="comment"]//span[@class="comment-info"]//span') 36 | n2<-getNodeSet(p, '//div[@class="comment"]//span[@class="comment-info"]') 37 | mm=length(n2) 38 | rating<-c() 39 | for(jj in 1:mm){ 40 | a=xmlSApply(n2[[jj]],xmlAttrs) 41 | a=gsub('[a-z]','',a$span[['class']]) 42 | rating<-c(rating,a) 43 | #cat(jj,a,'\n') 44 | } 45 | 46 | out0<-cbind(author=author,author_uri=author_uri, 47 | publised=publised,comment=comment, 48 | votes=votes,rating=rating) 49 | nr=nrow(out0) 50 | out[kind:(kind+nr-1),]<-out0 51 | kind<-kind+nr 52 | } 53 | out<-out[!is.na(out[,1]),] 54 | return(out) 55 | } 56 | 57 | 58 | #http://movie.douban.com/subject/5308265/comments 59 | #x=get_movie_comments(movieid=5308265,results=100) 60 | -------------------------------------------------------------------------------- /R/get_music_info.R: 
-------------------------------------------------------------------------------- 1 | ##http://music.douban.com/subject/3843530/ 2 | ##musicid=3843530 3 | get_music_info<-function(musicid,...){ 4 | u=paste0('http://music.douban.com/subject/',musicid,'/') 5 | p<-.refreshURL(u) 6 | pa<-'[\n ]|\\(\u8c46\u74e3|\\)' ##[\n ]|\\(豆瓣|\\) 7 | Encoding(pa)<-"UTF-8" 8 | title<-gsub(pa,'',sapply(getNodeSet(p, '//head//title'),xmlValue)) 9 | attribute<-gsub('[\n]| ','',sapply(getNodeSet(p, '//div[@id="info"]'),xmlValue)) 10 | author<-sapply(getNodeSet(p, '//div[@id="info"]//span[@class="pl"]//a'),xmlValue)[1] 11 | ## 12 | song<-sapply(getNodeSet(p, '//ul[@class="song-items"]//div[@data-title]'),xmlValue) 13 | track<-sapply(getNodeSet(p, '//ul[@class="song-items"]//span[@class="n_doulists unfoldable"]'), 14 | xmlValue) 15 | songs<-data.frame(title=song,track=track,stringsAsFactors=F) 16 | ## 17 | summary <-gsub('[\n ]','',sapply(getNodeSet(p, '//span[@class="all hidden"]'),xmlValue)) 18 | if(length(summary )==0) 19 | summary <-gsub('[\n ]','',sapply(getNodeSet(p, '//span[@property="v:summary"]'),xmlValue)) 20 | ## 21 | 22 | labels<-gsub('[\n ]','', 23 | sapply(getNodeSet(p, '//div[@id="db-tags-section"]//div'),xmlValue)) 24 | labels<-unlist(strsplit(labels,'\\(|\\)')) 25 | tags<-data.frame( tag_label=labels[seq(1,length(labels)-1,2)], 26 | tag_freq=as.integer(labels[seq(2,length(labels),2)]), 27 | stringsAsFactors=F) 28 | # tags$tag_label<-gsub("","",tags$tag_label) 29 | 30 | rating<-gsub('[\n ]','',sapply(getNodeSet(p, '//div[@id="interest_sectl"]'),xmlValue)) 31 | rating<-gsub("\u4eba\u8bc4\u4ef7",'',rating) #\u4eba\u8bc4\u4ef7 人评价 32 | rating<-as.numeric(unlist(strsplit(rating,'\\(|\\)|%'))) 33 | rating[-c(1,2)]<-rating[-c(1,2)]/100 34 | if(length(rating)==7) 35 | names(rating)<-c('average','votes','stars5','stars4','stars3','stars2','stars1') 36 | ## 37 | comments_total<-gsub('[^0-9]','',sapply(getNodeSet(p, '//div//p[@class="pl"]//a'),xmlValue))[1] 38 | 
listeners<-sapply(getNodeSet(p, '//div[@id="collector"]//p//a'),xmlValue) 39 | listeners<-gsub("[^0-9]","",listeners) 40 | names(listeners)<-c('doings','collections','wishes') 41 | 42 | large <- sapply(getNodeSet(p, '//div[@id="mainpic"]//a'), 43 | function(x) xmlGetAttr(x, "href")) 44 | medium <- sapply(getNodeSet(p, '//div[@id="mainpic"]//img'), 45 | function(x) xmlGetAttr(x, "src")) 46 | image <- c(medium = medium, large = large) 47 | list(title=title, 48 | author=author,rating=rating, 49 | summary=summary, 50 | tags=tags, songs=songs, 51 | href=u,image=image, 52 | comments_total=comments_total, 53 | listeners=listeners,attribute=attribute) 54 | } 55 | 56 | ##x<-get_music_info(musicid=1404439) -------------------------------------------------------------------------------- /R/get_book_reviews.R: -------------------------------------------------------------------------------- 1 | 2 | .get_book_review0<-function(u,fresh=10,verbose=TRUE,...){ 3 | p<-.refreshURL(u=u,fresh,verbose) 4 | ## title 5 | title<-sapply(getNodeSet(p, '//title'), xmlValue) 6 | title<-gsub("[\n ]","",title) 7 | ## 作者及其URI 8 | n1 <- getNodeSet(p, '//div[@class="article"]//div[@class="piir"]//a')[1] 9 | author<-sapply(n1, xmlValue) 10 | author_uri<-sapply(n1, function(x) xmlGetAttr(x, "href")) 11 | ##评分 12 | rating<-sapply(getNodeSet(p, '//span[@property="v:rating"]'), xmlValue) 13 | ##评论发表时间 14 | published<-sapply(getNodeSet(p, '//span[@property="v:dtreviewed"]'), xmlValue) 15 | ##评论内容 16 | review<-sapply(getNodeSet(p, '//span[@property="v:description"]'), xmlValue) 17 | review<-gsub("\r","",review) 18 | ## useful and unuseful 19 | useful<-sapply(getNodeSet(p ,'//span[@class="useful"]//em'), xmlValue) 20 | unuseful<-sapply(getNodeSet(p ,'//span[@class="unuseful"]//em'), xmlValue) 21 | out<-c(review_uri=u,title=title,published=published,author=author, 22 | author_uri=author_uri,review=review, rating=rating, 23 | useful=useful,unuseful=unuseful) 24 | return(out) 25 | } 26 | 27 | 
get_book_reviews<-function(bookid,results=100,fresh=10,count=25,verbose=TRUE,...){ 28 | u=paste0('http://book.douban.com/subject/',bookid,'/reviews') 29 | p<-.refreshURL(u,fresh,verbose) 30 | total<-sapply(getNodeSet(p, '//span[@class="count"]'), xmlValue) 31 | total<-as.integer(gsub("[^0-9]","",total)) 32 | pages<-ceiling(min(results,total)/count) 33 | 34 | cat('\n----------------There is a total of',total,'reviews.----------------\n\n') 35 | ## 预定义输出dataFrame大小 36 | out <- data.frame(matrix(nrow = pages * count, ncol = 9), stringsAsFactors = F) 37 | colnames(out) <- c("review_uri", "title", "published", "author", "author_uri", "review", 38 | "rating", "useful", "unuseful") 39 | ## out nrow index 40 | k = 1 41 | 42 | for(pg in 1:pages){ 43 | #cat(' Getting',(pg-1)*25+1,'--',pg*25,'comments...\n') 44 | u=paste0('http://book.douban.com/subject/',bookid,'/reviews?score=&start=', 45 | (pg-1)*count) 46 | cat("Getting review URLs from page ", pg, ": ", u, " ...\n") 47 | p<-.refreshURL(u,fresh,verbose) 48 | n1<-getNodeSet(p, '//div[@class="ctsh"]//div//a') 49 | href<-sapply(n1, function(x) xmlGetAttr(x, "href")) 50 | href<-unique(href[grep("/review/",href)]) 51 | href <- href[!href %in% out$review_uri] 52 | n<-length(href) 53 | if(n>0){ 54 | for(i in 1:n){ 55 | u0<-href[i] 56 | if (verbose == TRUE) { 57 | cat(" Getting ", k, " movie review from URL: ", u0, " ...\n") 58 | } 59 | out0<-.get_book_review0(u=u0,fresh,verbose) 60 | if(length(out0)==9){ 61 | out[k,]<-out0 62 | k<-k+1 63 | } else { 64 | cat(" !!!! 
Getting failed at URL: ", u0, " \n") 65 | } 66 | } 67 | 68 | } 69 | } 70 | 71 | out <- out[!is.na(out[, 1]), ] 72 | return(out) 73 | } 74 | 75 | -------------------------------------------------------------------------------- /R/get_book_notes.R: -------------------------------------------------------------------------------- 1 | ## get some infomation of a note by note url 2 | .get_book_note0<-function(u,fresh,verbose,...){ 3 | p<-.refreshURL(u,fresh, verbose) 4 | title<-gsub('[\n ]','',sapply(getNodeSet(p, '//title'), xmlValue)) 5 | n1<-getNodeSet(p, '//div[@class="article"]//div[@class="info"]//a') 6 | author<-sapply(n1,xmlValue)[1] 7 | author_uri<-sapply(n1,function(x) xmlGetAttr(x, "href"))[1] 8 | published<-sapply(getNodeSet(p, '//span[@class="pubtime"]'), xmlValue) 9 | note<-sapply(getNodeSet(p, '//pre[@id="link-report"]'), xmlValue) 10 | 11 | rating<-sapply(getNodeSet(p, '//div[@class="mod profile clearfix"]//span[@class]'), 12 | function(x) xmlGetAttr(x, "class"))[2] 13 | rating<-gsub('[^0-9]','',rating) 14 | 15 | v2<-sapply(getNodeSet(p, '//p[@class="pl info"]//span'), xmlValue) 16 | v2<-gsub("[^0-9]","",v2) 17 | readers<-v2[1] 18 | collectors<-v2[2] 19 | 20 | out<-c(note_uri=u,title=title,published=published,author=author, 21 | author_uri=author_uri,note=note,rating=rating, 22 | readers=readers,collectors=collectors) 23 | return(out) 24 | } 25 | ############################################################### 26 | get_book_notes<-function(bookid,results=100,fresh=10,count=10,verbose=TRUE,...){ 27 | u=paste0('http://book.douban.com/subject/',bookid,'/annotation') 28 | p<-.refreshURL(u,fresh, verbose) 29 | total<-sapply(getNodeSet(p, '//title'), xmlValue) 30 | total<-gsub("^([^)(]*)","",total) 31 | total<-as.integer(gsub("[^0-9]","",total)) 32 | cat('\n------------There is a total of',total,'notes.------------\n\n') 33 | pages<-ceiling(min(total,results)/count) 34 | out <- data.frame(matrix(nrow = pages * count, ncol = 9), stringsAsFactors = F) 35 | 
colnames(out) <- c("note_uri", "title", "published", "author", "author_uri", "note", 36 | "rating", "readers", "collectors") 37 | ## output dataFrame nrow index 38 | k = 1 39 | for(pg in 1:pages){ 40 | #cat('Getting',(pg-1)*10+1,'--',pg*10,'notes...\n') 41 | u=paste0('http://book.douban.com/subject/',bookid, 42 | '/annotation?sort=rank&start=',(pg-1)*count) 43 | cat("Getting note URLs from page",pg,": ",u,"...\n") 44 | p<-.refreshURL(u,fresh, verbose) 45 | href<-sapply(getNodeSet(p, '//div[@class="nlst"]//a[@href]'), 46 | function(x) xmlGetAttr(x, "href")) 47 | href<-unique(href[grep("/annotation/",href)]) 48 | href <- href[!href %in% out$note_uri] 49 | n=length(href) 50 | if(n>0){ 51 | for(i in 1:n){ 52 | u0<-href[i] 53 | if(verbose==TRUE){ 54 | cat(" Getting ", k, " book note from URL: ", u0, " ...\n") 55 | } 56 | out0<-.get_book_note0(u=u0,fresh,verbose) 57 | if(length(out0)==9){ 58 | out[k,]<-out0 59 | k=k+1 60 | } 61 | else{ 62 | cat(" !!!! Getting failed at URL: ", u0, " \n") 63 | } 64 | } 65 | } 66 | } 67 | out <- out[!is.na(out[, 1]), ] 68 | return(out) 69 | } 70 | -------------------------------------------------------------------------------- /R/get_book_discussions.R: -------------------------------------------------------------------------------- 1 | ##http://book.douban.com/subject/1291204/discussion/ 2 | ##bookid=1291204 3 | .get_book_discussion0<-function(u,fresh,verbose,...){ 4 | p<-.refreshURL(u,fresh, verbose) 5 | title<-gsub('[\n ]','',sapply(getNodeSet(p, '//head//title'),xmlValue)) 6 | published<-sapply(getNodeSet(p, '//div[@class="article"]//span[@class="mn"]'),xmlValue) 7 | published<-gsub("\n| ","", published) 8 | n1<-getNodeSet(p, '//div[@class="article"]//span[@class="pl2"]//a') 9 | author<-gsub("[\n ]","",sapply(n1,xmlValue)[1]) 10 | author_uri<-sapply(n1,function(x) xmlGetAttr(x, "href"))[1] 11 | dicussion<-sapply(getNodeSet(p, '//div[@class="article"]//span[@class=""]')[1],xmlValue) 12 | useful<-sapply(getNodeSet(p, 
'//span[@class="useful"]//em'),xmlValue) 13 | unuseful<-sapply(getNodeSet(p, '//span[@class="unuseful"]//em'),xmlValue) 14 | 15 | out<-c(dicussion_uri=u,title=title,published=published,author=author, 16 | author_uri=author_uri,dicussion=dicussion,useful=useful,unuseful=unuseful) 17 | return(out) 18 | } 19 | ########################################################################### 20 | get_book_discussions<-function(bookid,results=100,fresh=10,count=20,verbose=TRUE){ 21 | u=paste0('http://book.douban.com/subject/',bookid,'/discussion/') 22 | p<-.refreshURL(u,fresh, verbose) 23 | total<-gsub("[^0-9]","", sapply(getNodeSet(p ,'//span[@class="count"]'),xmlValue)) 24 | 25 | if (length(total)==0) 26 | stop('There is no discussions about this book.') 27 | cat('\n--------------There is a total of ',total,'discussions.--------------\n\n') 28 | 29 | pages<-ceiling(min(results,as.integer(total))/count) 30 | out <- data.frame(matrix(nrow = pages * count, ncol = 8), stringsAsFactors = F) 31 | colnames(out) <- c("dicussion_uri", "title", "published", "author", "author_uri", 32 | "dicussion", "useful", "unuseful") 33 | k=1 34 | if(pages>0){ 35 | for(pg in 1:pages){ 36 | u=paste0('http://book.douban.com/subject/',bookid, 37 | '/discussion/?start=',(pg-1)*20,'&sort=vote/') 38 | if(verbose==TRUE) {cat(' Getting discussion URLs from URL:',u,' ...\n')} 39 | p<-.refreshURL(u,fresh, verbose) 40 | 41 | n1<-getNodeSet(p ,'//table[@class="olt"]//td/a') 42 | href<-unique(sapply(n1,function(x) xmlGetAttr(x, "href"))) 43 | href<-href[grep('/discussion/',href)] 44 | href <- href[!href %in% out$dicussion_uri] 45 | n=length(href) 46 | if(n<1) next 47 | for(i in 1:n){ 48 | u0<-href[i] 49 | if(verbose==TRUE){ 50 | cat(" Getting ", k, " book discussion from URL: ", u0, " ...\n") 51 | } 52 | out0<-.get_book_discussion0(u=u0,fresh,verbose) 53 | if(length(out0)==8){ 54 | out[k,]<-out0 55 | k=k+1 56 | } 57 | else{ 58 | cat(" !!!! 
Getting failed at URL: ", u0, " \n") 59 | } 60 | } 61 | } 62 | } 63 | out <- out[!is.na(out[, 1]), ] 64 | return(out) 65 | } 66 | # b<-get_book_discussions(bookid=1291204,results=100) -------------------------------------------------------------------------------- /R/get_movie_info.R: -------------------------------------------------------------------------------- 1 | get_movie_info<-function(movieid,...){ 2 | u=paste0('http://movie.douban.com/subject/',movieid,'/') 3 | p<-.refreshURL(u) 4 | #pagetree<-htmlParse(getURL(strurl)) 5 | ##base infomation 6 | author <-sapply(getNodeSet(p,'//div[@id="info"]//a[@rel="v:directedBy"]'),xmlValue) 7 | #stars <-sapply(getNodeSet(p,'//div[@id="info"]//a[@rel="v:starring"]'),xmlValue) 8 | #genre <-sapply(getNodeSet(p,'//div[@id="info"]//span[@property="v:genre"]'),xmlValue) 9 | #runtime<-sapply(getNodeSet(p,'//div[@id="info"]//span[@property="v:runtime"]'),xmlValue) 10 | title<-sapply(getNodeSet(p, '//span[@property="v:itemreviewed"]'), xmlValue) 11 | attribute<-gsub('[\n ]','',sapply(getNodeSet(p,'//div[@id="info"]'), xmlValue)) 12 | ##rating 13 | rating<-sapply(getNodeSet(p, '//div[@id="interest_sectl"]'), xmlValue) 14 | rating<-unlist(strsplit(gsub('\n','',rating),' ')) 15 | rating<-rating[nchar(rating)>0] 16 | rating[2]<-gsub('[^0-9]','',rating[2]) 17 | rating<-as.numeric(gsub('%','',rating)) 18 | if(length(rating)<7){ 19 | rating<-c(rating,rep(0,7-length(rating))) 20 | } 21 | rating[3:7]<-rating[3:7]/100 22 | names(rating)<-c('average','votes','stars5','stars4','stars3','stars2','stars1') 23 | ##the introduction of movie 24 | intronode <- getNodeSet(p, '//span[@class="all hidden"]') 25 | if(length(intronode)==0) 26 | intronode <- getNodeSet(p, '//span[@property="v:summary"]') 27 | summary<-gsub("[\n ]","",sapply(intronode, xmlValue)) 28 | 29 | ## tags 30 | labelinfo<-sapply(getNodeSet(p, '//div[@id="db-tags-section"]//div'), xmlValue) 31 | if(length(labelinfo)==0) 32 | labelinfo<-sapply(getNodeSet(p, 
'//div[@class="tags-body"]//a'), xmlValue) 33 | labelinfo<-unlist(strsplit(labelinfo,' |\\(|\\)')) 34 | tags<-data.frame(tag_label=labelinfo[seq(1,length(labelinfo),2)], 35 | tag_freq=labelinfo[seq(2,length(labelinfo),2)], 36 | stringsAsFactors=F) 37 | ## total of reviews 38 | reviews_total<-gsub("[^0-9]","",sapply(getNodeSet(p, '//div[@class="review-more"]//a'), xmlValue)) 39 | #total of comments 40 | comments_total<-gsub('[^0-9]','', 41 | sapply(getNodeSet(p, '//div[@id="comments-section"]//span//a')[1], xmlValue)) 42 | #if(length(short_comments)==0)short_comments<-0 43 | #if(length(long_comments)==0)long_comments<-0 44 | ##audience 45 | audience<-sapply(getNodeSet(p, '//div[@class="subject-others-interests-ft"]//a'), xmlValue) 46 | audience<-as.integer(gsub('[^0-9]','',audience)) 47 | if(length(audience)==3) 48 | names(audience)<-c('doings','collections','wishes') 49 | 50 | image<-sapply(getNodeSet(p, '//div[@class="indent"]//div[@id="mainpic"]//img'), 51 | function(x) xmlGetAttr(x, "src")) 52 | 53 | list(title=title,author=author, 54 | rating=rating, 55 | summary=summary, 56 | tags=tags, 57 | href=u,image=image, 58 | reviews_total=reviews_total, 59 | comments_total=comments_total, 60 | audience=audience,attribute=attribute) 61 | } 62 | ## get_movie_info(movieid=10527209) 63 | -------------------------------------------------------------------------------- /R/get_movie_discussions.R: -------------------------------------------------------------------------------- 1 | #http://movie.douban.com/subject/5308265/discussion/ 2 | ##x<-get_movie_discussions(movieid=5308265,results = 100) 3 | .get_movie_discussion0<-function(u,fresh,verbose){ 4 | p<-.refreshURL(u,fresh, verbose) 5 | title<-gsub('[\n ]','',sapply(getNodeSet(p, '//head//title'),xmlValue)) 6 | published<-sapply(getNodeSet(p, '//div[@class="article"]//span[@class="mn"]'),xmlValue) 7 | published<-gsub("\n| ","", published) 8 | n1<-getNodeSet(p, '//div[@class="article"]//span[@class="pl2"]//a') 9 | 
author<-gsub("[\n ]","",sapply(n1,xmlValue)[1]) 10 | author_uri<-sapply(n1,function(x) xmlGetAttr(x, "href"))[1] 11 | dicussion<-sapply(getNodeSet(p, '//div[@class="article"]//span[@class=""]')[1],xmlValue) 12 | useful<-sapply(getNodeSet(p, '//span[@class="useful"]//em'),xmlValue) 13 | unuseful<-sapply(getNodeSet(p, '//span[@class="unuseful"]//em'),xmlValue) 14 | out<-c(dicussion_uri=u,title=title,published=published,author=author, 15 | author_uri=author_uri,dicussion=dicussion,useful=useful,unuseful=unuseful) 16 | return(out) 17 | } 18 | ################################################# 19 | get_movie_discussions<-function(movieid,results = 100, fresh = 10,count=20, verbose = TRUE,...){ 20 | u = paste0("http://movie.douban.com/subject/", movieid, "/discussion/") 21 | p <- .refreshURL(u, fresh, verbose) 22 | total<-gsub('[^0-9]','',sapply(getNodeSet(p, '//span[@class="count"]'),xmlValue)) 23 | if (length(total)==0) 24 | stop('There is no discussions about this movie.') 25 | cat('-----There is a tatal of ',total,' movie discussions.-----\n') 26 | pages<-ceiling(min(results,as.integer(total))/count) 27 | out <- data.frame(matrix(nrow = pages * count, ncol = 8), stringsAsFactors = F) 28 | colnames(out) <- c("dicussion_uri", "title", "published", "author", "author_uri", 29 | "dicussion", "useful", "unuseful") 30 | k=1 31 | if(pages>0){ 32 | for(pg in 1:pages){ 33 | u=paste0('http://movie.douban.com/subject/',movieid, 34 | '/discussion/?start=',(pg-1)*count,'&sort=vote/') 35 | if(verbose==TRUE) { 36 | #cat('Getting',(pg-1)*20+1,'--',pg*20,'discussions...\n') 37 | cat("Getting movie discussion URLS from page=",pg,": ",u,"...\n") 38 | } 39 | p <- .refreshURL(u, fresh, verbose) 40 | n1<-getNodeSet(p ,'//table[@class="olt"]//td/a') 41 | href<-unique(sapply(n1,function(x) xmlGetAttr(x, "href"))) 42 | href<-href[grep('/discussion/',href)] 43 | href <- href[!href %in% out$dicussion_uri] 44 | n=length(href) 45 | if(n>0){ 46 | for(i in 1:n){ 47 | u0<-href[i] 48 | 
if(verbose==TRUE){ 49 | cat(" Getting ", k, " movie discussion from URL: ", u0, " ...\n") 50 | } 51 | out0<-.get_movie_discussion0(u=u0,fresh,verbose) 52 | if(length(out0)==8){ 53 | out[k,]<-out0 54 | k=k+1 55 | } 56 | else{ 57 | cat(" !!!! Getting failed at URL: ", u0, " \n") 58 | } 59 | } 60 | } 61 | } 62 | } 63 | out <- out[!is.na(out[, 1]), ] 64 | return(out) 65 | } -------------------------------------------------------------------------------- /R/get.book.review.R: -------------------------------------------------------------------------------- 1 | get.book.review<-function(bookid,results=100,verbose=TRUE){ 2 | 3 | .get_review<-function(bookid,start,results,verbose){ 4 | u=paste0("http://api.douban.com/book/subject/",bookid, 5 | "/reviews?start-index=",start,"&max-results=",results) 6 | p=getURL(u) 7 | pagetree <- htmlParse(p,asTree=F) 8 | 9 | review_id<-sapply(getNodeSet(pagetree, '//entry//id'), xmlValue) 10 | review_id<-gsub("[^0-9]","",review_id) 11 | #sapply(getNodeSet(pagetree, '//entry//link[@rel="alternate"]'),function(x) xmlGetAttr(x, "href")) 12 | title<-sapply(getNodeSet(pagetree, '//entry//title'), xmlValue) 13 | Encoding(title)<-"UTF-8" 14 | 15 | author<-sapply(getNodeSet(pagetree, '//entry//author//name'), xmlValue) 16 | Encoding(author)<-"UTF-8" 17 | author_id<-sapply(getNodeSet(pagetree, '//entry//author//uri'), xmlValue) 18 | author_id<-gsub("[^0-9]","",author_id) 19 | published<-gsub("T|\\+08:00"," ",sapply(getNodeSet(pagetree, '//entry//published'), xmlValue)) 20 | updated<-gsub("T|\\+08:00"," ",sapply(getNodeSet(pagetree, '//entry//updated'), xmlValue)) 21 | comments<-sapply(getNodeSet(pagetree, '//entry//comments'),function(x) xmlGetAttr(x, "value")) 22 | useless<-sapply(getNodeSet(pagetree, '//entry//useless'),function(x) xmlGetAttr(x, "value")) 23 | votes<-sapply(getNodeSet(pagetree, '//entry//votes'),function(x) xmlGetAttr(x, "value")) 24 | #rating<-sapply(getNodeSet(pagetree, '//entry//rating'),function(x) xmlGetAttr(x, "value")) 25 | 
n=length(review_id) 26 | review<-vector(length=n) 27 | rating<-vector(length=n) 28 | for(i in 1:n){ 29 | u<-paste0("http://book.douban.com/review/",review_id[i],"/") 30 | if(verbose==TRUE){ 31 | cat(" Getting review form: ",i," ",u," ..... \n") 32 | } 33 | p=getURL(u) 34 | p <- htmlParse(p,asTree=F) 35 | review0<-sapply(getNodeSet(p, '//span[@property="v:description"]'), xmlValue) 36 | review0<-gsub("\r","",review0) 37 | rating[i]<-sapply(getNodeSet(p, '//span[@property="v:rating"]'), xmlValue) 38 | 39 | if(nchar(review0)<2)review0<-NA 40 | review[i]<-review0 41 | } 42 | out<-data.frame(title=title,review_id=review_id,review=review,author=author, 43 | author_uri=author_id,published=published,updated=updated, 44 | comments=comments,useless=useless,votes=votes, 45 | rating=rating,stringsAsFactors=F) 46 | return(out) 47 | } 48 | #################################### 49 | u=paste0("http://api.douban.com/book/subject/",bookid, 50 | "/reviews?start-index=1&max-results=1") 51 | p=htmlParse(getURL(u)) 52 | totalresults<-sapply(getNodeSet(p, '//totalresults'), xmlValue) 53 | cat("There are a total of ",totalresults," reviews.\n") 54 | results<-min(results,as.integer(totalresults)) 55 | starts<-(0:floor(results/50))*50+1 56 | n=length(starts) 57 | ends<-rep(50,n) 58 | ends[n]<-results-50*(n-1) 59 | if(results%%50==0){ 60 | starts<-starts[1:(n-1)] 61 | ends<-ends[1:(n-1)] 62 | } 63 | n=length(starts) 64 | out<-c() 65 | for(i in 1:n ){ 66 | cat('Getting page: ',i,"\n") 67 | out0<-.get_review(bookid,start=starts[i],results=ends[i],verbose) 68 | out<-rbind(out0,out) 69 | } 70 | 71 | return(out) 72 | } 73 | -------------------------------------------------------------------------------- /R/get_music_reviews.R: -------------------------------------------------------------------------------- 1 | ##http://music.douban.com/subject/3843530/reviews?sort=time 2 | #x<-get_music_reviews(musicid=3843530,results = 100,verbosr=T) 3 | .get_music_review0 <- function(u, fresh = 10, verbose = 
TRUE, ...) { 4 | p <- .refreshURL(u = u, fresh, verbose) 5 | ## review_id review_id<-gsub('[^0-9]','',u) title 6 | title <- sapply(getNodeSet(p, '//head//title'), xmlValue) 7 | ## 发表时间 8 | published <- sapply(getNodeSet(p, "//span[@property=\"v:dtreviewed\"]"), xmlValue) 9 | ## 作者,URI 10 | author <- sapply(getNodeSet(p, '//span[@property="v:reviewer"]'), xmlValue) 11 | author_uri <- sapply(getNodeSet(p, '//div[@id="content"]//span[@class="pl2"]//a'), 12 | function(x) xmlGetAttr(x, "href"))[1] 13 | ## 评分 14 | rating <- sapply(getNodeSet(p, "//span[@property=\"v:rating\"]"), xmlValue) 15 | ## 影评内容 16 | review <- sapply(getNodeSet(p, "//span[@property=\"v:description\"]"), xmlValue) 17 | if (length(review) == 0) { 18 | review <- sapply(getNodeSet(p, "//div[@property=\"v:description\"]"), xmlValue) 19 | } 20 | #review <- gsub("\r", "", review) 21 | ## 有用 & 没用的次数 22 | x0 <- sapply(getNodeSet(p, "//div[@class=\"main-panel-useful\"]//em"), xmlValue) 23 | useful = x0[1] 24 | unuseful = x0[2] 25 | if (length(x0) == 0 ) { 26 | useful <- sapply(getNodeSet(p, "//span[@class=\"useful\"]//em"), xmlValue) 27 | unuseful <- sapply(getNodeSet(p, "//span[@class=\"unuseful\"]//em"), xmlValue) 28 | } 29 | out <- c(review_uri = u, title = title, published = published, author = author, author_uri = author_uri, 30 | review = review, rating = rating, useful = useful, unuseful = unuseful) 31 | return(out) 32 | } 33 | ########################################################################### 34 | get_music_reviews<-function(musicid,results = 100, fresh = 10,count=25, verbose = TRUE,...){ 35 | u = paste0('http://music.douban.com/subject/',musicid,'/reviews?sort=time') 36 | p <- .refreshURL(u, fresh, verbose) 37 | 38 | total<-gsub('[^0-9]','',sapply(getNodeSet(p, '//span[@class="count"]'),xmlValue)) 39 | if (length(total)==0) 40 | stop('There is no reviews about this music.') 41 | cat('-----There is a tatal of ',total,' music reviews.-----\n') 42 | 
pages<-ceiling(min(results,as.integer(total))/count) 43 | out <- data.frame(matrix(nrow = pages * count, ncol = 9), stringsAsFactors = F) 44 | colnames(out) <- c("review_uri", "title", "published", "author", "author_uri", "review", 45 | "rating", "useful", "unuseful") 46 | ## out nrow index 47 | k = 1 48 | if(pages>0){ 49 | for (pg in 1:pages) { 50 | u=paste0('http://music.douban.com/subject/',musicid,'/reviews?sort=time&start=', 51 | (pg-1)*count,"&filter=&limit=", count) 52 | if(verbose==TRUE){ 53 | cat(' Getting music comment URLS from page=',pg-1,': ',u,'...\n') 54 | } 55 | p <- .refreshURL(u, fresh, verbose) 56 | n1 <- getNodeSet(p, '//div[@id="content"]//div[@class="article"]//a[@title]') 57 | href <- unique(sapply(n1, function(x) xmlGetAttr(x, "href"))) 58 | href <- href[grep("/review/", href)] 59 | href <- href[!href %in% out$review_uri] 60 | n = length(href) 61 | if (n > 0) { 62 | for (i in 1:n) { 63 | u0 = href[i] 64 | if (verbose == TRUE) { 65 | cat(" %%%% Getting ", k, " music review from URL: ", u0, " ...\n") 66 | } 67 | out0 <- tryCatch(.get_music_review0(u = u0, fresh, verbose), error = function(e) { 68 | NULL 69 | }) 70 | if (length(out0) == 9) { 71 | out[k, ] <- out0 72 | k = k + 1 73 | } 74 | if (length(out0) != 9) { 75 | cat(" !!!! Getting failed at URL: ", u0, " \n") 76 | } 77 | } 78 | } 79 | } 80 | } 81 | out <- out[!is.na(out[, 1]), ] 82 | return(out) 83 | } 84 | -------------------------------------------------------------------------------- /R/get_movie_reviews.R: -------------------------------------------------------------------------------- 1 | ##movieid=5308265 2 | ## m=get_movie_reviews(movieid=5308265,results=50) 3 | ######################################################## 4 | ### 获取每篇评论的信息 5 | .get_movie_review0 <- function(u, fresh = 10, verbose = TRUE, ...) 
{ 6 | p <- .refreshURL(u = u, fresh, verbose) 7 | ## review_id review_id<-gsub('[^0-9]','',u) title 8 | title <- sapply(getNodeSet(p, "//div[@id=\"content\"]//span[@property=\"v:summary\"]"), xmlValue) 9 | ## 发表时间 10 | published <- sapply(getNodeSet(p, "//span[@property=\"v:dtreviewed\"]"), xmlValue) 11 | ## 作者,URI 12 | author <- sapply(getNodeSet(p, "//span[@property=\"v:reviewer\"]"), xmlValue) 13 | author_uri <- sapply(getNodeSet(p, "//div[@class=\"main-hd\"]//a[@href]"), function(x) xmlGetAttr(x, 14 | "href"))[1] 15 | ## 评分 16 | rating <- sapply(getNodeSet(p, "//span[@property=\"v:rating\"]"), xmlValue) 17 | ## 影评内容 18 | review <- sapply(getNodeSet(p, "//span[@property=\"v:description\"]"), xmlValue) 19 | if (length(review) == 0) { 20 | review <- sapply(getNodeSet(p, "//div[@property=\"v:description\"]"), xmlValue) 21 | } 22 | #review <- gsub("\r", "", review) 23 | ## 有用 & 没用的次数 24 | x0 <- sapply(getNodeSet(p, "//div[@class=\"main-panel-useful\"]//em"), xmlValue) 25 | useful = x0[1] 26 | unuseful = x0[2] 27 | if (length(useful) == 0 | length(unuseful) == 0) { 28 | useful <- sapply(getNodeSet(p, "//span[@class=\"useful\"]//em"), xmlValue) 29 | unuseful <- sapply(getNodeSet(p, "//span[@class=\"unuseful\"]//em"), xmlValue) 30 | } 31 | out <- c(review_uri = u, title = title, published = published, author = author, author_uri = author_uri, 32 | review = review, rating = rating, useful = useful, unuseful = unuseful) 33 | return(out) 34 | } 35 | ########################################################################### 36 | #### 37 | get_movie_reviews <- function(movieid, results = 100, fresh = 10, count = 10, verbose = TRUE, 38 | ...) 
{ 39 | u = paste0("http://movie.douban.com/subject/", movieid, "/reviews") 40 | p <- .refreshURL(u, fresh, verbose) 41 | ## 总评论数 42 | total <- sapply(getNodeSet(p, "//head//title"), xmlValue) 43 | total <- unlist(strsplit(total, " "))[-1] 44 | total <- as.integer(gsub("[^0-9]", "", total[length(total)])) 45 | 46 | cat("--------------There is a total of", total, "reviews.------------\n") 47 | 48 | pages <- ceiling(min(results, total) * 2/count) 49 | ## 预定义输出dataFrame大小 50 | out <- data.frame(matrix(nrow = pages * count, ncol = 9), stringsAsFactors = F) 51 | colnames(out) <- c("review_uri", "title", "published", "author", "author_uri", "review", 52 | "rating", "useful", "unuseful") 53 | ## out nrow index 54 | k = 1 55 | 56 | for (pg in 1:pages) { 57 | u = paste0("http://movie.douban.com/subject/", movieid, "/reviews?start=", (pg - 1) * 58 | count, "&filter=&limit=", count) 59 | # cat('Getting',(pg-1)*count+1,'--',pg*count,'reviews:',u,' ...\n') 60 | cat("Getting review URLs from page ", pg, ": ", u, " ...\n") 61 | p <- .refreshURL(u, fresh, verbose) 62 | n1 <- getNodeSet(p, "//div[@class=\"review\"]//div[@class=\"review-hd\"]//a[@title]") 63 | href <- unique(sapply(n1, function(x) xmlGetAttr(x, "href"))) 64 | href <- href[grep("/review/", href)] 65 | href <- href[!href %in% out$review_uri] 66 | n = length(href) 67 | if (n > 0) { 68 | for (i in 1:n) { 69 | u0 = href[i] 70 | 71 | if (verbose == TRUE) { 72 | cat(" %%%% Getting ", k, " movie review from URL: ", u0, " ...\n") 73 | } 74 | out0 <- tryCatch(.get_movie_review0(u = u0, fresh, verbose), error = function(e) { 75 | NULL 76 | }) 77 | if (length(out0) == 9) { 78 | out[k, ] <- out0 79 | k = k + 1 80 | } 81 | if (length(out0) != 9) { 82 | cat(" !!!! 
Getting failed at URL: ", u0, " \n") 83 | } 84 | } 85 | } 86 | 87 | } 88 | out <- out[!is.na(out[, 1]), ] 89 | return(out) 90 | } 91 | 92 | 93 | 94 | 95 | 96 | -------------------------------------------------------------------------------- /R/get.movie.review.R: -------------------------------------------------------------------------------- 1 | 2 | ################################################## 3 | ## @u:url 4 | ## @fresh:refresh number 5 | .get_movie_review1<-function(u,fresh=10,verbose=TRUE){ 6 | p<- tryCatch(.refreshForm(u,fresh,verbose),error = function(e){NULL}) 7 | if(is.null(p)){ 8 | warning("Getting failed:",u,".\n") 9 | out<-NULL 10 | return(out) 11 | } 12 | if(!is.null(p)){ 13 | ##标题 14 | title<-sapply(getNodeSet(p, '//head//title'), xmlValue) 15 | #sapply(getNodeSet(p, '//body//div[@class="itm"]//span'), xmlValue) 16 | ##作者昵称及URI 17 | author<-sapply(getNodeSet(p, '//body//div[@class="itm"]//a[@class="founder"]'), xmlValue) 18 | author_uri<-sapply(getNodeSet(p, '//body//div[@class="itm"]//a[@class="founder"]'), 19 | function(x) xmlGetAttr(x, "href")) 20 | author_uri<-paste0("http://m.douban.com",author_uri) 21 | ##推荐次数 22 | votes<-sapply(getNodeSet(p, '//body//span[@class="forbidden"]'), xmlValue) 23 | votes<-as.integer(gsub("[^0-9]","",votes)) 24 | #### 25 | st<-sapply(getNodeSet(p, '//body//div[@class="itm"]//span'), xmlValue) 26 | ##发表时间 27 | published<-st[grep("\\d{4}\\-[0-1]",st)] 28 | ##评分 29 | rating<-gsub("[^0-9]","",st[grep("\\d{4}\\-[0-1]",st)-1]) 30 | ####本评论的显示页数 31 | pgs<-sapply(getNodeSet(p, '//body//div[@class="paginator"]//span'), xmlValue) 32 | pgs<-as.integer(gsub("1/| ","",pgs)) 33 | #rev<-vector(length=pgs) 34 | rev<-NULL 35 | ##评论内容 36 | rev0<-paste(sapply(getNodeSet(p, '//body//div//p'), xmlValue),collapse="") 37 | pa<-'\\(\u8f6c\u4e0b\u9875\\)' ### \\(转下页\\) 38 | rev[1]<-gsub(pa,"",rev0) 39 | preU<-unlist(strsplit(u,"\\?id="))[1] 40 | review_id<-gsub("[^0-9]","",preU) 41 | ##分页读取评论内容 42 | if(pgs>1){ 43 | for(pg in 2:pgs){ 44 | 
u0<-paste0(preU,"?page=",pg) 45 | #p<-tryCatch(htmlParse(postForm(u0)),error = function(e){NULL}) 46 | p<- tryCatch(.refreshForm(u0,fresh,verbose),error = function(e){NULL}) 47 | 48 | if(!is.null(p)){ 49 | rev0<-paste(sapply(getNodeSet(p, '//body//div//p'), xmlValue),collapse="") 50 | rev[pg]<-gsub(pa,"",rev0) 51 | } 52 | } 53 | } 54 | review<-paste(rev,collapse="") 55 | out<-c(review_id=review_id,title=title,author=author,author_uri=author_uri, 56 | review=review,published=published,votes=votes,rating=rating) 57 | #cat("---length:",length(out),"\n", 58 | # out[c("title","author","author_uri", "published","votes","rating")],'\n') 59 | return(out) 60 | } 61 | 62 | } 63 | ##################################################################### 64 | ## movieid:movie id of douban 65 | ## results:get the number of reviews 66 | get.movie.review<-function(movieid,results=100,fresh=20,verbose=TRUE){ 67 | u=paste0("http://m.douban.com/movie/subject/",movieid,"/reviews") 68 | p<-htmlParse(postForm(u)) 69 | ## 评论总数 70 | total<-sapply(getNodeSet(p, '//div[@class="title"]//span'), xmlValue) 71 | total<-as.integer(gsub("[^0-9]","",total)) 72 | cat("----------There are a total of ",total," reviews.----------\n") 73 | results<-min(total,results) 74 | ## 评论屏数 75 | pages<-ceiling(results/40) 76 | 77 | #n1<-getNodeSet(p, '//div[@class="list"]//div[@class="item"]//a') 78 | #session<-sapply(n1,function(x) xmlGetAttr(x, "href"))[1] 79 | #session<-unlist(strsplit(session,"=")) 80 | #session<-session[length(session)] 81 | ##预定义输出大小 82 | out<-data.frame(matrix(nrow=pages*40,ncol=8),stringsAsFactors=F) 83 | colnames(out)<-c("review_id","title","author","author_uri","review", 84 | "published","votes","rating") 85 | ## k 为out行标 86 | k=1 87 | er=1 88 | for(i in 1:pages){ 89 | #http://m.douban.com/movie/subject/3530403/reviews?page=1&session=368f6186 90 | #u=paste0("http://m.douban.com/movie/subject/",movieid, 91 | # "/reviews?page=",i,"&session=",session) 92 | 
u=paste0("http://m.douban.com/movie/subject/",movieid, 93 | "/reviews?page=",i) 94 | if(verbose==TRUE){ 95 | cat("Retrieving URLs of page ",i, " of reviews from: ",u," .....\n") 96 | } 97 | #p<-tryCatch(htmlParse(postForm(u)),error = function(e){NULL}) 98 | p<- tryCatch(.refreshForm(u,fresh,verbose),error = function(e){NULL}) 99 | if(is.null(p)){ 100 | cat("!!!!!! Host Forbidden:",u,' .\n') 101 | next 102 | } 103 | n1<-getNodeSet(p, '//div[@class="list"]//div[@class="item"]//a') 104 | href<-sapply(n1,function(x) xmlGetAttr(x, "href")) 105 | href<-href[grep("/movie/review/",href)] 106 | n=length(href) 107 | for(j in 1:n){ 108 | u0=paste0("http://m.douban.com",href[j]) 109 | if(verbose==TRUE){ 110 | cat(" Retrieving ",k, "reviews from: ",u0," .....\n") 111 | } 112 | ### 无法打开页面返回NULL,通常是网络异常,Forbidden 113 | out0<-tryCatch(.get_movie_review1(u=u0,fresh,verbose),error = function(e){NULL}) 114 | if(length(out0)<8){ 115 | cat(er," ------Getting failed:",u0,".\n") 116 | #write(out0,file=paste0(er,".txt")) 117 | er=er+1 118 | } 119 | if(length(out0)==8){ 120 | out[k,]<-out0 121 | k=k+1 122 | } 123 | 124 | } 125 | } 126 | out<-out[!is.na(out[,1]),] 127 | return(out) 128 | } 129 | -------------------------------------------------------------------------------- /R/get_book_info.R: -------------------------------------------------------------------------------- 1 | 2 | get_book_info<-function(bookid,...){ 3 | u=paste0('http://book.douban.com/subject/',bookid,'/') 4 | p<-.refreshURL(u) 5 | ###title 6 | title<-sapply(getNodeSet(p, '//span[@property="v:itemreviewed"]'), xmlValue) 7 | ## author ISBN ... 
8 | author<-gsub('[\n ]','', 9 | sapply(getNodeSet(p, '//div[@id="info"]//span//a'), xmlValue)) 10 | author<-paste0(author,collapse=' ') 11 | att1<-sapply(getNodeSet(p, '//div[@id="info"]//span[@class="pl"]'), xmlValue) 12 | att1<-gsub(' ','',att1) 13 | att2<-gsub('\n','',sapply(getNodeSet(p, '//div[@id="info"]'), xmlValue)) 14 | attribute<-list() 15 | ind=which(att1=='\u4f5c\u8005') 16 | a1=unlist(strsplit(att2,paste0(att1[ind],'|',att1[ind+1]))) 17 | 18 | attribute$author<-a1[2] 19 | ind=which(att1=='\u51fa\u7248\u793e:') 20 | a1=unlist(strsplit(att2,paste0(att1[ind],'|',att1[ind+1]))) 21 | attribute$publisher<-a1[2] 22 | ind=which(att1=='\u51fa\u7248\u5e74:') 23 | a1=unlist(strsplit(att2,paste0(att1[ind],'|',att1[ind+1]))) 24 | attribute$pubdate<-a1[2] 25 | ind=which(att1=='\u9875\u6570:') 26 | a1=unlist(strsplit(att2,paste0(att1[ind],'|',att1[ind+1]))) 27 | attribute$pages<-a1[2] 28 | ind=which(att1=='\u5b9a\u4ef7:') 29 | a1=unlist(strsplit(att2,paste0(att1[ind],'|',att1[ind+1]))) 30 | attribute$price<-gsub(' ','',a1[2]) 31 | ind=which(att1=='\u88c5\u5e27:') 32 | a1=unlist(strsplit(att2,paste0(att1[ind],'|',att1[ind+1]))) 33 | attribute$binding<-gsub(' ','',a1[2]) 34 | 35 | ind=which(att1=='\u526f\u6807\u9898:') 36 | if(length(ind)>0){ 37 | a1=unlist(strsplit(att2,paste0(att1[ind],'|',att1[ind+1]))) 38 | attribute$subtitle<-gsub(' ','',a1[2]) 39 | } 40 | ind=which(att1=='\u8bd1\u8005') 41 | if(length(ind)>0){ 42 | a1=unlist(strsplit(att2,paste0(att1[ind],'|',att1[ind+1]))) 43 | attribute$translator<-gsub(' |:','',a1[2]) 44 | } 45 | ind=which(att1=='\u539f\u4f5c\u540d:') 46 | if(length(ind)>0){ 47 | a1=unlist(strsplit(att2,paste0(att1[ind],'|',att1[ind+1]))) 48 | attribute$origin_title<-gsub(':| ','',a1[2]) 49 | } 50 | ## rating 51 | score<-as.numeric(gsub('[\n ]','', 52 | sapply(getNodeSet(p, '//strong[@property="v:average"]'), xmlValue))) 53 | votes<-as.integer(sapply(getNodeSet(p, '//span[@property="v:votes"]'), xmlValue)) 54 | if(length(votes)==0)votes<-NA 
55 | rating<-sapply(getNodeSet(p, '//div[@id="interest_sectl"]//div'), xmlValue) 56 | rating<-gsub('[\n%]|\\([^\\(\\)]*\\)','',rating) 57 | rating<-unlist(strsplit(rating,' ')) 58 | rating<-rating[nchar(rating)>0] 59 | rating<-as.numeric(rating[nchar(rating)>0])[-1]/100 60 | if(length(rating)<5)rating<-c(rating,rep(NA,5-length(rating))) 61 | rating<-c(score,votes,rating) 62 | names(rating)<-c('average','votes','stars5','stars4','stars3','stars2','stars1') 63 | 64 | medium<-sapply(getNodeSet(p, '//div[@class="indent"]//div[@id="mainpic"]//a'), 65 | function(x) xmlGetAttr(x, "href")) 66 | large<-sapply(getNodeSet(p, '//div[@class="indent"]//div[@id="mainpic"]//img'), 67 | function(x) xmlGetAttr(x, "src")) 68 | image<-c(medium =medium ,large=large) 69 | ##content introduction,author introduction 70 | contentinfo<-gsub('[\n ]','',sapply(getNodeSet(p, '//div[@class="intro"]'), 71 | xmlValue)) 72 | if(length(contentinfo)>2){ 73 | pa<-"" ##...\\(展开全部\\) 74 | contentinfo<-contentinfo[-grep("...\\(\u5c55\u5f00\u5168\u90e8\\)",contentinfo)] 75 | } 76 | 77 | #clen=length(contentinfo) 78 | summary<-contentinfo[1] 79 | author_intro<-contentinfo[2] 80 | 81 | ##labels 82 | #labels_amount <- gsub('[^0-9]','',sapply( 83 | # getNodeSet(p, '//div[@id="db-tags-section"]//h2'),xmlValue)) 84 | labelinfo<-gsub('\n','', 85 | sapply(getNodeSet(p, '//div[@id="db-tags-section"]//div'), xmlValue)) 86 | labelinfo<-unlist(strsplit(labelinfo,' ')) 87 | #labelinfo<-gsub('<[^<>]*>|\n','',labelinfo) 88 | labelinfo<-labelinfo[nchar(labelinfo)>1] 89 | 90 | .label_tran<-function(x){ 91 | x=strsplit(x,'[\\(\\)]') 92 | n=length(x) 93 | tag_label<-tag_freq<-c() 94 | for(i in 1:n){ 95 | temp<-x[[i]] 96 | temp<-temp[nchar(temp)>0] 97 | tag_label[i]<-temp[length(temp)-1] 98 | tag_freq[i]<-temp[length(temp)] 99 | } 100 | data.frame(tag_label,tag_freq,stringsAsFactors=F) 101 | } 102 | tags<-.label_tran(labelinfo) 103 | 104 | ## reader info 105 | readnode <- getNodeSet(p, 
'//div[@class="indent"]//p[@class="pl"]//a') 106 | readers<-as.integer(gsub('[^0-9]','',sapply(readnode, xmlValue))) 107 | if(length(readers)<3)readers<-c(readers,rep(NA,3-length(readers))) 108 | names(readers)<-c('doings','collections','wishes') 109 | ## 110 | comments_total<-gsub('[^0-9]','', 111 | sapply(getNodeSet(p, '//div[@id="reviews"]//h2'), xmlValue)) 112 | notes_total<-gsub('[^0-9]','',sapply(getNodeSet(p, '//div[@class="hd"]'), xmlValue)) 113 | attribute$author_intro<-author_intro 114 | list(title=title, 115 | author=author, 116 | rating=rating, 117 | href=u, 118 | summary=summary, 119 | author_intro=author_intro, 120 | image=image, 121 | tags=tags, 122 | comments_total=comments_total, 123 | notes_total=notes_total, 124 | readers=readers, 125 | attribute=attribute) 126 | } 127 | -------------------------------------------------------------------------------- /R/user_movie_status.R: -------------------------------------------------------------------------------- 1 | ## extract some infomation about a movie 2 | .extract_movie<-function(x){ 3 | b=unlist(xmlToList(x)) 4 | Encoding(b)<-"UTF-8" 5 | movieid<-gsub("[^0-9]","",b["div.a..attrs.href"]) 6 | user_rating<-b[b %in% c("rating1-t","rating2-t", 7 | "rating3-t","rating4-t","rating5-t")] 8 | user_rating<-gsub("[^0-9]","",user_rating) 9 | if(length(user_rating)==0){user_rating<-NA} 10 | 11 | watching_date<-b[which(b=="date")-1] 12 | #if(length(watching_date)==0)watching_date<-NA 13 | tagg<-"\u6807\u7b7e: " ;Encoding(tagg)<-"UTF-8" ##标签: 14 | user_tags<-gsub(tagg,"",b[which(b=="tags")-1]) 15 | if(length(user_tags)==0){user_tags<-NA} 16 | comment<-gsub("[\n ]","",b[which(b=="comment")-1]) 17 | if(length(comment)==0){comment<-NA} 18 | 19 | tmp<-get.movie.info(movieid) 20 | title<-tmp$title 21 | summary<-tmp$summary 22 | author<-paste(tmp$author,collapse=",") 23 | average_rating<-tmp$rating[2] 24 | image<-tmp$image 25 | movie_type<-paste(tmp$attribute$movie_type,collapse=' ') 26 | year<-tmp$attribute$year 27 | 
cast<-paste(tmp$attribute$cast,collapse=',') 28 | tags<-paste(tmp$tags[,1],collapse=' ') 29 | duration<-tmp$attribute$movie_duration[1] 30 | if(length(duration)==0){duration<-NA} 31 | out<-c(movieid,title,author,user_rating,user_tags, 32 | comment,summary,average_rating,image,cast,tags,duration) 33 | names(out)<-NULL 34 | return(out) 35 | } 36 | ######################################################## 37 | ##coll<-.user_movie_what0(userid="",count=15,what="collect") 38 | ##@what:do,collect,wish 39 | .user_movie_what0<-function(userid,count=15,what="collect",verbose=TRUE){ 40 | u<-paste0('http://movie.douban.com/people/',userid,'/',what) 41 | p<-.refreshURL(u) 42 | tag_label<-sapply(getNodeSet(p, '//ul[@class="tag-list mb10"]//a'),xmlValue) 43 | tag_freq<-as.integer(sapply(getNodeSet(p, '//ul[@class="tag-list mb10"]//span'), 44 | xmlValue)) 45 | tags<-data.frame(tag_label=tag_label, 46 | tag_freq=tag_freq, stringsAsFactors=F) 47 | ##### 48 | total<-sapply(getNodeSet(p, '//head//title'),xmlValue) 49 | total<-substr(total,regexpr("\\([^)]+\\)",total),nchar(total)) 50 | total<-gsub("[^0-9]","",total) 51 | cat("\n--------There is a total of ",total," ",what,"movies.----------\n") 52 | pages<-ceiling(as.integer(total)/count) 53 | df<-data.frame(matrix(nrow=pages*count,ncol=13),stringsAsFactors=F) 54 | colnames(df)<-c("movieid","title","author","user_rating","user_tags", 55 | "comment","summary","average_rating","image", 56 | "cast","tags","duration","watching_date") 57 | k=1 58 | if(pages>0){ 59 | for(pg in 1:pages){ 60 | u<-paste0('http://movie.douban.com/people/',userid,'/',what, 61 | "?start=",(pg-1)*15,"&sort=time&rating=all&filter=all&mode=grid") 62 | if(verbose==TRUE){ 63 | cat("Getting ",what," movies infomation from:",u,"...\n") 64 | } 65 | p<-.refreshURL(u) 66 | node<-getNodeSet(p, '//div[@class="item"]') 67 | watching_date<-sapply(getNodeSet(p, '//li//span[@class="date"]'),xmlValue) 68 | 69 | df0<-t(sapply(node,.extract_movie)) 70 | 
df0<-cbind(df0,watching_date) 71 | nr=nrow(df0) 72 | df[k:(k+nr-1),]<-df0 73 | k=k+nr 74 | } 75 | } 76 | df<- df[!is.na(df[, 1]), ] 77 | return(list(tags=tags,df=df)) 78 | } 79 | 80 | ######################################## 81 | ## 获取所有发表过的影评 82 | .user_movie.review0<-function(userid,count=5,verbose=TRUE){ 83 | u<-paste0("http://movie.douban.com/people/",userid,"/reviews") 84 | p<-.refreshURL(u) 85 | total<-sapply(getNodeSet(p, '//head//title'),xmlValue) 86 | total<-substr(total,regexpr("\\([^)]+\\)",total),nchar(total)) 87 | total<-as.integer(gsub("[^0-9]","",total)) 88 | cat("\n--------There is a total of ",total,"reviews.--------\n") 89 | pages<-ceiling(total/count) 90 | out <- data.frame(matrix(nrow = pages * count, ncol = 9), 91 | stringsAsFactors = F) 92 | colnames(out) <- c("review_uri", "title", "published", "author", 93 | "author_uri", "review", "rating", "useful", "unuseful") 94 | k = 1 95 | if(pages>0){ 96 | for(pg in 1:pages){ 97 | u = paste0("http://movie.douban.com/people/", userid, 98 | "/reviews?start=", (pg - 1) * count) 99 | cat("Getting review URLs from page ", pg, ": ", u, " ...\n") 100 | p <- .refreshURL(u, verbose) 101 | href <- sapply(getNodeSet(p, '//li[@class="nlst"]//a'), function(x) xmlGetAttr(x, "href")) 102 | href <- unique(href[grep("/review/", href)]) 103 | href <- href[!href %in% out$review_uri] 104 | n <- length(href) 105 | if (n > 0) { 106 | for (i in 1:n) { 107 | u0 <- href[i] 108 | if (verbose == TRUE) { 109 | cat(" Getting ", k, " movie review from URL: ", u0, " ...\n") 110 | } 111 | out0 <- .get_movie_review0(u = u0, verbose) 112 | if (length(out0) == 9) { 113 | out[k, ] <- out0 114 | k <- k + 1 115 | } 116 | else { 117 | cat(" !!!! 
Getting failed at URL: ", u0," \n") 118 | } 119 | } 120 | } 121 | } 122 | } 123 | out <- out[!is.na(out[, 1]), ] 124 | return(out) 125 | } 126 | 127 | ############################################## 128 | ##@userid 129 | ##@front 130 | user_movie_status<-function(userid,verbose=TRUE,front=TRUE){ 131 | cat("--------Retrieving information about read movies.--------\n") 132 | tmp<-.user_movie_what0(userid,count=15,what="collect",verbose) 133 | collect_tags<-tmp$tags 134 | collect_df<-tmp$df 135 | cat("--------Retrieving information about reading movies.--------\n") 136 | tmp<-.user_movie_what0(userid,count=15,what="do",verbose) 137 | do_tags<-tmp$tags 138 | do_df<-tmp$df 139 | cat("--------Retrieving information about the movies you want to read.--------\n") 140 | tmp<-.user_movie_what0(userid,count=15,what="wish",verbose) 141 | wish_tags<-tmp$tags 142 | wish_df<-tmp$df 143 | cat("--------Retrieving your published movie reviews. --------\n") 144 | reviews<-.user_movie.review0(userid,verbose) 145 | 146 | collect_images<-list() 147 | if(front==TRUE){ 148 | if(!require(EBImage)){ 149 | source("http://bioconductor.org/biocLite.R") 150 | biocLite("EBImage") 151 | require(EBImage) 152 | } 153 | ##cat("--------正在获取已观电影的海报图片--------...\n") 154 | cat("--------\u6b63\u5728\u83b7\u53d6\u5df2\u89c2\u7535\u5f71\u7684\u6d77\u62a5\u56fe\u7247--------...\n") 155 | images<-collect_df$image 156 | m<-length(images) 157 | for(i in 1:m){ 158 | collect_images[[i]]<-resize(readImage(images[i]),w=60,h=80) 159 | } 160 | } 161 | list(collect_tags=collect_tags, 162 | collect_df=collect_df, 163 | do_tags=do_tags, 164 | do_df=do_df, 165 | wish_tags=wish_tags, 166 | wish_df=wish_df, 167 | reviews=reviews, 168 | collect_images=collect_images) 169 | } 170 | -------------------------------------------------------------------------------- /R/user_book_status.R: -------------------------------------------------------------------------------- 1 | ## extract some infomation about a book 2 | 
## Extract summary information for one book item of a user's list page.
## Returns an unnamed character vector of 13 fields:
## bookid, title, author, user_rating, reading_date, user_tags, pub,
## comment, summary, average_rating, image, pages, price.
.extract_book<-function(x){
  b=unlist(xmlToList(x))
  bookid<-gsub("[^0-9]","",b["h2.a..attrs.href"])
  pub<-gsub("\n| ","",b[which(b=="pub")-1])
  user_rating<-b[b %in% c("rating1-t","rating2-t",
                          "rating3-t","rating4-t","rating5-t")]
  user_rating<-gsub("[^0-9]","",user_rating)
  if(length(user_rating)==0){user_rating<-NA}
  ## strip whitespace and the status characters "read/done/want/doing"
  pa<-"[\n ]|\u8bfb|\u8fc7|\u60f3|\u5728"
  Encoding(pa)<-"UTF-8"
  reading_date<-gsub(pa,"",b[which(b=="date")-1])
  tagg<-"\u6807\u7b7e: " ;Encoding(tagg)<-"UTF-8" ## "tags: " label prefix
  user_tags<-gsub(tagg,"",b[which(b=="tags")-1])
  if(length(user_tags)==0){user_tags<-NA}
  comment<-gsub("[\n ]","",b[which(b=="comment")-1])
  if(length(comment)==0){comment<-NA}
  tmp<-get_book_info(bookid)
  title<-tmp$title
  summary<-tmp$summary
  author<-paste(tmp$author,collapse=" ")
  average_rating<-tmp$rating[3]
  image<-tmp$image["medium"]
  pages<-tmp$attribute$pages
  price<-gsub("\u5143","",tmp$attribute$price)  # drop the yuan currency sign
  out<-c(bookid,title,author,user_rating,reading_date,user_tags,pub,
         comment,summary,average_rating,image,pages,price)
  names(out)<-NULL
  return(out)
}

########################################################
## Scrape one of a user's book lists ("do", "collect" or "wish").
## Returns list(tags = tag frequency data.frame, df = one row per book).
.user_book_what0<-function(userid,count=15,what="collect",verbose=TRUE){
  u<-paste0('http://book.douban.com/people/',userid,'/',what)
  p<-.refreshURL(u)
  tag_label<-sapply(getNodeSet(p, '//ul[@class="tag-list mb10"]//a'),xmlValue)
  tag_freq<-as.integer(sapply(getNodeSet(p, '//ul[@class="tag-list mb10"]//span'),
                              xmlValue))
  tags<-data.frame(tag_label=tag_label,
                   tag_freq=tag_freq, stringsAsFactors=F)
  ## total item count is embedded in the page <title> as "(NNN)"
  total<-sapply(getNodeSet(p, '//head//title'),xmlValue)
  total<-substr(total,regexpr("\\([^)]+\\)",total),nchar(total))
  total<-gsub("[^0-9]","",total)
  cat("\n--------There is a total of ",total," ",what,"books.----------\n")
  pages<-ceiling(as.integer(total)/count)
  df<-data.frame(matrix(nrow=pages*count,ncol=13),stringsAsFactors=F)
  colnames(df)<-c("bookid","title","author","user_rating","reading_date",
                  "user_tags","pub","comment","summary","average_rating",
                  "image","pages","price")
  k=1
  if(pages>0){
    for(pg in 1:pages){
      ## BUG FIX: the page offset was hard-coded as (pg-1)*15,
      ## which broke pagination for any count != 15; use `count`.
      u<-paste0('http://book.douban.com/people/',userid,'/',what,
                "?start=",(pg-1)*count,"&sort=time&rating=all&filter=all&mode=grid")
      if(verbose==TRUE){
        cat("Getting ",what," book infomation from:",u,"...\n")
      }
      p<-.refreshURL(u)
      node<-getNodeSet(p, '//li[@class="subject-item"]//div[@class="info"]')
      df0<-t(sapply(node,.extract_book))
      nr=nrow(df0)
      df[k:(k+nr-1),]<-df0
      k=k+nr
    }
  }
  df<- df[!is.na(df[, 1]), ]
  return(list(tags=tags,df=df))
}

########################################
## Collect every book review published by the user.
## Example: myrev<-.user_book.review0(userid="qxde01")
.user_book.review0<-function(userid,count=5,verbose=TRUE){
  u<-paste0("http://book.douban.com/people/",userid,"/reviews")
  p<-.refreshURL(u)
  total<-sapply(getNodeSet(p, '//head//title'),xmlValue)
  total<-substr(total,regexpr("\\([^)]+\\)",total),nchar(total))
  total<-as.integer(gsub("[^0-9]","",total))
  cat("\n--------There is a total of ",total,"reviews.--------\n")
  pages<-ceiling(total/count)
  out <- data.frame(matrix(nrow = pages * count, ncol = 9),
                    stringsAsFactors = F)
  colnames(out) <- c("review_uri", "title", "published", "author",
                     "author_uri", "review", "rating", "useful", "unuseful")
  k = 1
  if(pages>0){
    for(pg in 1:pages){
      u = paste0("http://book.douban.com/people/", userid,
                 "/reviews?start=", (pg - 1) * count)
      cat("Getting review URLs from page ", pg, ": ", u, " ...\n")
      p <- .refreshURL(u, verbose)
      href <- sapply(getNodeSet(p, '//div[@class="tlst"]//a'), function(x) xmlGetAttr(x, "href"))
      href <- unique(href[grep("/review/", href)])
      href <- href[!href %in% out$review_uri]  # skip URLs already fetched
      n <- length(href)
      if (n > 0) {
        for (i in 1:n) {
          u0 <- href[i]
          if (verbose == TRUE) {
            ## FIX: message said "movie review" (copy-paste from the movie code)
            cat("  Getting ", k, " book review from URL: ", u0, " ...\n")
          }
          out0 <- .get_book_review0(u = u0, verbose)
          if (length(out0) == 9) {
            out[k, ] <- out0
            k <- k + 1
          }
          else {
            cat(" !!!! Getting failed at URL: ", u0," \n")
          }
        }
      }
    }
  }
  out <- out[!is.na(out[, 1]), ]
  return(out)
}

###########################################
## Collect every reading note (annotation) published by the user.
## Example: myann<-.user_book.annotation0(userid="qxde01")
.user_book.annotation0<-function(userid,count=5,verbose=TRUE){
  u<-paste0("http://book.douban.com/people/",userid,"/annotation/")
  p<-.refreshURL(u)
  total<-sapply(getNodeSet(p, '//head//title'),xmlValue)
  total<-substr(total,regexpr("\\([^)]+\\)",total),nchar(total))
  total<-as.integer(gsub("[^0-9]","",total))
  cat("\n--------There is a total of ",total,"annotation.--------\n")
  pages<-ceiling(total/count)
  out <- data.frame(matrix(nrow = pages * count, ncol = 9),
                    stringsAsFactors = F)
  colnames(out) <- c("note_uri", "title", "published", "author",
                     "author_uri", "note", "rating", "readers", "collectors")
  k=1
  if(pages>0){
    for(pg in 1:pages){
      # e.g. http://book.douban.com/people/Quantum_Panda/annotation/?start=5
      u = paste0("http://book.douban.com/people/", userid,
                 "/annotation/?start=", (pg - 1) * count)
      cat("Getting note URLs from page", pg, ": ", u, "...\n")
      p <- .refreshURL(u, verbose)
      href <- sapply(getNodeSet(p,'//div[@class="annotations-context"]//a[@href]'),
                     function(x) xmlGetAttr(x, "href"))
      href <- unique(href[grep(".com/annotation/", href)])
      ## BUG FIX: href[-grep("#comments", href)] returned an EMPTY vector
      ## when there was no match (negative indexing by integer(0));
      ## only drop when something actually matched.
      drop_idx <- grep("#comments", href)
      if (length(drop_idx) > 0) { href <- href[-drop_idx] }
      href <- href[!href %in% out$note_uri]
      n = length(href)
      if (n > 0) {
        for (i in 1:n) {
          u0 <- href[i]
          if (verbose == TRUE) {
            cat("  Getting ", k, " book note from URL: ", u0, " ...\n")
          }
          ## BUG FIX: original passed an undefined variable `fresh`
          ## to .get_book_note0; pass verbose by name instead.
          out0 <- .get_book_note0(u = u0, verbose = verbose)
          if (length(out0) == 9) {
            out[k, ] <- out0
            k = k + 1
          }
          else {
            cat(" !!!! Getting failed at URL: ", u0," \n")
          }
        }
      }
    }
  }
  out <- out[!is.na(out[, 1]), ]
  return(out)
}

##############################################
## Full book profile of a user: read/reading/wishlist, reviews, notes and,
## optionally (front=TRUE), the cover thumbnails of read books.
## Example: qxde<-user_book_status(userid="qxde01")
user_book_status<-function(userid,verbose=TRUE,front=TRUE){
  cat("--------Retrieving information about read books.--------\n")
  tmp<-.user_book_what0(userid,count=15,what="collect",verbose)
  collect_tags<-tmp$tags
  collect_df<-tmp$df
  cat("--------Retrieving information about reading books.--------\n")
  tmp<-.user_book_what0(userid,count=15,what="do",verbose)
  do_tags<-tmp$tags
  do_df<-tmp$df
  cat("--------Retrieving information about the books you want to read.--------\n")
  tmp<-.user_book_what0(userid,count=15,what="wish",verbose)
  wish_tags<-tmp$tags
  wish_df<-tmp$df
  cat("--------Retrieving your published book reviews. --------\n")
  ## BUG FIX: verbose was matched positionally to `count`; pass by name.
  reviews<-.user_book.review0(userid, verbose = verbose)
  cat("--------Retrieving your reading notes.--------\n")
  notes<-.user_book.annotation0(userid, verbose = verbose)

  collect_images<-list()
  if(front==TRUE){
    if(!require(EBImage)){
      ## NOTE(review): on-the-fly Bioconductor install — legacy biocLite API
      source("http://bioconductor.org/biocLite.R")
      biocLite("EBImage")
      require(EBImage)
    }
    ## "downloading cover images of read books"
    cat("--------\u6b63\u5728\u83b7\u53d6\u5df2\u8bfb\u4e66\u7c4d\u7684\u5c01\u9762\u56fe\u7247--------...\n")
    images<-collect_df$image
    m<-length(images)
    ## BUG FIX: for(i in 1:m) iterated c(1,0) when m == 0
    for(i in seq_len(m)){
      collect_images[[i]]<-resize(readImage(images[i]),w=60,h=80)
    }
  }
  list(collect_tags=collect_tags,
       collect_df=collect_df,
       do_tags=do_tags,
       do_df=do_df,
       wish_tags=wish_tags,
       wish_df=wish_df,
       reviews=reviews,
       notes=notes,
       collect_images=collect_images)
}

## ---- file: R/user_movie_viz.R ----

## Summary panel of the user's movie watching (definition continues
## in the next chunk).
MovieStatSummary<-function(x,YEAR=NULL,NR_REVIEW=0){
  cat(" \u89c2\u5f71\u6982\u8981\u7edf\u8ba1 ......\n")
  n=nrow(x)                                 # number of movies
  star5num<-nrow(x[x$user_rating=="5",])    # five-star movies
  amount<-table(x$month)
  maxMonth<-names(amount[amount==max(amount)])[1]
  maxAmount<-max(amount)                    # busiest month's count
  averageAmount<-n/(length(amount))         # movies per month
  duration<-as.double(x$duration)
  ## movies with unknown duration are imputed with the mean known duration
  durationTotal<-n*sum(duration,na.rm = T)/length(duration[!is.na(duration)])
  durationAmount<-tapply(as.integer(x$duration),x$month,sum,na.rm = T)
  maxMonthduration<-names(durationAmount[durationAmount==max(durationAmount)])[1]
  maxAmountduration<-max(durationAmount)
  averageAmountduration<-sum(durationAmount)/n

  str1<-paste(YEAR,"\u5171\u89c2\u770b\u7535\u5f71",n,
"\u90e8,\u5408\u8ba1\u65f6\u957f", 20 | round(durationTotal,digits=0),"\u5206\u949f") 21 | str2<-paste("\u4e94\u661f\u597d\u7535\u5f71",star5num,"\u90e8") 22 | str3<-paste(maxMonth,"\u6708\u6700\u591a\u89c2\u770b\u4e86",maxAmount,"\u90e8") 23 | str4<-paste("\u5e73\u5747\u6bcf\u6708\u89c2\u770b",round(averageAmount,digits=2),"\u90e8") 24 | str5<-paste(maxMonthduration,"\u6708\u6700\u591a\u89c2\u770b", 25 | maxAmountduration,"\u5206\u949f,\u5e73\u5747\u6bcf\u6708\u89c2\u770b", 26 | round(averageAmountduration, digits = 2),"\u5206\u949f") 27 | str6<-paste("\u5171\u53d1\u8868\u5f71\u8bc4",NR_REVIEW, "\u7bc7") 28 | #Encoding(str1)<-"UTF-8";Encoding(str2)<-"UTF-8";Encoding(str3)<-"UTF-8" 29 | #Encoding(str4)<-"UTF-8";Encoding(str5)<-"UTF-8";Encoding(str6)<-"UTF-8" 30 | 31 | #png('MovieStatSummary.png',width=720,height=540) 32 | op<-par(mar=c(0,0,0,0),bty="n",yaxt="n",xaxt="n") 33 | cols<-rainbow(50,start = 0.5) 34 | plot(c(0, 1), c(0, 1),xlab="",ylab="",type="n") 35 | rect(0, 0, 1, 1, col = c(NA,0),border = NA, lwd = 2) 36 | ##我的观影统计 37 | text(0.5,0.95,"\u6211\u7684\u89c2\u5f71\u7edf\u8ba1",cex=4,col="blue",,font = 2) 38 | text(0.5,0.8,str1,cex=2,col=cols[1]) 39 | text(0.5,0.7,str2,cex=2,col=cols[9]) 40 | text(0.5,0.6,str3,cex=2,col=cols[18]) 41 | text(0.5,0.5,str4,cex=2,col=cols[27]) 42 | text(0.5,0.4,str5,cex=2,col=cols[36]) 43 | if(NR_REVIEW>0){ 44 | text(0.5,0.3,str6,cex=2,col=cols[45]) 45 | } 46 | ##生活不是电影,但电影是生活,是镜子 47 | ##梦想始于剧本,而终结于电影 48 | text(0.5,0.2,"\u751f\u6d3b\u4e0d\u662f\u7535\u5f71,\u4f46\u7535\u5f71\u662f\u751f\u6d3b,\u662f\u955c\u5b50",cex=4,col=cols[30],font = 2) 49 | text(0.5,0.1,"\u68a6\u60f3\u59cb\u4e8e\u5267\u672c,\u800c\u7ec8\u7ed3\u4e8e\u7535\u5f71", 50 | cex=4,col=cols[30], font = 2) 51 | par(op) 52 | #dev.off() 53 | rm(x) 54 | } 55 | #################################### 56 | ##按月统计 57 | ##' @x:collect_df$month 58 | ##statByMonth(collect_df$month) 59 | MovieStatByMonth<-function(x){ 60 | ## 根据月份统计观影数量 61 | cat(" 
\u6839\u636e\u6708\u4efd\u7edf\u8ba1\u89c2\u5f71\u6570\u91cf......\n") 62 | amount<-table(x) 63 | #png('barByMonthMovie.png',width=720,height=540) 64 | op<-par(mar=c(2.5,0,2,0)) 65 | cols<-rainbow(5*length(amount),start = 0.5)[1:length(amount)] 66 | barplot(amount,space=0,ylim=c(0,max(amount)+1),col=cols,yaxt="n") 67 | text((1:length(amount))-0.5,amount+0.3,labels=amount,col=cols,cex=2,font=2) 68 | ## 观影数量by月 69 | title(main=list("\u89c2\u5f71\u6570\u91cfby\u6708",cex=2,col="blue")) 70 | par(op) 71 | #dev.off() 72 | rm(x) 73 | } 74 | ##################################### 75 | ## 按时长统计 76 | ##' @x:collect_df 77 | ## statByPage(collect_df[,c("month","pages")]) 78 | MovieStatByPage<-function(x){ 79 | ## 统计观影时长by月 80 | cat(" \u6309\u6708\u7edf\u8ba1\u89c2\u5f71\u65f6\u957f......\n") 81 | aver=sum(x$duration,na.rm = T)/length(x$duration[!is.na(x$duration)]) 82 | x$duration[is.na(x$duration)]<-aver 83 | amount<-tapply(as.integer(x$duration),x$month,sum,na.rm = T) 84 | amount2<-amount/max(amount) 85 | #png('barByPageMovie.png',width=720,height=540) 86 | op<-par(mar=c(2.5,0,3,0)) 87 | cols<-topo.colors(5*length(amount),alpha=0.5)[1:length(amount)] 88 | barplot(amount2,space=0,ylim=c(0,1.1),col=cols,yaxt="n") 89 | text((1:length(amount))-0.5,amount2+0.05,labels=amount,col=cols,cex=1.5,font=2) 90 | title(main=list("\u89c2\u5f71\u65f6\u957fby\u6708(\u5206\u949f)",cex=2,col="blue")) 91 | par(op) 92 | #dev.off() 93 | rm(x) 94 | } 95 | ################################################### 96 | ##' 根据标签的TF进行层次聚类 97 | ##' clusterCloudByTags(x=qxde$collect_df,k=8) 98 | MovieclusterCloudByTags<-function(x,k=10,max.words=100){ 99 | if(!require(wordcloud)){ 100 | install.packages("wordcloud") 101 | require(wordcloud) 102 | } 103 | tagsDtm<-.df2dtm(df=x,content="tags",word.min=1,type='movie') 104 | 105 | cat(" \u6839\u636e\u7535\u5f71\u6807\u7b7e\u8fdb\u884c\u5c42\u6b21\u805a\u7c7b......\n") 106 | diss<-dissimilarity(tagsDtm,method='cosine') 107 | hc <- hclust(diss, method = "ward") 108 
| #png('hclustByTagsMovie.png',width=720,height=540) 109 | op<-par(mar=c(0,0,3,0)) 110 | if(k>nrow(x))k=nrow(x)-1 111 | ## 这些书自动归类 \u8fd9\u4e9b\u4e66\u81ea\u52a8\u5f52\u7c7b 112 | plot(hc,frame.plot=F,ann=T,hang=0.2,col=4,cex=2, 113 | main=list('\u7535\u5f71\u5f52\u7c7b', 114 | cex=1.5,col="blue"),ylab='',xlab='',sub='') 115 | if(k>1){ 116 | rect.hclust(hc,k, border = rainbow(k)) 117 | } 118 | par(op) 119 | #dev.off() 120 | 121 | tagsmat<-as.matrix(tagsDtm) 122 | freq<-colSums(tagsmat) 123 | word<-colnames(tagsDtm) 124 | word<-gsub("","",word) 125 | df<-data.frame(word=word,freq=freq) 126 | df<-df[order(df$freq,decreasing=T),] 127 | row.names(df)<-NULL 128 | ##cat(" #### 对我的标签绘制wordcloud......\n") 129 | cat(" \u5bf9\u7535\u5f71\u6807\u7b7e\u7ed8\u5236wordcloud......\n") 130 | #png('wordcloudByTagsMovie.png',width=720,height=720) 131 | wordcloud(words=df$word, 132 | freq=df$freq, 133 | min.freq=1,scale=c(5,1.4), 134 | max.words=max.words, 135 | random.order=F, 136 | ordered.colors=F, 137 | colors=topo.colors(length(table(df$freq)))) 138 | ## 我的口味\u6211\u7684\u53e3\u5473 139 | title(main=list('\u6211\u7684\u53e3\u5473',cex=2,col="blue",font = 2)) 140 | #dev.off() 141 | rm(x,df,diss,tagsDtm) 142 | } 143 | ######################################### 144 | ##' 将数据转化为关系形式 145 | ##' @x:collect_df[,c("title","user_tags")] 146 | MovieGraphByTag<-function(x){ 147 | if(!require(igraph)){ 148 | install.packages("igraph") 149 | require(igraph) 150 | } 151 | ##cat(" #### 绘制电影标签之间的关系图.....\n") 152 | cat(" \u7ed8\u5236\u7535\u5f71\u6807\u7b7e\u4e4b\u95f4\u7684\u5173\u7cfb\u56fe.....\n") 153 | .todf<-function(x){ 154 | x<-as.character(x) 155 | ## 标签:\u6807\u7b7e: 156 | taggg<-'\u6807\u7b7e:';Encoding(taggg)<-"UTF-8" 157 | x<-gsub(taggg,'',c(x[1],x[2])) 158 | left=x[1] 159 | right<-unlist(strsplit(x[2],' ')) 160 | right<-right[nchar(right)>0] 161 | right<-right[right!=left] 162 | left<-rep(left,length(right)) 163 | cbind(left,right) 164 | } 165 | 
tag_list<-apply(x[,c("title","tags")],1,.todf) 166 | n=length(tag_list) 167 | tag<-c() 168 | for(i in 1:n){ 169 | tag<-rbind(tag,tag_list[[i]]) 170 | } 171 | g<-graph.data.frame(tag,directed=F) 172 | title<-unique(x$title) 173 | col=rep(2,length(V(g)$name)) 174 | col[match(V(g)$name,title)]<-4 175 | vcex=col;vcex[vcex==4]=2;vcex[vcex==2]=1.5 176 | # png('MovieGraphByTag.png',width=720,height=720) 177 | plot(g, layout=layout.fruchterman.reingold, vertex.size=col, 178 | vertex.label.dist=0, vertex.color=col+1, 179 | edge.arrow.size=0.5,vertex.label=V(g)$name,vertex.label.cex=vcex, 180 | vertex.label.color=col+2) 181 | ## 电影与标签之间的关系图 182 | title(main=list("\u7535\u5f71\u4e0e\u6807\u7b7e\u4e4b\u95f4\u7684\u5173\u7cfb\u56fe", 183 | font=2,cex=2,col="blue")) 184 | #dev.off() 185 | rm(x,tag,g) 186 | } 187 | #################################### 188 | ## 189 | ##collect_df[,c('author','cast')] 190 | ##导演和演员的合作关系 191 | actor2df<-function(x){ 192 | f_split<-function(x){ 193 | left<-unlist(strsplit(as.character(x[1]),split=',')) 194 | right<-unlist(strsplit(as.character(x[2]),',')) 195 | n1=length(left) 196 | n2=length(right) 197 | df<-c() 198 | for(i in 1:n1){ 199 | tmp<-cbind(left=rep(left[i],n2),right=right) 200 | df<-rbind(df,tmp) 201 | } 202 | df 203 | } 204 | x[,2]<-gsub('\u7279\u9080\u987e\u95ee\uff1a|\u5236\u7247\u4eba\uff1a|\u7279\u9080\u987e\u95ee\uff1a','',x[,2]) 205 | x<-x[nchar(x[,2])>0,] 206 | x<-x[nchar(x[,1])>0,] 207 | n=nrow(x) 208 | df<-c() 209 | for (i in 1:n){ 210 | tmp<-f_split(x[i,]) 211 | #cat(i,'\n') 212 | df<-rbind(df,tmp) 213 | } 214 | df<-df[df[,1]!=df[,2],] 215 | unique(df) 216 | } 217 | ################################## 218 | #' @x:collect_df[,c('author','cast')] 219 | #' actorGraph(x=collect_df[,c('author','cast')]) 220 | actorGraph<-function(x){ 221 | cat(' \u5bfc\u6f14\u548c\u6f14\u5458\u7684\u5408\u4f5c\u5173\u7cfb.....\n') 222 | df<-actor2df(x) 223 | g<-graph.data.frame(df,directed=F) 224 | dirctor<-unique(df[,1]) 225 | actor<-unique(df[,2]) 
226 | col=rep(2,length(V(g)$name)) 227 | col[match(V(g)$name,dirctor)]<-4 228 | vcex=col;vcex[vcex==4]=1.2;vcex[vcex==2]=1 229 | png('actorGraphg.png',width=900,height=900) 230 | plot(g, layout=layout.fruchterman.reingold, vertex.size=col, 231 | vertex.label.dist=0, vertex.color=col+1, 232 | edge.arrow.size=0.5,vertex.label=V(g)$name,vertex.label.cex=vcex, 233 | vertex.label.color=col+2) 234 | title(main=list("\u5bfc\u6f14\u548c\u6f14\u5458\u7684\u5408\u4f5c\u5173\u7cfb", 235 | font=2,cex=2,col="blue")) 236 | dev.off() 237 | } 238 | 239 | #actorGraph(x) 240 | 241 | ####################################################### 242 | ##电影统计可视化函数 243 | ##' @x:user_movie_status的输出 244 | ##' @YEAR:统计年份 245 | ##' load("qxde.rda") 246 | ##' data(stopwords) 247 | ##' user_movie_viz(x=qxde,YEAR="2012",stopwords=stopwords) 248 | user_movie_viz<-function(x,YEAR="2013",stopwords=stopwords){ 249 | ## 正在预处理数据 250 | cat("\u6b63\u5728\u9884\u5904\u7406\u6570\u636e......\n") 251 | collect_df<-x$collect_df 252 | 253 | collect_df$duration<-as.integer(gsub('\\([^)]+\\)|[^0-9]','',collect_df$duration)) 254 | year<-substr(collect_df$watching_date,1,4) 255 | month<-substr(collect_df$watching_date,1,7) 256 | collect_df<-data.frame(collect_df, 257 | month=month,year=year,stringsAsFactors=F) 258 | reviews<-x$reviews 259 | collect_images<-x$collect_images 260 | if(!is.null(YEAR)){ 261 | collect_df<-collect_df[year==YEAR,] 262 | collect_images<-collect_images[year==YEAR] 263 | r_year<-substr(reviews$published,1,4) 264 | reviews<-reviews[r_year==YEAR,] 265 | } 266 | NR_REVIEW=nrow(reviews) 267 | 268 | ## 拼图大小 269 | n<-length(collect_images)^0.5 270 | n1<-ceiling(n);n2=n1 271 | if((n1-1)^2+floor(n)>n^2){n2=floor(n)} 272 | require(EBImage) 273 | cat("\u6b63\u5728\u7ed8\u5236\u7edf\u8ba1\u56fe......\n") ##正在绘制统计图 274 | cat(" \u7ed8\u5236\u7535\u5f71\u6d77\u62a5\u62fc\u56fe......\n") 275 | front<-combine(x=collect_images) 276 | png("Moviefront.png",width=64*n1,height=80*n2) 277 | 
display(x=front,method="raster",all=T) 278 | dev.off() 279 | png("Movie.png",width=640,height=3600) 280 | op<-par(mfrow=c(6,1),mar=c(2,0,1.5,0)) 281 | ##基本统计 282 | MovieStatSummary(x=collect_df,YEAR=YEAR,NR_REVIEW) 283 | ##按月统计 284 | MovieStatByMonth(x=collect_df$month) 285 | ##按页数统计 286 | MovieStatByPage(x=collect_df[,c("month","duration")]) 287 | ## 层次聚类与wordcloud 288 | MovieclusterCloudByTags(x=collect_df,k=8) 289 | ##绘制书籍标签之间的关系图 290 | MovieGraphByTag(x=collect_df[,c("title","tags")]) 291 | dev.off() 292 | ## 评论关键词wordcloud 293 | xx<-collect_df[!is.na(collect_df$comment),] 294 | if (length(nrow(xx))>0){ 295 | wordcloudByComment(x=xx,stopwords=stopwords,filename='MoviewordcloudByComment') 296 | } 297 | ## 导演和演员的合作关系 298 | actorGraph(x=collect_df[,c('author','cast')]) 299 | ## 信息图存放位置:\u4fe1\u606f\u56fe\u5b58\u653e\u4f4d\u7f6e 300 | cat("\u4fe1\u606f\u56fe\u5b58\u653e\u4f4d\u7f6e:",getwd(),"\n") 301 | cat("\u751f\u6210\u7684\u56fe\u7247\u4e3a:\n", 302 | dir(getwd(),"png"),"\n") #生成的图片为: 303 | } 304 | -------------------------------------------------------------------------------- /R/user_book_viz.R: -------------------------------------------------------------------------------- 1 | ##' source('user_book_viz.R',encoding='UTF-8') 2 | ##' 阅读综合统计 3 | ##' @x:collect_df 4 | ##' @YEAR:统计年份 5 | ##' @NR_REVIEW:书评篇数 6 | ##' @NR_NOTE:笔记篇数 7 | ##' statSummary(x=collect_df2) 8 | statSummary<-function(x,YEAR=NULL,NR_REVIEW=0,NR_NOTE=0){ 9 | #cat(" #### 阅读综合统计......\n") 10 | cat(" #### \u9605\u8bfb\u7efc\u5408\u7edf\u8ba1......\n") 11 | n=nrow(x) #number 12 | star5num<-nrow(x[x$user_rating=="5",]) # star5 number 13 | amount<-table(x$month) 14 | maxMonth<-names(amount[amount==max(amount)]) 15 | maxAmount<-max(amount) 16 | averageAmount<-(length(amount)*30.42)/sum(amount) 17 | priceTotal<-sum(as.double(x$price),na.rm = T) 18 | pageAmount<-tapply(as.integer(x$pages),x$month,sum,na.rm = T) 19 | maxMonthPage<-names(pageAmount[pageAmount==max(pageAmount)])[1] 20 | 
maxAmountPage<-max(pageAmount)[1] 21 | averageAmountPage<-sum(pageAmount)/(length(pageAmount)*30.42) 22 | ############################################### 23 | ##' 共阅读过,\u5171\u9605\u8bfb\u8fc7 24 | ##' 本书,总价值 \u672c\u4e66,\u603b\u4ef7\u503c 25 | ##'str1<-paste(YEAR,"共阅读过",n,"本书,总价值",priceTotal,"¥") 26 | ##' 五星好书 \u4e94\u661f\u597d\u4e66,本 \u672c 27 | ##' str2<-paste("五星好书",star5num,"本") 28 | ##' 月最勤快读了,\u6708\u6700\u52e4\u5feb\u8bfb\u4e86 29 | ##' str3<-paste(maxMonth,"月最勤快读了",maxAmount,"本") 30 | ##' str4<-paste("平均",round(averageAmount,digits=2),"天读一本书") 31 | ##' str5<-paste(maxMonthPage,"月最多读了",maxAmountPage, "页,平均每天读", 32 | ##' round(averageAmountPage, digits = 2),"页,总共", 33 | ##' sum(pageAmount),"页") 34 | ##' str6<-paste("发表书评",NR_REVIEW,"篇,读书笔记",NR_NOTE,"篇") 35 | ############################################### 36 | str1<-paste(YEAR,"\u5171\u9605\u8bfb\u8fc7",n, 37 | "\u672c\u4e66,\u603b\u4ef7\u503c",priceTotal,"\uffe5") 38 | str2<-paste("\u4e94\u661f\u597d\u4e66",star5num,"\u672c") 39 | str3<-paste(maxMonth,"\u6708\u6700\u52e4\u5feb\u8bfb\u4e86",maxAmount,"\u672c") 40 | str4<-paste("\u5e73\u5747",round(averageAmount,digits=2),"\u5929\u8bfb\u4e00\u672c\u4e66") 41 | str5<-paste(maxMonthPage,"\u6708\u6700\u591a\u8bfb\u4e86",maxAmountPage, 42 | "\u9875,\u5e73\u5747\u6bcf\u5929\u8bfb", 43 | round(averageAmountPage, digits = 2),"\u9875,\u603b\u5171", 44 | sum(pageAmount),"\u9875") 45 | str6<-paste("\u53d1\u8868\u4e66\u8bc4",NR_REVIEW, 46 | "\u7bc7,\u8bfb\u4e66\u7b14\u8bb0",NR_NOTE,"\u7bc7") 47 | Encoding(str1)<-"UTF-8";Encoding(str2)<-"UTF-8";Encoding(str3)<-"UTF-8" 48 | Encoding(str4)<-"UTF-8";Encoding(str5)<-"UTF-8";Encoding(str6)<-"UTF-8" 49 | 50 | png('statSummary.png',width=720,height=540) 51 | op<-par(mar=c(0,0,0,0),bty="n",yaxt="n",xaxt="n") 52 | cols<-rainbow(50,start = 0.5) 53 | plot(c(0, 1), c(0, 1),xlab="",ylab="",type="n") 54 | rect(0, 0, 1, 1, col = c(NA,0),border = NA, lwd = 2) 55 | ##我的阅读统计 \u6211\u7684\u9605\u8bfb\u7edf\u8ba1 56 | 
text(0.5,0.95,"\u6211\u7684\u9605\u8bfb\u7edf\u8ba1",cex=4,col="blue",,font = 2) 57 | text(0.5,0.8,str1,cex=2,col=cols[1]) 58 | text(0.5,0.7,str2,cex=2,col=cols[5]) 59 | text(0.5,0.6,str3,cex=2,col=cols[10]) 60 | text(0.5,0.5,str4,cex=2,col=cols[15]) 61 | text(0.5,0.4,str5,cex=2,col=cols[20]) 62 | if(NR_REVIEW+NR_NOTE>0){ 63 | text(0.5,0.3,str6,cex=2,col=cols[25]) 64 | } 65 | ##不积跬步,无以至千里\u4e0d\u79ef\u8dec\u6b65,\u65e0\u4ee5\u81f3\u5343\u91cc 66 | ##不积小流,无以成江海 \u4e0d\u79ef\u5c0f\u6d41,\u65e0\u4ee5\u6210\u6c5f\u6d77 67 | text(0.5,0.2,"\u4e0d\u79ef\u8dec\u6b65,\u65e0\u4ee5\u81f3\u5343\u91cc",cex=4,col=cols[30],font = 2) 68 | text(0.5,0.1,"\u4e0d\u79ef\u5c0f\u6d41,\u65e0\u4ee5\u6210\u6c5f\u6d77", 69 | cex=4,col=cols[30], font = 2) 70 | par(op) 71 | dev.off() 72 | rm(x) 73 | } 74 | #################################### 75 | ##按月统计 76 | ##' @x:collect_df$month 77 | ##statByMonth(collect_df$month) 78 | statByMonth<-function(x){ 79 | ## 根据月份统计阅读量 80 | cat(" #### \u6839\u636e\u6708\u4efd\u7edf\u8ba1\u9605\u8bfb\u91cf......\n") 81 | amount<-table(x) 82 | png('barByMonth.png',width=720,height=540) 83 | op<-par(mar=c(2.5,0,2,0)) 84 | cols<-rainbow(5*length(amount),start = 0.5)[1:length(amount)] 85 | barplot(amount,space=0,ylim=c(0,max(amount)+1),col=cols,yaxt="n") 86 | text((1:length(amount))-0.5,amount+0.3,labels=amount,col=cols,cex=2,font=2) 87 | ## 阅读书籍数量by月 \u9605\u8bfb\u4e66\u7c4d\u6570\u91cfby\u6708 88 | title(main=list("\u9605\u8bfb\u4e66\u7c4d\u6570\u91cfby\u6708",cex=2,col="blue")) 89 | par(op) 90 | dev.off() 91 | rm(x) 92 | } 93 | ##################################### 94 | ## 按页数统计 95 | ##' @x:collect_df 96 | ## statByPage(collect_df[,c("month","pages")]) 97 | statByPage<-function(x){ 98 | ## 统计阅读页数 99 | cat(" #### \u7edf\u8ba1\u9605\u8bfb\u9875\u6570......\n") 100 | amount<-tapply(as.integer(x$pages),x$month,sum,na.rm = T) 101 | amount2<-amount/max(amount) 102 | png('barByPage.png',width=720,height=540) 103 | op<-par(mar=c(2.5,0,3,0)) 104 | 
## ---------------------------------------------------------------------------
## NOTE(review): this chunk opens mid-function -- the lines below are the tail
## of a monthly page-count plotting helper whose head lies above this chunk.
## They are reproduced unchanged so the enclosing definition stays intact.
cols <- topo.colors(5 * length(amount), alpha = 0.5)[1:length(amount)]
barplot(amount2, space = 0, ylim = c(0, 1.1), col = cols, yaxt = "n")
text((1:length(amount)) - 0.5, amount2 + 0.05, labels = amount,
     col = cols, cex = 1.5, font = 2)
## Title: "pages read per month" (UTF-8 escapes keep the source ASCII-safe)
title(main = list("\u9605\u8bfb\u4e66\u7c4d\u9875\u6570by\u6708",
                  cex = 2, col = "blue"))
par(op)
dev.off()
rm(x)
}

#############################################
##' Convert a data.frame of texts into a tm DocumentTermMatrix.
##'
##' @param df data.frame holding the text plus bookid/movieid, author, title.
##' @param content name of the column holding the (pre-segmented) text.
##' @param word.min minimum word length kept in the DTM.
##' @param type 'book' or 'movie' -- selects which id column is used.
##' @return a DocumentTermMatrix with all-zero rows dropped.
.df2dtm <- function(df, content = 'word', word.min = 2, type = 'book') {
  if (!require(tm)) {
    install.packages("tm")
    require(tm)
  }
  ## Normalize to the column layout the tabular reader expects; the id column
  ## differs between books and movies.
  if (type == 'book') {
    df <- data.frame(contents = as.character(df[, content]),
                     id = as.character(df$bookid),
                     heading = as.character(df$author),
                     origin = as.character(df$title),
                     stringsAsFactors = FALSE)
  } else if (type == 'movie') {
    df <- data.frame(contents = as.character(df[, content]),
                     id = as.character(df$movieid),
                     heading = as.character(df$author),
                     origin = as.character(df$title),
                     stringsAsFactors = FALSE)
  }
  ## NOTE(review): readTabular()/readerControl mapping belongs to the older
  ## tm generation (removed in tm >= 0.7) -- this file targets that API.
  m <- list(Content = "contents", Heading = "heading",
            ID = "origin", Origin = "id")
  myReader <- readTabular(mapping = m, language = "Zh_cn")
  corpus <- Corpus(DataframeSource(df),
                   readerControl = list(reader = myReader, language = "Zh_cn"))
  dtm <- DocumentTermMatrix(corpus,
                            control = list(wordLengths = c(word.min, Inf)))
  ## Drop documents with no surviving terms.  FIX: row selection on a DTM
  ## needs the explicit comma -- dtm[rowTotals > 0] is single-subscript
  ## indexing, not row filtering.
  rowTotals <- apply(dtm, 1, sum)
  dtm <- dtm[rowTotals > 0, ]
  return(dtm)
}
###################################################
##' Hierarchically cluster books by user-tag term frequency and draw a tag
##' wordcloud.
##'
##' @param x data.frame with at least title/author/bookid/user_tags columns.
##' @param k number of clusters to highlight (capped at nrow(x) - 1).
##' @param max.words maximum number of words drawn in the cloud.
##' Side effects: writes hclustByTags.png and wordcloudByTags.png in getwd().
##' @examples
##' ## clusterCloudByTags(x = qxde$collect_df, k = 8)
clusterCloudByTags <- function(x, k = 8, max.words = 100) {
  if (!require(wordcloud)) {
    install.packages("wordcloud")
    require(wordcloud)
  }
  tagsDtm <- .df2dtm(df = x, content = "user_tags", word.min = 1)
  ## "Hierarchically clustering books by my tags......"
  cat(" #### \u6839\u636e\u6211\u7684\u6807\u7b7e\u5bf9\u4e66\u7c4d\u8fdb\u884c\u5c42\u6b21\u805a\u7c7b......\n")
  ## NOTE(review): dissimilarity() also comes from older tm releases.
  diss <- dissimilarity(tagsDtm, method = 'cosine')
  ## "ward" is mapped to ward.D (with a warning) on modern R; kept for
  ## backward compatibility with the R versions this package targets.
  hc <- hclust(diss, method = "ward")
  png('hclustByTags.png', width = 720, height = 540)
  op <- par(mar = c(0, 0, 3, 0))
  if (k > nrow(x)) k <- nrow(x) - 1
  ## Main title: "these books cluster automatically"
  plot(hc, frame.plot = FALSE, ann = TRUE, hang = 0.2, col = 4,
       main = list('\u8fd9\u4e9b\u4e66\u81ea\u52a8\u5f52\u7c7b',
                   cex = 2, col = "blue"),
       ylab = '', xlab = '', sub = '')
  if (k > 1) {
    rect.hclust(hc, k, border = rainbow(k))
  }
  par(op)
  dev.off()

  ## Term frequencies for the tag wordcloud.
  tagsmat <- as.matrix(tagsDtm)
  freq <- colSums(tagsmat)
  word <- colnames(tagsDtm)
  ## (FIX: removed the no-op gsub("", "", word) -- an empty pattern
  ## substitutes nothing.)
  df <- data.frame(word = word, freq = freq)
  df <- df[order(df$freq, decreasing = TRUE), ]
  row.names(df) <- NULL
  ## "drawing a wordcloud of my tags......"
  cat(" #### \u5bf9\u6211\u7684\u6807\u7b7e\u7ed8\u5236wordcloud......\n")
  png('wordcloudByTags.png', width = 720, height = 720)
  wordcloud(words = df$word,
            freq = df$freq,
            min.freq = 1, scale = c(5, 1.4),
            max.words = max.words,
            random.order = FALSE,
            ordered.colors = FALSE,
            colors = topo.colors(length(table(df$freq))))
  ## Title: "my taste"
  title(main = list('\u6211\u7684\u53e3\u5473', cex = 2, col = "blue", font = 2))
  dev.off()
  rm(x, df, diss, tagsDtm)
}
#########################################
##' Draw the bipartite relation graph between book titles and user tags.
##'
##' @param x data.frame with "title" and "user_tags" columns,
##'   e.g. collect_df[, c("title", "user_tags")].
##' Side effect: writes graphByTag.png in getwd().
graphByTag <- function(x) {
  if (!require(igraph)) {
    install.packages("igraph")
    require(igraph)
  }
  ## "drawing the book/tag relation graph....."
  cat(" #### \u7ed8\u5236\u4e66\u7c4d\u6807\u7b7e\u4e4b\u95f4\u7684\u5173\u7cfb\u56fe.....\n")
  ## One (title, tags) row -> 2-column edge matrix: title -> each tag.
  .todf <- function(x) {
    x <- as.character(x)
    ## Strip the literal "tags:" prefix douban puts before the tag list.
    taggg <- '\u6807\u7b7e:'; Encoding(taggg) <- "UTF-8"
    x <- gsub(taggg, '', c(x[1], x[2]))
    left <- x[1]
    right <- unlist(strsplit(x[2], ' '))
    right <- right[nchar(right) > 0]
    right <- right[right != left]
    left <- rep(left, length(right))
    cbind(left, right)
  }
  tag_list <- apply(x[, c("title", "user_tags")], 1, .todf)
  ## FIX: build the edge list in one shot instead of growing via rbind in a
  ## loop (also safe when tag_list is empty).
  tag <- do.call(rbind, tag_list)
  g <- graph.data.frame(tag, directed = FALSE)
  title <- unique(x$title)
  ## Color title vertices (4) differently from tag vertices (2).
  ## FIX: the original col[match(V(g)$name, title)] <- 4 indexed col by
  ## positions inside `title` rather than by vertex position; use a
  ## membership test instead.
  col <- rep(2, length(V(g)$name))
  col[V(g)$name %in% title] <- 4
  vcex <- col; vcex[vcex == 4] <- 1.2; vcex[vcex == 2] <- 1
  png('graphByTag.png', width = 720, height = 720)
  plot(g, layout = layout.fruchterman.reingold, vertex.size = col,
       vertex.label.dist = 0, vertex.color = col + 1,
       edge.arrow.size = 0.5, vertex.label = V(g)$name,
       vertex.label.cex = vcex,
       vertex.label.color = col + 2)
  ## Title: "relation graph between books and tags"
  title(main = list("\u4e66\u7c4d\u4e0e\u6807\u7b7e\u4e4b\u95f4\u7684\u5173\u7cfb\u56fe",
                    font = 2, col = "blue"))
  dev.off()
  rm(x, tag, g)
}

#######################################
##' Segment a Chinese text into words (Rwordseg), drop stopwords and
##' single-character tokens, and return the words as one space-joined string.
##'
##' @param x character scalar to segment.
##' @param stopwords optional character vector of words to drop.
.f_seg <- function(x, stopwords = NULL) {
  if (!require(Rwordseg)) {
    install.packages("Rwordseg", repos = "http://R-Forge.R-project.org",
                     type = "source")
    require(Rwordseg)
  }
  seg <- segmentCN(x)
  if (!is.null(stopwords)) {
    seg <- seg[!seg %in% stopwords]
  }
  ## Keep multi-character words only.
  seg <- seg[nchar(seg) > 1]
  paste(seg, collapse = ' ')
}

#########################################
##' Segment the user's short comments and draw a keyword wordcloud.
##'
##' @param x data.frame with a "comment" column (NA comments are dropped).
##' @param max.words maximum number of words drawn.
##' @param stopwords stopword vector forwarded to .f_seg (no default;
##'   see e.g. data(stopwords)).
##' @param filename output file name without extension.
##' Side effect: writes <filename>.png in getwd().
##' @examples
##' ## wordcloudByComment(x = qxde$collect_df)
wordcloudByComment <- function(x, max.words = 100, stopwords,
                               filename = 'wordcloudByComment') {
  if (!require(wordcloud)) {
    install.packages("wordcloud")
    require(wordcloud)
  }
  ## "segmenting the comment text......"
  cat(" #### \u5bf9\u8bc4\u8bba\u6570\u636e\u8fdb\u884c\u5206\u8bcd......\n")
  x <- x[!is.na(x$comment), ]
  word <- sapply(x$comment, .f_seg, stopwords)
  names(word) <- NULL
  word <- unlist(strsplit(word, " "))
  tmp <- sort(table(word), decreasing = TRUE)
  ## FIX: freq = tmp[] stored a table object in the data.frame; use plain
  ## integer counts.
  df <- data.frame(word = names(tmp), freq = as.integer(tmp))
  ## FIX: paste0() has no `sep` argument -- the stray sep='' was silently
  ## pasted as an extra (empty) string.
  png(paste0(filename, '.png'), width = 720, height = 720)
  ## "drawing a wordcloud of comment keywords......"
  cat(" #### \u5bf9\u8bc4\u8bba\u5173\u952e\u8bcd\u7ed8\u5236wordcloud......\n")
  wordcloud(words = df$word,
            freq = df$freq,
            min.freq = 1, scale = c(5, 1.4),
            max.words = max.words,
            random.order = FALSE,
            ordered.colors = FALSE,
            colors = rainbow(length(table(df$freq))))
  ## Title: "my rants"
  title(main = list('\u6211\u7684\u5410\u69fd',
                    cex = 2, col = "blue", font = 2))
  dev.off()
  rm(x, df, word)
}
#######################################################
##' Visualize a user's douban reading history as a set of infographics.
##'
##' @param x output of user_book_status: a list with collect_df, reviews,
##'   notes and collect_images.
##' @param YEAR report year as "YYYY"; NULL keeps all years.
##' @param stopwords stopword vector for the comment wordcloud, or NULL for
##'   none.  (FIX: the old default `stopwords = stopwords` was a
##'   self-referencing promise and errored whenever the argument was omitted.)
##' @param back if TRUE, blend the package's decorative images
##'   (inst/images/*.jpg) into the charts as backgrounds.
##' Side effects: writes many intermediate PNGs plus bigImage.png and
##'   bigImage2.png into getwd().
##' @examples
##' ## load("qxde.rda"); data(stopwords)
##' ## user_book_viz(x = qxde, YEAR = "2012", stopwords = stopwords, back = TRUE)
user_book_viz <- function(x, YEAR = "2013", stopwords = NULL, back = FALSE) {
  ## "preprocessing data......"
  cat("\u6b63\u5728\u9884\u5904\u7406\u6570\u636e......\n")
  collect_df <- x$collect_df
  ## Strip status characters ("read"/"reading"/"want to") around the date.
  collect_df$reading_date <- gsub("[\n \u8bfb\u8fc7\u60f3\u5728]", "",
                                  collect_df$reading_date)
  ## Drop the yuan currency unit from the price.
  collect_df$price <- gsub("\u5143", "", collect_df$price)
  ## Strip the "tags: " prefix.
  collect_df$user_tags <- gsub("\u6807\u7b7e: ", "", collect_df$user_tags)
  year <- substr(collect_df$reading_date, 1, 4)
  month <- substr(collect_df$reading_date, 1, 7)
  collect_df <- data.frame(collect_df, month = month, year = year,
                           stringsAsFactors = FALSE)
  reviews <- x$reviews
  notes <- x$notes
  collect_images <- x$collect_images
  if (!is.null(YEAR)) {
    ## NOTE(review): collect_images is assumed parallel to collect_df rows,
    ## so the same logical index applies -- TODO confirm against
    ## user_book_status.
    collect_df <- collect_df[year == YEAR, ]
    collect_images <- collect_images[year == YEAR]
    r_year <- substr(reviews$published, 1, 4)
    n_year <- substr(notes$published, 1, 4)
    reviews <- reviews[r_year == YEAR, ]
    notes <- notes[n_year == YEAR, ]
  }
  NR_NOTE <- nrow(notes)
  NR_REVIEW <- nrow(reviews)

  ## Cover-mosaic grid: n1 x n2 cells, roughly square for the cover count.
  n <- length(collect_images)^0.5
  n1 <- ceiling(n); n2 <- n1
  if ((n1 - 1)^2 + floor(n) > n^2) { n2 <- floor(n) }
  require(EBImage)
  cat("\u6b63\u5728\u7ed8\u5236\u7edf\u8ba1\u56fe......\n")  ## "drawing charts......"
  ## "drawing the book-cover mosaic......"
  cat(" #### \u7ed8\u5236\u4e66\u7c4d\u5c01\u9762\u62fc\u56fe......\n")
  front <- combine(x = collect_images)
  png("front.png", width = 64 * n1, height = 80 * n2)
  display(x = front, method = "raster", all = TRUE)
  dev.off()
  ## Basic summary chart (helper defined elsewhere in this file).
  statSummary(x = collect_df, YEAR = YEAR, NR_REVIEW, NR_NOTE)
  ## Books per month.
  statByMonth(x = collect_df$month)
  ## Pages per month.
  statByPage(x = collect_df[, c("month", "pages")])
  ## Tag clustering + tag wordcloud.
  clusterCloudByTags(x = collect_df, k = 8)
  ## Book/tag relation graph.
  graphByTag(x = collect_df[, c("title", "user_tags")])
  ## Comment-keyword wordcloud.
  wordcloudByComment(x = collect_df, stopwords = stopwords)
  ## Read every chart back; all are 720 wide, so ncol() is each image's
  ## pixel height (EBImage stores width x height).
  stat_base <- readImage("statSummary.png");     h_base <- ncol(stat_base)
  stat_month <- readImage("barByMonth.png");     h_month <- ncol(stat_month)
  stat_page <- readImage("barByPage.png");       h_page <- ncol(stat_page)
  tag_clust <- readImage("hclustByTags.png");    h_clust <- ncol(tag_clust)
  tag_cloud <- readImage("wordcloudByTags.png"); h_tagcloud <- ncol(tag_cloud)
  tag_graph <- readImage('graphByTag.png');      h_graph <- ncol(tag_graph)
  comment_cloud <- readImage("wordcloudByComment.png"); h_cmt <- ncol(comment_cloud)
  im_front <- readImage("front.png")
  im_front <- resize(im_front, 720, 860); h_front <- ncol(im_front)
  ## Canvas for sheet 1: the first five charts stacked along the height axis.
  bigImage <- array(dim = c(720, h_base + h_month + h_page + h_clust +
                              h_tagcloud, 3))
  if (back == TRUE) {
    ## "adding background images......" -- blend package art into the charts.
    cat("\u6dfb\u52a0\u80cc\u666f\u8272......\n")
    plum <- readImage(system.file("images", "plum.jpg", package = "Rdouban"))
    plum <- resize(plum, 720, h_base)
    stat_base <- 0.8 * stat_base + 0.2 * plum
    writeImage(stat_base, "statSummary2.png")

    orchid <- readImage(system.file("images", "orchid.jpg", package = "Rdouban"))
    orchid <- resize(orchid, 720, h_month)
    stat_month <- 0.8 * stat_month + 0.2 * orchid
    writeImage(stat_month, "stat_month2.png")

    peony <- readImage(system.file("images", "peony.jpg", package = "Rdouban"))
    peony <- resize(peony, 720, h_page)
    stat_page <- 0.9 * stat_page + 0.1 * peony
    writeImage(stat_page, "stat_page2.png")

    peach <- readImage(system.file("images", "peach.jpg", package = "Rdouban"))
    peach <- resize(peach, 720, h_clust)
    tag_clust <- 0.8 * tag_clust + 0.2 * peach
    writeImage(tag_clust, "tag_clust2.png")

    ## The cover mosaic keeps its own look (no background blended in).
    im_front <- resize(im_front, 720, h_front)

    fan <- readImage(system.file("images", "fan.jpg", package = "Rdouban"))
    fan <- resize(fan, 720, h_graph)
    tag_graph <- 0.8 * tag_graph + 0.2 * fan
    writeImage(tag_graph, "graphByTag2.png")

    rm(plum, orchid, peony, peach, fan)
    gc()
  }
  cat("\u5408\u5e76\u4e3a\u5927\u56fe......\n")  ## "stitching into one big image"
  ## Sheet 1: summary, per-month, per-page, cluster tree, tag cloud.
  bigImage <- as.Image(bigImage); colorMode(bigImage) <- 2
  bigImage[, 1:h_base, ] <- stat_base
  bigImage[, (h_base + 1):(h_month + h_base), ] <- stat_month
  bigImage[, (h_base + h_month + 1):(h_page + h_month + h_base), ] <- stat_page
  bigImage[, (h_base + h_month + h_page + 1):
             (h_clust + h_page + h_month + h_base), ] <- tag_clust
  bigImage[, (h_base + h_month + h_page + h_clust + 1):
             (h_tagcloud + h_clust + h_page + h_month + h_base), ] <- tag_cloud
  rm(stat_base, stat_month, stat_page, tag_cloud, tag_clust)
  gc()
  writeImage(bigImage, files = "bigImage.png", quality = 85)
  rm(bigImage)
  ## Sheet 2: comment cloud, tag graph, cover mosaic.
  bigImage2 <- array(dim = c(720, h_cmt + h_graph + h_front, 3))
  bigImage2 <- as.Image(bigImage2); colorMode(bigImage2) <- 2
  bigImage2[, 1:h_cmt, ] <- comment_cloud
  bigImage2[, (1 + h_cmt):(h_cmt + h_graph), ] <- tag_graph
  bigImage2[, (1 + h_cmt + h_graph):(h_cmt + h_graph + h_front), ] <- im_front
  writeImage(bigImage2, files = "bigImage2.png", quality = 85)
  rm(tag_graph, comment_cloud, im_front, bigImage2)
  ## "infographics are stored at: <getwd()>"
  cat("\u4fe1\u606f\u56fe\u5b58\u653e\u4f4d\u7f6e:", getwd(), "\n")
  ## "generated images:"
  cat("\u751f\u6210\u7684\u56fe\u7247\u4e3a:\n",
      dir(getwd(), "png"), "\n")
}