├── data
    └── stopwords.rda
├── inst
    └── images
    │   ├── big.png
    │   ├── fan.jpg
    │   ├── daiyu.jpg
    │   ├── dream.jpg
    │   ├── peach.jpg
    │   ├── peony.jpg
    │   ├── plum.jpg
    │   ├── bamboo.jpg
    │   └── orchid.jpg
├── NAMESPACE
├── DESCRIPTION
├── man
    ├── get.book.info.Rd
    ├── get_movie_comments.Rd
    ├── get_movie_info.Rd
    ├── get_book_info.Rd
    ├── get_movie_discussions.Rd
    ├── get_music_discussions.Rd
    ├── get_book_discussions.Rd
    ├── get_music_reviews.Rd
    ├── get_book_notes.Rd
    ├── get_music_info.Rd
    ├── get_book_reviews.Rd
    ├── get_movie_reviews.Rd
    ├── get.movie.info.Rd
    ├── get.movie.comment.Rd
    ├── get.music.info.Rd
    ├── user_movie_viz.Rd
    ├── get.book.review.Rd
    ├── user_status.Rd
    ├── get.movie.review.Rd
    ├── user_book_viz.Rd
    ├── user_note_status.Rd
    ├── user_book_status.Rd
    └── user_movie_status.Rd
├── R
    ├── get_online_tags.R
    ├── get.movie.info.R
    ├── get.book.info.R
    ├── get.music.info.R
    ├── user_status.R
    ├── get.movie.comment.R
    ├── user_note_status.R
    ├── get_music_discussions.R
    ├── utils.R
    ├── get_movie_comments.R
    ├── get_music_info.R
    ├── get_book_reviews.R
    ├── get_book_notes.R
    ├── get_book_discussions.R
    ├── get_movie_info.R
    ├── get_movie_discussions.R
    ├── get.book.review.R
    ├── get_music_reviews.R
    ├── get_movie_reviews.R
    ├── get.movie.review.R
    ├── get_book_info.R
    ├── user_movie_status.R
    ├── user_book_status.R
    ├── user_movie_viz.R
    └── user_book_viz.R
└── README.md


/data/stopwords.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qxde01/Rdouban/HEAD/data/stopwords.rda


--------------------------------------------------------------------------------
/inst/images/big.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qxde01/Rdouban/HEAD/inst/images/big.png


--------------------------------------------------------------------------------
/inst/images/fan.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qxde01/Rdouban/HEAD/inst/images/fan.jpg


--------------------------------------------------------------------------------
/inst/images/daiyu.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qxde01/Rdouban/HEAD/inst/images/daiyu.jpg


--------------------------------------------------------------------------------
/inst/images/dream.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qxde01/Rdouban/HEAD/inst/images/dream.jpg


--------------------------------------------------------------------------------
/inst/images/peach.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qxde01/Rdouban/HEAD/inst/images/peach.jpg


--------------------------------------------------------------------------------
/inst/images/peony.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qxde01/Rdouban/HEAD/inst/images/peony.jpg


--------------------------------------------------------------------------------
/inst/images/plum.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qxde01/Rdouban/HEAD/inst/images/plum.jpg


--------------------------------------------------------------------------------
/inst/images/bamboo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qxde01/Rdouban/HEAD/inst/images/bamboo.jpg


--------------------------------------------------------------------------------
/inst/images/orchid.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qxde01/Rdouban/HEAD/inst/images/orchid.jpg


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Default NAMESPACE created by R
 2 | # Remove the previous line if you edit this file
 3 | 
 4 | # Export all names
 5 | exportPattern(".")
 6 | 
 7 | # Import all packages listed as Imports or Depends
 8 | import(
 9 |   RCurl,
10 |   XML,
11 |   RJSONIO
12 | )
13 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: Rdouban
 2 | Version: 0.2-1
 3 | Date: 2013-12-13
 4 | Title: Provide an interface to the douban comment data
 5 | Author: qxde01@gmail.com
 6 | Depends: R (>= 2.12.0), RCurl,XML,RJSONIO
 7 | Description: douban
 8 | Maintainer:  qxde01 <qxde01@gmail.com>
 9 | License: GPL
10 | Encoding: UTF-8
11 | Packaged: 2013-12-13 22:49:05 UTC; sz
12 | Repository: https://github.com/qxde01/Rdouban
13 | Date/Publication: 2013-12-13 22:57:03


--------------------------------------------------------------------------------
/man/get.book.info.Rd:
--------------------------------------------------------------------------------
 1 | \encoding{UTF-8}
 2 | \name{get.book.info}
 3 | \alias{get.book.info}
 4 | %- Also NEED an '\alias' for EACH other topic documented here.
 5 | \title{豆瓣图书基本信息官方API}
 6 | \description{ 获取豆瓣图书基本信息,包括作者、简介、豆瓣评分等.}
 7 | \usage{
 8 | get.book.info(bookid)
 9 | }
10 | %- maybe also 'usage' for other objects documented here.
11 | \arguments{
12 | \item{bookid}{豆瓣书籍的id号,一串数字,如20326626,可通过图书的豆瓣主页查询}
13 | }
14 | 
15 | \value{
16 |   \item{title}{书名}
17 |   \item{author }{作者}
18 |   \item{rating }{评分：最小值、评分人数、平均值、最大值}
19 |   \item{summary }{内容简介}
20 |   \item{tags }{一个data.frame,常用标签，两列分别是标签名称和标签频数}
21 |   \item{href}{书籍主页网址}
22 |   \item{image }{封面图片网址}
23 |   \item{attribute }{书籍其他信息,比如出本社、价格、页数等}
24 | }
25 | \references{}
26 | \author{
27 |   <\email{qxde01@gmail.com}>
28 | }
29 | 
30 | \examples{
31 | \dontrun{x<-get.book.info(bookid='2567698')}
32 | }
33 | \keyword{book}
34 | \keyword{douban}


--------------------------------------------------------------------------------
/R/get_online_tags.R:
--------------------------------------------------------------------------------
 1 | get_online_tags<-function(results=1000,count=200){
 2 |   pages<-ceiling(results/count)
 3 |   tags<-c()
 4 |   for(pg in 1:pages){
 5 |     u=paste0("http://www.douban.com/online/tag/?start=",(pg-1)*count)
 6 |     cat(pg,", Getting tags from u:",u,"......\n")
 7 |     p=.refreshURL(u)
 8 |     n1<-getNodeSet(p,'//div[@class="article"]//div[@class="indent"]//span')
 9 |     tag<-sapply(n1,xmlValue)
10 |     tag<-gsub("[\n ]|　　","",tag)
11 |     #cat(tag,"\n")
12 |     tag<-unlist(strsplit(tag,"\\(|\\)"))
13 |     word<-tag[seq(1,length(tag),2)]
14 |     freq<-tag[seq(1,length(tag),2)+1]
15 |     url=sapply(getNodeSet(p,'//div[@class="article"]//div[@class="indent"]//a'),
16 |                function(x) xmlGetAttr(x, "href"))
17 |     #cat(length(word),"  ",length(freq),"\n")   
18 |     tags0<-cbind(tag=word,url=url,freq=freq)
19 |     tags<-rbind(tags,tags0)
20 |   }
21 |   row.names(tags)<-NULL
22 |   return(as.data.frame(tags,stringsAsFactors=F))
23 | }


--------------------------------------------------------------------------------
/man/get_movie_comments.Rd:
--------------------------------------------------------------------------------
 1 | \encoding{UTF-8}
 2 | \name{get_movie_comments}
 3 | \alias{get_movie_comments} 
 4 | \title{获取豆瓣电影的短评}
 5 | \description{获取豆瓣电影的短评.}
 6 | \usage{get_movie_comments(movieid,results=100,fresh=10,verbose=TRUE,...)}
 7 | \arguments{
 8 |   \item{movieid}{ 豆瓣电影的id号}
 9 |   \item{results}{获取评论的数量,默认100}
10 |   \item{fresh}{页面异常时最大刷新次数,默认20}
11 |   \item{verbose}{若\code{verbose=TRUE则显示详细过程信息，否则不显示，默认\code{TRUE}}}
12 | }
13 | 
14 | \value{一个data.frame，各列为：
15 |   \item{author}{文章作者昵称}
16 |   \item{author_uri }{作者豆瓣URI}
17 |   \item{published }{短评发表时间}
18 |   \item{votes }{短评被赞为有用的次数}
19 |   \item{comment}{短评内容}
20 |   \item{rating}{作者对电影的评分}
21 |  
22 | }
23 | \note{每页获取短评个数为20,如果results不是20的倍数，则获取的实际影评数量是\code{ceiling(results/20)*20}.}
24 | \author{
25 | qxde01<\email{qxde01@gmail.com}>
26 | }
27 | \seealso{
28 |  \code{\link{get_movie_reviews}}
29 | }
30 | \examples{
31 | ## http://movie.douban.com/subject/5308265/comments
32 | \dontrun{x=get_movie_comments(movieid=5308265,results=100)}
33 | }
34 | \keyword{comment}
35 | \keyword{movie}


--------------------------------------------------------------------------------
/man/get_movie_info.Rd:
--------------------------------------------------------------------------------
 1 | \encoding{UTF-8}
 2 | \name{get_movie_info}
 3 | \alias{get_movie_info}
 4 | \title{获取豆瓣影视的基本信息}
 5 | \description{获取豆瓣影视剧的基本信息,包括电影名称、导演、演员、豆瓣评分等信息.
 6 | }
 7 | \usage{
 8 | get_movie_info(movieid,...)
 9 | }
10 | \arguments{
11 |   \item{movieid}{豆瓣电影和电视剧的ID,由数字组成的字符串,可通过每部电影或电视剧的豆瓣主页查询.}
12 | }
13 | \value{一个列表(list), 包括:
14 |   \item{title}{电影名称}
15 |   \item{author}{导演}
16 |   \item{rating}{用户评分信息,\code{average}为总分,\code{votes}指参与的评分人数,\code{starsx}指对应评分用户数所占的比例}
17 |   \item{summary}{电影或电视剧简介}
18 |   \item{tags}{\code{tag_label}标签名称及对应的使用频率\code{tag_freq}}
19 |   \item{href}{电影主页URL}
20 |   \item{image}{电影海报图片URL}
21 |   \item{reviews_total}{长影评数量}
22 |   \item{comments_total}{短影评数量}
23 |   \item{audience}{观众信息,\code{doings}指正在观看的用户数量,\code{collections}指已观看的用户数量,\code{wishes}指想看的用户数量}
24 |   \item{attribute}{一个字符串,豆瓣影视剧的基本信息,包括电影名称、导演、演员}
25 | }
26 | \author{<\email{qxde01@gmail.com}>
27 | }
28 | 
29 | \examples{
30 | ## http://movie.douban.com/subject/5308265/
31 | \dontrun{m<-get_movie_info(movieid='5308265')}
32 | }
33 | \keyword{movie}
34 | 


--------------------------------------------------------------------------------
/man/get_book_info.Rd:
--------------------------------------------------------------------------------
 1 | \encoding{UTF-8}
 2 | \name{get_book_info}
 3 | \alias{get_book_info} 
 4 | \title{获取豆瓣图书基本信息}
 5 | \description{ 获取豆瓣图书基本信息,包括作者、简介、豆瓣评分等.}
 6 | \usage{get_book_info(bookid,...)
 7 | }
 8 | \arguments{
 9 |   \item{bookid}{豆瓣书籍的id号,一串数字,如20326626,可通过图书的豆瓣主页查询}}
10 | \value{一个列表list,包括:
11 |   \item{title}{书名}
12 |   \item{author}{作者}
13 |   \item{rating}{长度为7的向量,用户评分信息,\code{avergae}为总分,\code{votes}指参与的评分人数,\code{starsx}指对应评分用户的所占的比例}
14 |   \item{href}{书籍主页URL}
15 |   \item{image}{书籍封面图片URL}
16 |   \item{summary}{书籍内容简介}
17 |   \item{author_intro}{作者简介}
18 |   \item{tags}{\code{tag_label}标签名称及对应的使用频率\code{tag_freq}}
19 |   \item{comments_total}{评论数量}
20 |   \item{notes_total}{为笔记数量}
21 |   \item{readers}{\code{doings}指正在阅读的用户数量,\code{collections}指已读的用户数量,\code{wishes}指想读的用户数量}
22 |   \item{attribute}{书籍的基本信息,如作者、出版社、ISBN等} 
23 | }
24 | 
25 | \author{
26 |    <\email{qxde01@gmail.com}>
27 | }
28 | \examples{
29 | ## http://book.douban.com/subject/1291204/
30 | \dontrun{book<-get_book_info(bookid='1291204')}
31 | }
32 | \keyword{book}
33 | \keyword{douban}
34 | 


--------------------------------------------------------------------------------
/man/get_movie_discussions.Rd:
--------------------------------------------------------------------------------
 1 | \encoding{UTF-8}
 2 | \name{get_movie_discussions}
 3 | \alias{get_movie_discussions}
 4 | \title{获取豆瓣电影的话题讨论内容
 5 | }
 6 | \description{获取某个豆瓣电影的话题讨论内容.
 7 | }
 8 | \usage{
 9 | get_movie_discussions(movieid,results = 100, fresh = 10,count=20, verbose = TRUE,...)
10 | }
11 | \arguments{
12 |   \item{movieid}{豆瓣电影ID}
13 |   \item{results}{获取话题讨论的数量,默认100}
14 |   \item{fresh}{页面异常时最大刷新次数,默认10}
15 |   \item{count}{每页discussion_uri数量,默认20，最大20}
16 |   \item{verbose}{若\code{verbose=TRUE则显示详细过程信息，否则不显示，默认\code{TRUE}}}
17 | }
18 | \value{一个data.frame,各列为:
19 |  \item{discussion_uri }{话题主页URL}
20 |  \item{title }{话题标题题}
21 |   \item{published }{话题发表的时间}
22 |  \item{author}{作者昵称}
23 |  \item{author_uri }{作者豆瓣URI}
24 |  \item{discussion }{话题详细内容}
25 |   \item{useful }{被赞为有用的次数}
26 |   \item{unuseful}{被评为无用的次数}
27 | }
28 | 
29 | \author{
30 | qxde01<\email{qxde01@gmail.com}>
31 | }
32 | 
33 | \examples{
34 | ## http://movie.douban.com/subject/5308265/discussion/
35 | \dontrun{get_movie_discussions(movieid=5308265,n=100)}
36 | }
37 | \keyword{discussion}
38 | 


--------------------------------------------------------------------------------
/man/get_music_discussions.Rd:
--------------------------------------------------------------------------------
 1 | \encoding{UTF-8}
 2 | \name{get_music_discussions}
 3 | \alias{get_music_discussions}
 4 | \title{获取豆瓣音乐专辑的话题讨论内容
 5 | }
 6 | \description{获取豆瓣音乐专辑的话题讨论内容.
 7 | }
 8 | \usage{
 9 | get_music_discussions(musicid,results = 100, fresh = 10,count=20, verbose = TRUE,...)
10 | }
11 | \arguments{
12 |   \item{musicid}{豆瓣音乐专辑的ID}
13 |   \item{results}{获取话题讨论的数量,默认100}
14 |   \item{fresh}{页面异常时最大刷新次数,默认10}
15 |   \item{count}{每页discussion_uri数量,默认20，最大20}
16 |   \item{verbose}{若\code{verbose=TRUE则显示详细过程信息，否则不显示，默认\code{TRUE}}}
17 | }
18 | \value{一个data.frame,各列为:
19 |  \item{discussion_uri }{话题主页URL}
20 |  \item{title }{话题标题题}
21 |   \item{published }{话题发表的时间}
22 |  \item{author}{作者昵称}
23 |  \item{author_uri }{作者豆瓣URI}
24 |  \item{discussion }{话题详细内容}
25 |   \item{useful }{被赞为有用的次数}
26 |   \item{unuseful}{被评为无用的次数}
27 | }
28 | 
29 | \author{
30 | qxde01<\email{qxde01@gmail.com}>
31 | }
32 | 
33 | \examples{
34 | ## http://music.douban.com/subject/3843530/discussion/
35 | \dontrun{a<-get_music_discussions(musicid=3843530,results = 100)}
36 | }
37 | \keyword{discussion}
38 | 


--------------------------------------------------------------------------------
/R/get.movie.info.R:
--------------------------------------------------------------------------------
 1 | get.movie.info<-function(movieid){
 2 |   u=paste0("https://api.douban.com/v2/movie/",movieid)
 3 |   #p=getURL(u,ssl.verifypeer = FALSE)
 4 |   p<-.refreshURL(u,ssl.verifypeer = FALSE)
 5 |   reslist <- fromJSON(p)
 6 |   title<-reslist[["title"]]
 7 |   author<-unlist(reslist[["author"]]);names(author)<-NULL
 8 |   rating<-unlist(reslist[["rating"]])
 9 |   tags<-reslist[["tags"]]
10 |   summary<-reslist[["summary"]]
11 |   tags<-data.frame(tag_label=sapply(tags,function(x)x[["name"]]),
12 |                    tag_freq=sapply(tags,function(x)x[["count"]]),
13 |                    stringsAsFactors=F)
14 |   image<-reslist[['image']]
15 |   href<-reslist[["alt"]]
16 |   reslist$title<-NULL;reslist$author<-NULL;
17 |   reslist$rating<-NULL;reslist$tags<-NULL;
18 |   reslist$summary<-NULL;reslist$alt<-NULL
19 |   reslist$image<-NULL
20 |   attribute=reslist$attrs
21 |   
22 |   list(title=title,
23 |        author=author,
24 |        rating=as.double(rating),
25 |        summary=summary,
26 |        tags=tags,
27 |        href=href,
28 |        image=image,
29 |        attribute= attribute)
30 | }


--------------------------------------------------------------------------------
/man/get_book_discussions.Rd:
--------------------------------------------------------------------------------
 1 | \encoding{UTF-8}
 2 | \name{get_book_discussions} 
 3 | \alias{get_book_discussions}
 4 | \title{获取豆瓣图书的话题讨论内容
 5 | }
 6 | \description{获取豆瓣图书话题讨论内容.
 7 | }
 8 | \usage{
 9 | get_book_discussions(bookid,results=100,fresh=10,count=20,verbose=TRUE)
10 | }
11 | \arguments{
12 |   \item{bookid}{ 豆瓣书籍的id号,一串数字,如2567698}
13 |   \item{results}{获取话题讨论的数量,默认100}
14 |   \item{fresh}{页面异常时最大刷新次数,默认10}
15 |   \item{count}{每页discussion_uri数量,默认20，最大20}
16 |   \item{verbose}{若\code{verbose=TRUE则显示详细过程信息，否则不显示，默认\code{TRUE}}}
17 | }
18 | \value{一个data.frame,各列为:
19 |  \item{discussion_uri }{话题主页URL}
20 |  \item{title }{话题标题题}
21 |   \item{published }{话题发表的时间}
22 |  \item{author}{作者昵称}
23 |  \item{author_uri }{作者豆瓣URI}
24 |  \item{discussion }{话题详细内容}
25 |   \item{useful }{被赞为有用的次数}
26 |   \item{unuseful}{被评为无用的次数}
27 | }
28 | \note{}
29 | \author{
30 | qxde01<\email{qxde01@gmail.com}>
31 | }
32 | \seealso{
33 | 
34 | }
35 | \examples{
36 | ## http://book.douban.com/subject/1291204/discussion/
37 | \dontrun{b<-get_book_discussions(bookid=1291204,results=100)}
38 | }
39 | \keyword{discussion}
40 | 


--------------------------------------------------------------------------------
/man/get_music_reviews.Rd:
--------------------------------------------------------------------------------
 1 | \encoding{UTF-8}
 2 | \name{get_music_reviews}
 3 | \alias{get_music_reviews}
 4 | \title{获取豆瓣音乐评论信息.}
 5 | \description{获取豆瓣音乐评论信息.}
 6 | \usage{
 7 | get_music_reviews(musicid,results = 100, fresh = 10,count=25, verbose = TRUE,...)
 8 | }
 9 | %- maybe also 'usage' for other objects documented here.
10 | \arguments{
11 |   \item{musicid}{豆瓣音乐ID}
12 |   \item{results}{获取评论的数量,默认100}
13 |   \item{fresh}{页面异常时最大刷新次数,默认10}
14 |   \item{count}{每页review_uri数量,默认25，最大25}
15 |   \item{verbose}{若\code{verbose=TRUE则显示详细过程信息，否则不显示，默认\code{TRUE}}}
16 | }
17 | \value{一个data.frame，各列为：
18 |  \item{review_uri }{评论主页URL}
19 |  \item{title }{评论文章名称}
20 |   \item{published }{文章发表时间}
21 |  \item{author}{文章作者昵称}
22 |  \item{author_uri }{作者豆瓣URI}
23 |  \item{review }{评论文章详细内容}
24 |  \item{rating }{作者对电影的评分}
25 |   \item{useful}{被评为useful的次数}
26 |   \item{unuseful}{被评为unuseful的次数}  
27 | }
28 | \author{
29 | qxde01<\email{qxde01@gmail.com}>
30 | }
31 | 
32 | \examples{
33 |  ## http://music.douban.com/subject/3843530/reviews
34 |  \dontrun{x<-get_music_reviews(musicid=3843530,results = 100)}
35 | }
36 | \keyword{music}
37 | \keyword{review}
38 | 


--------------------------------------------------------------------------------
/R/get.book.info.R:
--------------------------------------------------------------------------------
 1 | ##'@douban api of book information
 2 | ##'@bookid
 3 | ##'@version
 4 | get.book.info <- function(bookid) {
 5 |   u = paste0("https://api.douban.com/v2/book/", bookid)
 6 |   # p=getURL(u,ssl.verifypeer = FALSE)
 7 |   p <- .refreshURL(u, ssl.verifypeer = FALSE)
 8 |   reslist <- fromJSON(p)
 9 |   title <- reslist[["title"]]
10 |   author <- reslist[["author"]]
11 |   rating <- unlist(reslist[["rating"]])
12 |   tags <- reslist[["tags"]]
13 |   summary <- reslist[["summary"]]
14 |   tags <- data.frame(tag_label = sapply(tags, function(x) x[["name"]]), 
15 |                      tag_freq = sapply(tags,  function(x) x[["count"]]),
16 |                      stringsAsFactors = F)
17 |   image <- reslist[["images"]]
18 |   href <- reslist[["alt"]]
19 |   reslist$title <- NULL
20 |   reslist$author <- NULL
21 |   reslist$rating <- NULL
22 |   reslist$tags <- NULL
23 |   reslist$summary <- NULL
24 |   reslist$alt <- NULL
25 |   reslist$images <- NULL
26 |   attribute = reslist
27 |   
28 |   list(title = title, author = author, rating = as.double(rating), summary = summary, 
29 |        tags = tags, href = href, image = image, attribute = attribute)
30 | }
31 | 
32 | 


--------------------------------------------------------------------------------
/man/get_book_notes.Rd:
--------------------------------------------------------------------------------
 1 | \encoding{UTF-8}
 2 | \name{get_book_notes}
 3 | \alias{get_book_notes}
 4 | \title{获取豆瓣图书的读书笔记}
 5 | \description{获取豆瓣图书的读书笔记.}
 6 | \usage{get_book_notes(bookid,results=100,fresh=10,count=10,verbose=TRUE,...)}
 7 | \arguments{
 8 |   \item{bookid}{ 豆瓣书籍的id号,一串数字,如2567698}
 9 |   \item{results}{获取评论的数量,默认100}
10 |   \item{fresh}{页面异常时最大刷新次数,默认10}
11 |   \item{count}{每页note_uri数量,默认10，最大10}
12 |   \item{verbose}{若\code{verbose=TRUE则显示详细过程信息，否则不显示，默认\code{TRUE}}}
13 | }
14 | \value{一个data.frame，各列为：
15 |  \item{note_uri }{笔记主页URL}
16 |  \item{title }{笔记标题}
17 |   \item{published }{笔记发表时间}
18 |  \item{author}{作者昵称}
19 |  \item{author_uri }{作者豆瓣URI}
20 |  \item{note }{笔记详细内容}
21 |  \item{rating }{作者对书籍的评分}
22 |   \item{readers}{被阅读的次数}
23 |   \item{collectors}{被收藏的次数}
24 | 
25 | }
26 | \note{若results不是count的倍数，则获取到的数量是\code{ceiling(results/count)*count}.}
27 | \author{
28 |   <\email{qxde01@gmail.com}>
29 | }
30 | 
31 | \seealso{\code{\link{get_book_reviews}}
32 | }
33 | \examples{
34 | ## http://book.douban.com/subject/1291204/annotation
35 | \dontrun{out<-get_book_notes(bookid='1291204',results=50)}
36 | }
37 | \keyword{note}
38 | \keyword{douban}
39 | 


--------------------------------------------------------------------------------
/man/get_music_info.Rd:
--------------------------------------------------------------------------------
 1 | \encoding{UTF-8}
 2 | \name{get_music_info}
 3 | \alias{get_music_info}
 4 | \title{获取豆瓣音乐的专辑信息
 5 | }
 6 | \description{
 7 | 获取豆瓣音乐的专辑信息,包括歌手、专辑名称等。
 8 | }
 9 | \usage{get_music_info(musicid)
10 | }
11 | %- maybe also 'usage' for other objects documented here.
12 | \arguments{
13 |   \item{musicid}{豆瓣音乐专辑ID号 }
14 | }
15 | \value{
16 |   \item{title}{音乐专辑名称}
17 |   \item{author }{作者}
18 |   \item{rating}{评分信息,\code{average}为总评分,\code{votes}为评分听众总数,\code{starx}为对应的评分听众所占总评分的比例 }
19 |   \item{summary }{内容简介}
20 |   \item{tags }{一个data.frame,常用标签，两列分别是标签名称和标签频数}
21 |   \item{songs}{专辑所包括的歌曲名称}
22 |   \item{href}{音乐专辑主页网址}
23 |   \item{image }{封面图片网址}
24 |   \item{listeners}{听众信息,
25 |    \code{doings}正在听的听众数量,\code{collections}已听过的听众数量,
26 |    \code{wishes}指想听的用户数量}
27 |   \item{comments_total}{发表评论的听众数}
28 |   \item{attribute }{专辑的其他信息,比如出版公司、出版时间等}
29 | }
30 | \references{
31 | %% ~put references to the literature/web site here ~
32 | }
33 | \author{
34 |   <\email{qxde01@gmail.com}>
35 | }
36 | \examples{
37 | ## http://music.douban.com/subject/3843530/
38 | \dontrun{music<-get_music_info(musicid='3843530')}
39 | }
40 | \keyword{music}
41 | \keyword{douban}
42 | 


--------------------------------------------------------------------------------
/man/get_book_reviews.Rd:
--------------------------------------------------------------------------------
 1 | \encoding{UTF-8} 
 2 | \name{get_book_reviews}
 3 | \alias{get_book_reviews}
 4 | \title{获取豆瓣图书的评论文章}
 5 | \description{获取豆瓣图书的评论文章.}
 6 | \usage{get_book_reviews(bookid,results=100,fresh=10,count=10,verbose=TRUE,...)
 7 | }
 8 | \arguments{
 9 |   \item{bookid}{ 豆瓣书籍的id号,一串数字,如2567698}
10 |   \item{results}{获取评论的数量,默认100}
11 |   \item{fresh}{页面异常时最大刷新次数,默认10}
12 |   \item{count}{每页review_uri数量,默认25，最大25}
13 |   \item{verbose}{若\code{verbose=TRUE则显示详细过程信息，否则不显示，默认\code{TRUE}}}
14 | }
15 | \value{一个data.frame，各列为：
16 |  \item{review_uri }{评论主页URL}
17 |  \item{title }{评论文章名称}
18 |   \item{published }{文章发表时间}
19 |  \item{author}{文章作者昵称}
20 |  \item{author_uri }{作者豆瓣URI}
21 |  \item{review }{评论文章详细内容}
22 |  \item{rating }{作者对书籍的评分}
23 |   \item{useful}{被评为useful的次数}
24 |   \item{unuseful}{被评为unuseful的次数}  
25 | }
26 | 
27 | \note{若results不是count的倍数，则获取到的数量是\code{ceiling(results/count)*count}.}
28 | \author{qxde01<\email{qxde01@gmail.com}>
29 | }
30 | \seealso{
31 |  \code{\link{get_book_notes}}
32 | }
33 | \examples{
34 | ## http://book.douban.com/subject/2567698/reviews
35 | \dontrun{out<-get_book_reviews(bookid=2567698,results=500)}
36 | }
37 | \keyword{comment}
38 | \keyword{review}
39 | \keyword{douban}
40 | 


--------------------------------------------------------------------------------
/man/get_movie_reviews.Rd:
--------------------------------------------------------------------------------
 1 | \encoding{UTF-8}
 2 | \name{get_movie_reviews}
 3 | \alias{get_movie_reviews} 
 4 | \title{获取豆瓣电影的长篇影评
 5 | }
 6 | \description{获取豆瓣电影和电视剧的长篇影评.
 7 | }
 8 | \usage{get_movie_reviews(movieid,results=100,fresh=10,count=10,verbose=TRUE,...)
 9 | }
10 | \arguments{
11 |   \item{movieid}{豆瓣电影的id号}
12 |   \item{results}{获取评论的数量,默认100}
13 |   \item{fresh}{页面异常时最大刷新次数,默认10}
14 |   \item{count}{每页review_uri数量,默认10，最大20}
15 |   \item{verbose}{若\code{verbose=TRUE则显示详细过程信息，否则不显示，默认\code{TRUE}}}
16 | 
17 | }
18 | \value{一个data.frame，各列为：
19 |  \item{review_uri }{影评主页URL}
20 |  \item{title }{评论文章名称}
21 |   \item{published }{文章发表时间}
22 |  \item{author}{文章作者昵称}
23 |  \item{author_uri }{作者豆瓣URI}
24 |  \item{review }{评论文章详细内容}
25 |  \item{rating }{作者对电影的评分}
26 |   \item{useful}{被评为useful的次数}
27 |   \item{unuseful}{被评为unuseful的次数}  
28 | }
29 | \author{
30 | qxde01<\email{qxde01@gmail.com}>
31 | }
32 | \note{一个值得注意的情况是不同页之间可能有相同的review_uri,最后获得评论数目可能和results的值不同。}
33 | \seealso{
34 |  \code{\link{get_movie_comments}}
35 | }
36 | \examples{
37 | ## http://movie.douban.com/subject/5308265/reviews
38 | \dontrun{x<-get_movie_reviews(movieid=5308265,results=50)}
39 | }
40 | \keyword{review}
41 | \keyword{comment}
42 | 


--------------------------------------------------------------------------------
/man/get.movie.info.Rd:
--------------------------------------------------------------------------------
 1 | \encoding{UTF-8}
 2 | \name{get.movie.info}
 3 | \alias{get.movie.info}
 4 | %- Also NEED an '\alias' for EACH other topic documented here.
 5 | \title{
 6 | 获取豆瓣电影的基本信息
 7 | }
 8 | \description{
 9 | 获取豆瓣电影的基本信息,API V2接口。
10 | }
11 | \usage{
12 | get.movie.info(movieid)
13 | }
14 | %- maybe also 'usage' for other objects documented here.
15 | \arguments{
16 |   \item{movieid}{豆瓣电影的ID,由数字组成,可通过每部电影的豆瓣主页查询.}
17 | }
18 | 
19 | \value{
20 |   \item{title}{电影名称}
21 |   \item{author }{作者}
22 |   \item{rating }{评分：最小值、评分人数、平均值、最大值}
23 |   \item{summary }{内容简介}
24 |   \item{tags }{一个data.frame,常用标签，两列分别是标签名称和标签频数}
25 |   \item{href}{影视主页网址}
26 |   \item{image }{封面图片网址}
27 |   \item{attribute }{一个list,电影其他信息,比如演员、上映时间等}
28 | }
29 | \references{
30 | %% ~put references to the literature/web site here ~
31 | }
32 | \author{
33 |   <\email{qxde01@gmail.com}>
34 | }
35 | \note{
36 | %%  ~~further notes~~
37 | }
38 | 
39 | %% ~Make other sections like Warning with \section{Warning }{....} ~
40 | 
41 | \seealso{
42 | %% ~~objects to See Also as \code{\link{help}}, ~~~
43 | }
44 | \examples{
45 | ## http://movie.douban.com/subject/5308265/
46 | \dontrun{get.movie.info(movieid='5308265')}
47 | }
48 | \keyword{movie}
49 | 
50 | 


--------------------------------------------------------------------------------
/man/get.movie.comment.Rd:
--------------------------------------------------------------------------------
 1 | \encoding{UTF-8}
 2 | \name{get.movie.comment}
 3 | \alias{get.movie.comment}
 4 | %- Also NEED an '\alias' for EACH other topic documented here.
 5 | \title{
 6 | 通过移动网页获取豆瓣电影的短评.
 7 | }
 8 | \description{
 9 | 通过移动网页m.douban.com获取豆瓣电影的短评。
10 | }
11 | \usage{
12 | get.movie.comment(movieid,results=100,fresh=20,verbose=TRUE)
13 | }
14 | %- maybe also 'usage' for other objects documented here.
15 | \arguments{
16 |   \item{movieid}{ 豆瓣电影的id号}
17 |   \item{results}{获取评论的数量,默认100}
18 |   \item{fresh}{页面异常时最大刷新次数,默认20}
19 |   \item{verbose}{若\code{verbose=TRUE则显示详细过程信息，否则不显示，默认\code{TRUE}}}
20 | }
21 | 
22 | \details{
23 | %%  ~~ If necessary, more details than the description above ~~
24 | }
25 | \value{
26 |  \item{author}{作者昵称}
27 |  \item{author_id }{作者豆瓣ID}
28 |  \item{comment }{短评内容}
29 |  \item{rating }{作者对电影的评分}
30 | }
31 | \references{
32 | %% ~put references to the literature/web site here ~
33 | }
34 | \author{
35 | qxde01<\email{qxde01@gmail.com}>
36 | }
37 | 
38 | \examples{
39 | 
40 | ## http://m.douban.com/movie/subject/11627047/comments
41 | \dontrun{cmt<-get.movie.comment(movieid=11627047,fresh=20,results=100,verbose=TRUE)}
42 | }
43 | 
44 | \keyword{movie}
45 | \keyword{comment}
46 | \keyword{douban}
47 | 
48 | 


--------------------------------------------------------------------------------
/man/get.music.info.Rd:
--------------------------------------------------------------------------------
 1 | \encoding{UTF-8}
 2 | \name{get.music.info}
 3 | \alias{get.music.info}
 4 | %- Also NEED an '\alias' for EACH other topic documented here.
 5 | \title{
 6 | 获取豆瓣音乐的专辑信息
 7 | }
 8 | \description{
 9 | 获取豆瓣音乐的专辑信息,API V2接口。
10 | }
11 | \usage{
12 | get.music.info(musicid)
13 | }
14 | %- maybe also 'usage' for other objects documented here.
15 | \arguments{
16 | \item{musicid}{豆瓣音乐专辑ID号 }
17 | }
18 | 
19 | \value{
20 |   \item{title}{音乐专辑名称}
21 |   \item{author }{作者}
22 |   \item{rating }{评分：最小值、评分人数、平均值、最大值}
23 |   \item{summary }{内容简介}
24 |   \item{tags }{一个data.frame,常用标签，两列分别是标签名称和标签频数}
25 |   \item{songs}{专辑所包括的歌曲名称}
26 |   \item{href}{音乐专辑主页网址}
27 |   \item{image }{封面图片网址}
28 |   \item{attribute }{一个list,专辑的其他信息,比如出版公司、出版时间等}
29 | }
30 | \references{
31 | %% ~put references to the literature/web site here ~
32 | }
33 | \author{
34 |   <\email{qxde01@gmail.com}>
35 | }
36 | \note{
37 | %%  ~~further notes~~
38 | }
39 | 
40 | %% ~Make other sections like Warning with \section{Warning }{....} ~
41 | 
42 | \seealso{
43 | %% ~~objects to See Also as \code{\link{help}}, ~~~
44 | }
45 | 
46 | \examples{
47 | ## http://music.douban.com/subject/3843530/
48 | \dontrun{get.music.info(musicid='3843530')}
49 | }
50 | \keyword{music}
51 | \keyword{douban}
52 | 


--------------------------------------------------------------------------------
/man/user_movie_viz.Rd:
--------------------------------------------------------------------------------
 1 | \encoding{UTF-8}
 2 | \name{user_movie_viz}
 3 | \alias{user_movie_viz}
 4 | %- Also NEED an '\alias' for EACH other topic documented here.
 5 | \title{
 6 | 豆瓣用户观影统计及可视化函数
 7 | }
 8 | \description{
 9 | 豆瓣用户观影统计及可视化函数,观影综合统计、标签云图、海报拼图等。
10 | }
11 | \usage{
12 | user_movie_viz(x,YEAR="2013",stopwords=stopwords)
13 | }
14 | %- maybe also 'usage' for other objects documented here.
15 | \arguments{
16 |   \item{x}{有函数\code{user_book_status}产生的数据}
17 |   \item{YEAR}{年份，比如"2013",如果为NULL，则统计所有数据}
18 |   \item{stopwords}{\code{Package:Rwordseg}分词时使用的中文停止词}
19 | }
20 | \details{
21 | %%  ~~ If necessary, more details than the description above ~~
22 | }
23 | \value{
24 | 产生以下图形：
25 | \item{1.}{观影综合统计}
26 | \item{2.}{每月统计(部,时长)}
27 | \item{3.}{根据电影标签归类}
28 | \item{4.}{电影标签云图}
29 | \item{5.}{电影名称与标签之间的关系图}
30 | \item{6.}{短评云图}
31 | \item{7.}{海报拼图}
32 | \item{8.}{导演和演员合作关系图}
33 | 
34 | }
35 | \references{
36 | %% ~put references to the literature/web site here ~
37 | }
38 | \author{
39 | qxde01<\email{qxde01@gmail.com}>
40 | }
41 | 
42 | \examples{
43 | ## http://movie.douban.com/people/qxde01
44 | \dontrun{
45 |    qxde<-user_movie_status(userid="qxde01")
46 |    data(stopwords)
47 |    user_movie_viz(x=qxde,YEAR=NULL,stopwords=stopwords)
48 |  }
49 | }
50 | 
51 | \keyword{ douban }
52 | \keyword{ movie }


--------------------------------------------------------------------------------
/man/get.book.review.Rd:
--------------------------------------------------------------------------------
 1 | \encoding{UTF-8}
 2 | \name{get.book.review}
 3 | \alias{get.book.review}
 4 | %- Also NEED an '\alias' for EACH other topic documented here.
 5 | \title{
 6 | 获取豆瓣图书的评论文章.
 7 | }
 8 | \description{
 9 | 获取豆瓣图书的评论文章,部分使用了官方API V1.
10 | }
11 | \usage{
12 | get.book.review(bookid,results=100,verbose=TRUE)
13 | }
14 | %- maybe also 'usage' for other objects documented here.
15 | \arguments{
16 |   \item{bookid}{ 豆瓣书籍的id号,一串数字,如20326626}
17 |   \item{results}{获取评论的数量,默认results=100}
18 |   \item{verbose}{若\code{verbose=TRUE则显示详细过程信息，否则不显示，默认\code{TRUE}}}
19 | }
20 | 
21 | \details{
22 | %%  ~~ If necessary, more details than the description above ~~
23 | }
24 | \value{一个data.frame，各列为：
25 |  \item{title }{评论文章名称}
26 |  \item{review_id }{评论文章的豆瓣ID}
27 |  \item{review }{评论文章详细内容}
28 |  \item{author}{文章作者昵称}
29 |  \item{author_id }{作者豆瓣ID}
30 |  \item{published }{文章发表时间}
31 |  \item{ updated}{文章最后一次修改时间}
32 |  \item{comments }{文章被评论的数量}
33 |  \item{ useless}{文章被赞的次数}
34 |  \item{votes }{文章被推荐次数}
35 |  \item{rating }{作者对书籍的评分}
36 | }
37 | 
38 | \author{
39 | qxde01<\email{qxde01@gmail.com}>
40 | }
41 | 
42 | \examples{
43 | 
44 | ## http://book.douban.com/subject/2567698/reviews
45 | \dontrun{x<-get.book.review(bookid=2567698,results=55)}
46 | }
47 | 
48 | \keyword{comment}
49 | \keyword{review}
50 | \keyword{douban}
51 | 


--------------------------------------------------------------------------------
/man/user_status.Rd:
--------------------------------------------------------------------------------
 1 | \encoding{UTF-8}
 2 | \name{user_status}
 3 | \alias{user_status}
 4 | %- Also NEED an '\alias' for EACH other topic documented here.
 5 | \title{
 6 | 获取豆瓣用户的说说
 7 | }
 8 | \description{
 9 | 获取豆瓣用户的说说,最多能获取最近的200条
10 | }
11 | \usage{
12 | user_status(usrid,results=200)
13 | }
14 | %- maybe also 'usage' for other objects documented here.
15 | \arguments{
16 |   \item{usrid}{豆瓣用户ID}
17 |   \item{results}{获取用户说说的数量,最多可获取最近200条}
18 | }
19 | \details{
20 | %%  ~~ If necessary, more details than the description above ~~
21 | }
22 | \value{
23 | 
24 | 一个数据框(data.frame),各列为：
25 |  \item{data_sid }{数据ID}
26 |  \item{data_object_kind }{说说类别}
27 |  \item{data_target_type}{说说来源类别}
28 |  \item{txt }{说说、分享内容}
29 |  \item{created_at}{说说发表时间}
30 |  \item{saying }{说说内容}
31 |  \item{reply }{回应的次数}
32 | }
33 | \references{
34 | %% ~put references to the literature/web site here ~
35 | }
36 | \author{
37 | qxde01<\email{qxde01@gmail.com}>
38 | }
39 | \note{
40 | %%  ~~further notes~~
41 | }
42 | 
43 | %% ~Make other sections like Warning with \section{Warning }{....} ~
44 | 
45 | \seealso{
46 | %% ~~objects to See Also as \code{\link{help}}, ~~~
47 | }
48 | 
49 | \examples{
50 | ## http://www.douban.com/people/qxde01
51 | \dontrun{qxde<-user_status(userid='qxde01',results=100)}
52 | }
53 |   
54 | \keyword{ douban }
55 | 
56 | 
57 | 


--------------------------------------------------------------------------------
/R/get.music.info.R:
--------------------------------------------------------------------------------
 1 | get.music.info<-function(musicid){
 2 |   u=paste0("https://api.douban.com/v2/music/",musicid)
 3 |   #p=getURL(u,ssl.verifypeer = FALSE)
 4 |   p<-.refreshURL(u,ssl.verifypeer = FALSE)
 5 |   reslist <- fromJSON(p)
 6 |   title<-reslist[["title"]]
 7 |   author<-unlist(reslist[["author"]]);names(author)<-NULL
 8 |   rating<-unlist(reslist[["rating"]])
 9 |   summary<-reslist[["summary"]]
10 |   tags<-reslist[["tags"]]
11 |   
12 |   tags<-data.frame(tag_label=sapply(tags,function(x)x[["name"]]),
13 |                    tag_freq=sapply(tags,function(x)x[["count"]]),
14 |                    stringsAsFactors=F)
15 |   image<-reslist[['image']]
16 |   href<-reslist[["alt"]]
17 |   reslist$title<-NULL;reslist$author<-NULL;
18 |   reslist$rating<-NULL;reslist$tags<-NULL;
19 |   reslist$summary<-NULL;reslist$alt<-NULL
20 |   reslist$image<-NULL
21 |   attribute=reslist$attrs
22 |   songs<-t(sapply(attribute$songs,
23 |                   function(x)c(x[["index"]],x[["title"]],x[["name"]])))
24 |   attribute$songs<-NULL
25 |   colnames(songs)<-c("index","title","name")
26 |   list(title=title,
27 |        author=author,
28 |        rating=as.double(rating),
29 |        summary=summary,
30 |        tags=tags,
31 |        songs=as.data.frame(songs,stringsAsFactors=F),
32 |        href=href,
33 |        image=image,
34 |        attribute= attribute)
35 | }
36 | 


--------------------------------------------------------------------------------
/man/get.movie.review.Rd:
--------------------------------------------------------------------------------
 1 | \encoding{UTF-8}
 2 | \name{get.movie.review}
 3 | \alias{get.movie.review}
 4 | %- Also NEED an '\alias' for EACH other topic documented here.
 5 | \title{
 6 | 通过移动网页获取豆瓣电影的长篇评论文章.
 7 | }
 8 | \description{
 9 | 通过移动网页m.douban.com获取豆瓣电影的长篇评论文章.
10 | }
11 | \usage{
12 | get.movie.review(movieid,results=100,fresh=20,verbose=TRUE)
13 | }
14 | 
15 | \arguments{
16 |   \item{movieid}{ 豆瓣电影的id号}
17 |   \item{results}{获取评论的数量,默认100}
18 |   \item{fresh}{页面异常时最大刷新次数,默认20}
19 |   \item{verbose}{若\code{verbose=TRUE则显示详细过程信息，否则不显示，默认\code{TRUE}}}
20 | }
21 | \value{一个data.frame，各列为：
22 |  \item{review_id }{评论文章的豆瓣ID}
23 |  \item{title }{评论文章名称}
24 |  \item{review }{评论文章详细内容}
25 |  \item{author}{文章作者昵称}
26 |  \item{author_uri }{作者豆瓣URI}
27 |  \item{published }{文章发表时间}
28 |  \item{votes }{文章被推荐次数}
29 |  \item{rating }{作者对电影的评分}
30 | }
31 | \references{
32 | %% ~put references to the literature/web site here ~
33 | }
34 | \author{
35 | qxde01<\email{qxde01@gmail.com}>
36 | }
37 | \note{
38 | 连续获取会被豆瓣阻止,出现 Forbidden 错误.
39 | }
40 | 
41 | \seealso{
42 | %% ~~objects to See Also as \code{\link{help}}, ~~~
43 | }
44 | \examples{
45 | 
46 | ## http://m.douban.com/movie/subject/11627047/reviews
47 | \dontrun{longmen<-get.movie.review(movieid=11627047,fresh=20,results=300)}
48 | }
49 | 
50 | \keyword{movie}
51 | \keyword{review}
52 | \keyword{douban}
53 | 


--------------------------------------------------------------------------------
/man/user_book_viz.Rd:
--------------------------------------------------------------------------------
 1 | \encoding{UTF-8}
 2 | \name{user_book_viz}
 3 | \alias{user_book_viz}
 4 | %- Also NEED an '\alias' for EACH other topic documented here.
 5 | \title{
 6 | 豆瓣用户阅读统计及可视化函数
 7 | }
 8 | \description{
 9 | 豆瓣用户阅读统计的可视化函数,统计阅读量、标签云图、封面拼图等。
10 | }
11 | \usage{
12 | user_book_viz(x, YEAR = "2013", stopwords = stopwords, back = FALSE)
13 | }
14 | %- maybe also 'usage' for other objects documented here.
15 | \arguments{
16 |   \item{x}{有函数\code{user_movie_status}产生的数据}
17 |   \item{YEAR}{年份，比如"2013",如果为NULL，则统计所有数据}
18 |   \item{stopwords}{\code{Package:Rwordseg}分词时使用的中文停止词}
19 |   \item{back}{如果为TRUE，则为图片添加背景图}
20 | }
21 | \details{
22 | %%  ~~ If necessary, more details than the description above ~~
23 | }
24 | \value{
25 | 产生以下图形：
26 | \item{1.}{阅读综合统计}
27 | \item{2.}{每月统计(本,页数)}
28 | \item{3.}{根据书籍标签归类}
29 | \item{4.}{读书标签云图}
30 | \item{5.}{短评云图}
31 | \item{6.}{书籍名称与标签之间的关系图}
32 | \item{7.}{封面拼图}
33 | 
34 | }
35 | \references{
36 | %% ~put references to the literature/web site here ~
37 | }
38 | 
39 | \author{
40 | qxde01<\email{qxde01@gmail.com}>
41 | }
42 | 
43 | \examples{
44 | ## http://book.douban.com/people/qxde01
45 | \dontrun{
46 |    qxde<-user_book_status(userid="qxde01")
47 |    data(stopwords)
48 |    user_book_viz(x=qxde,YEAR="2013",stopwords=stopwords,back=TRUE)
49 |  }
50 | }
51 | 
52 | \keyword{ douban }
53 | \keyword{ book }
54 | 


--------------------------------------------------------------------------------
/man/user_note_status.Rd:
--------------------------------------------------------------------------------
 1 | \encoding{UTF-8}
 2 | \name{user_note_status}
 3 | \alias{user_note_status}
 4 | %- Also NEED an '\alias' for EACH other topic documented here.
 5 | \title{
 6 | 获取豆瓣用户的日记
 7 | }
 8 | \description{
 9 | 获取豆瓣用户的日记. 
10 | }
11 | \usage{
12 | user_note_status(userid,count=10,verbose=TRUE)
13 | }
14 | %- maybe also 'usage' for other objects documented here.
15 | \arguments{
16 |   \item{userid}{豆瓣用户id}
17 |   \item{count}{每页显示的日记篇数，默认10}
18 |   \item{verbose}{若为\code{TRUE}则显示详细过程信息,否则不显示，默认\code{TRUE}}
19 | }
20 | \details{
21 | %%  ~~ If necessary, more details than the description above ~~
22 | }
23 | \value{一个data.frame，各列为：
24 | 
25 |   \item{title }{日记的标题}
26 |   \item{published}{日记发表时间}
27 |   \item{note}{日记内容}
28 |   \item{liked}{被喜欢的次数}
29 |   \item{recommend}{被推荐的次数}
30 |   \item{url}{日记的网址}
31 | }
32 | 
33 | \references{
34 | %% ~put references to the literature/web site here ~
35 | }
36 | \author{
37 | qxde01<\email{qxde01@gmail.com}>
38 | }
39 | \note{
40 | %%  ~~further notes~~
41 | }
42 | 
43 | %% ~Make other sections like Warning with \section{Warning }{....} ~
44 | 
45 | \seealso{
46 | %% ~~objects to See Also as \code{\link{help}}, ~~~
47 | }
48 | \examples{
49 | ## http://www.douban.com/people/qxde01/notes
50 | ## 作者没发表过日记，没有任何内容
51 | \dontrun{qxde<-user_note_status(userid="qxde01")}
52 | }
53 | % Add one or more standard keywords, see file 'KEYWORDS' in the
54 | % R documentation directory.
55 | \keyword{ douban }
56 | \keyword{ note}% __ONLY ONE__ keyword per line
57 | 


--------------------------------------------------------------------------------
/man/user_book_status.Rd:
--------------------------------------------------------------------------------
 1 | \encoding{UTF-8}
 2 | \name{user_book_status}
 3 | \alias{user_book_status}
 4 | %- Also NEED an '\alias' for EACH other topic documented here.
 5 | \title{
 6 | 豆瓣用户的读书信息
 7 | }
 8 | \description{
 9 | 豆瓣用户的读书信息，包括已读书籍、发表书评、笔记等。
10 | }
11 | \usage{
12 | user_book_status(userid,verbose=TRUE,front=TRUE)
13 | }
14 | %- maybe also 'usage' for other objects documented here.
15 | \arguments{
16 |   \item{userid}{豆瓣用户id}
17 |   \item{verbose}{若为\code{TRUE}则显示详细过程信息,否则不显示，默认\code{TRUE}}
18 |   \item{front}{若为TRUE,则下载已读书籍的封面图片,否则不下载}
19 | }
20 | \details{
21 | %%  ~~ If necessary, more details than the description above ~~
22 | }
23 | \value{一个list，各项为：
24 | 
25 |   \item{collect_tags }{\code{data.frame},已读书籍的标签}
26 |   \item{collect_df}{\code{data.frame},已读书籍的信息}
27 |   \item{do_tags}{\code{data.frame},正在阅读书籍的标签}
28 |   \item{do_df}{\code{data.frame},正在阅读书籍的信}
29 |   \item{wish_tags}{\code{data.frame},想读书籍的标签}
30 |   \item{wish_df}{\code{data.frame},想读书籍的信息}
31 |   \item{reviews}{\code{data.frame},发表的书评,各列同\code{get_book_reviews}}
32 |   \item{notes}{\code{data.frame},发表的读书笔记,各列同\code{get_book_notes}}
33 |   \item{collect_images}{\code{list},已读书籍的封面图片,分辨率为\code{60x80}}
34 | }
35 | 
36 | \author{
37 | qxde01<\email{qxde01@gmail.com}>
38 | }
39 | 
40 | \examples{
41 | ## http://book.douban.com/people/qxde01
42 | \dontrun{qxde<-user_book_status(userid="qxde01")}
43 | }
44 | % Add one or more standard keywords, see file 'KEYWORDS' in the
45 | % R documentation directory.
46 | \keyword{ douban }
47 | \keyword{ book }% __ONLY ONE__ keyword per line
48 | 


--------------------------------------------------------------------------------
/R/user_status.R:
--------------------------------------------------------------------------------
 1 | .statuses<-function(p){
 2 |   .extract<-function(x){
 3 |     a<-unlist(xmlToList(x))
 4 |     created_at<-a['div.div.div.span..attrs.title']
 5 |     saying<-a['div.div.blockquote.p']
 6 |     saying<-ifelse(is.na(saying),a['div.div.div.blockquote.p'],saying)
 7 |     reply<-gsub('[^0-9]','',a[grep('回应',a)][1])
 8 |     out<-c(created_at,saying,reply)
 9 |     names(out)<-c('created_at','saying','reply')
10 |     return(out)   
11 |   }
12 |   
13 |   n1<-getNodeSet(p,'//div[@class="status-item"]')
14 |   data_sid<-sapply(n1,function(x) xmlGetAttr(x, "data-sid"))
15 |   data_target_type<-sapply(n1,function(x) xmlGetAttr(x, "data-target-type"))
16 |   data_object_kind<-sapply(n1,function(x) xmlGetAttr(x, "data-object-kind"))
17 |   
18 |   n2<-getNodeSet(p,'//div[@class="status-item"]//div[@class="text"]')
19 |   txt<-gsub('\n|   ','',sapply(n2,xmlValue))
20 |   n3<-getNodeSet(p,'//div[@class="status-item"]//div[@class="block block-subject"]')
21 |   
22 |   txt<-gsub('\n|   |> 我来回应','',sapply(n1,xmlValue))
23 |   txt<-gsub('          ','',txt)
24 |   tmp0<-t(sapply(n1,.extract))
25 |   tmp<-cbind(data_sid,data_object_kind,data_target_type,txt,tmp0)
26 |   tmp<-as.data.frame(tmp,stringsAsFactors=F)
27 |   return(tmp)
28 | }
29 | ################################
30 | user_status<-function(userid,results=200){
31 |   pages<-results/20
32 |   df<-c()
33 |   for(pg in 1:pages){
34 |     u<-paste0('http://www.douban.com/people/',userid,'/statuses?p=',pg)
35 |     cat(pg,' Getting saying from:',u,'...\n')
36 |     p<-.refreshURL(u)
37 |     if(!is.null(p)){
38 |       df0<-.statuses(p)
39 |       df<-rbind(df,df0)
40 |     }
41 |   }
42 |   return(df)
43 | }
44 | ##qxde<-user_status(userid='qxde01',results=200)
45 | 


--------------------------------------------------------------------------------
/man/user_movie_status.Rd:
--------------------------------------------------------------------------------
 1 | \encoding{UTF-8}
 2 | \name{user_movie_status}
 3 | \alias{user_movie_status}
 4 | %- Also NEED an '\alias' for EACH other topic documented here.
 5 | \title{
 6 | 豆瓣用户的观影信息
 7 | }
 8 | \description{
 9 | 豆瓣用户的观影信息，包括一看影视记录、发表影评等。
10 | }
11 | \usage{
12 | user_movie_status(userid,verbose=TRUE,front=TRUE)
13 | }
14 | %- maybe also 'usage' for other objects documented here.
15 | \arguments{
16 |   \item{userid}{豆瓣用户id}
17 |   \item{verbose}{若为\code{TRUE}则显示详细过程信息,否则不显示，默认\code{TRUE}}
18 |   \item{front}{\code{TRUE},则下载电影海报图片,否则不下载}
19 | }
20 | \details{
21 | %%  ~~ If necessary, more details than the description above ~~
22 | }
23 | \value{一个list，各项为：
24 | 
25 |   \item{collect_tags }{\code{data.frame},已观看影视的标签}
26 |   \item{collect_df}{\code{data.frame},已观看影视的信息}
27 |   \item{do_tags}{\code{data.frame},正在观看影视的标签}
28 |   \item{do_df}{\code{data.frame},正在观看影视的信息}
29 |   \item{wish_tags}{\code{data.frame},想观看影视的标签}
30 |   \item{wish_df}{\code{data.frame},想观看影视的信息}
31 |   \item{reviews}{\code{data.frame},发表的影评,各列同\code{get_movie_reviews}}
32 |   \item{collect_images}{\code{list},已观看影视的海报图片,分辨率为\code{60x80}}
33 | }
34 | \references{
35 | %% ~put references to the literature/web site here ~
36 | }
37 | \author{
38 | qxde01<\email{qxde01@gmail.com}>
39 | }
40 | \note{
41 | %%  ~~further notes~~
42 | }
43 | 
44 | %% ~Make other sections like Warning with \section{Warning }{....} ~
45 | 
46 | \seealso{
47 | %% ~~objects to See Also as \code{\link{help}}, ~~~
48 | }
49 | \examples{
50 | ## http://movie.douban.com/people/qxde01
51 | \dontrun{qxde<-user_movie_status(userid="qxde01")}
52 | }
53 | % Add one or more standard keywords, see file 'KEYWORDS' in the
54 | % R documentation directory.
55 | \keyword{ douban }
56 | \keyword{ movie }% __ONLY ONE__ keyword per line
57 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Rdouban
 2 | =======
 3 | ### 获取豆瓣网评论数据的R接口，基于RCurl和XML。
 4 | > #### 非官方接口
 5 | * get_book_info	获取豆瓣图书基本信息
 6 | * get_book_reviews	获取豆瓣图书的评论文章
 7 | * get_book_discussions	获取豆瓣图书的话题讨论内容
 8 | * get_book_notes	获取豆瓣图书的读书笔记
 9 | * get_movie_info	获取豆瓣影视的基本信息
10 | * get_movie_reviews	获取豆瓣电影的长篇影评
11 | * get_movie_comments	获取豆瓣电影的短评
12 | * get_movie_discussions	获取关于某个豆瓣电影的话题讨论内容
13 | * get_music_info	获取豆瓣音乐的专辑信息
14 | * get_music_reviews	获取豆瓣音乐评论信息.
15 | * get_music_discussions	获取豆瓣音乐专辑的话题讨论内容
16 | * user_status 获取用户的说说
17 | * user_note_status 获取用户的日记
18 | * user_book_status 获取用户的读书信息
19 | * user_book_viz 对用户的读书信息进行统计和可视化
20 | * user_movie_status 获取用户的影视观看信息
21 | * user_movie_viz 对用户的观影信息进行统计和可视化
22 | 
23 | > #### 下面的函数是上述部分函数的重写，部分采用了官方API
24 | * get.book.info  豆瓣图书信息API V2
25 | * get.movie.info  豆瓣电影信息API V2
26 | * get.music.info  豆瓣音乐信息API V2
27 | * get.book.review  豆瓣图书评论信息,部分采用API V1
28 | * get.movie.review  豆瓣电影长篇评论,通过移动网页m.douban.com,高频率访问会被豆瓣阻止
29 | * get.movie.comment  豆瓣电影短评,通过移动网页m.douban.com,高频率访问会被豆瓣阻止
30 | 
31 | >### 安装
32 | ```
33 | library("devtools")
34 | install_github("Rdouban","qxde01")
35 | ```
36 | >### 例子
37 | 获取作者qxde01的阅读信息，并统计分析和可视化。注意，需要通过`biocLite()`安装`package:EBImage`。
38 | 两篇相关的博文见 豆瓣阅读统计及可视化[上](http://qxde01.blog.163.com/blog/static/6733574420132915952828/),[下](http://qxde01.blog.163.com/blog/static/673357442013355192638/)。博文与函数`user_book_viz`的具体实现是有差异的。
39 | 
40 | ```
41 | library(Rdouban)
42 | qxde<-user_book_status(userid="qxde01")
43 | data(stopwords) ## 中文停止词
44 | ## 生成用户qxde01的2013年阅读信息可视化图形
45 | user_book_viz(x=qxde,YEAR="2013",stopwords=stopwords,back=TRUE)
46 | ``` 
47 | 
48 | >### windows下安装
49 | 在windows下的中文编码至今没有搞定，**坑爹的windows，坑娘的微软**，安装编译时会出错，主要是因为帮助用中文编写，在R/etc/Rprofile.site 添加`options(encoding="UTF-8")`,使用RStudio可以编译通过，查看帮助会有乱码，使用浏览器查看有部分是乱码（需要手动修改浏览器编码查看方式）。在Linux下没有问题。
50 | 
51 | >### 拼接后的效果图
52 | ![拼接后的效果图](inst/images/big.png)


--------------------------------------------------------------------------------
/R/get.movie.comment.R:
--------------------------------------------------------------------------------
 1 | ## movieid:movie id of douban
 2 | ## results:get the number of comments
 3 | 
 4 | get.movie.comment<-function(movieid,fresh=20,results=100,verbose=TRUE){
 5 |   u=paste0("http://m.douban.com/movie/subject/",movieid,"/comments")
 6 |   p<-htmlParse(postForm(u))
 7 |   ## 评论总数
 8 |   total<-sapply(getNodeSet(p, '//div[@class="title"]//span'), xmlValue)
 9 |   total<-as.integer(gsub("[^0-9]","",total))
10 |   cat("----------There are a total of ",total," short comments.----------\n")
11 |   results<-min(total,results)
12 |   ## 评论屏数
13 |   pages<-ceiling(results/20)
14 |   out<-data.frame(matrix(nrow=pages*20,ncol=4),stringsAsFactors=F)
15 |   colnames(out)<-c("author","author_id","comment","rating")
16 |   nr=1
17 |   for(pg in 1:pages){
18 |     #http://m.douban.com/movie/subject/11627047/comments?page=2&session=4c4f1371
19 |     u=paste0("http://m.douban.com/movie/subject/",movieid,"/comments?page=",pg)
20 |     if(verbose==TRUE){
21 |       cat("Retrieving comments of page ",pg, " from: ",u," .....\n")
22 |     }
23 |     p<- tryCatch(.refreshForm(u,fresh,verbose),error = function(e){NULL})
24 |     
25 |     author<-sapply(getNodeSet(p,'//div[@class="item"]//a[@class="founder"]'),
26 |                    xmlValue)
27 |     author_id<-sapply(getNodeSet(p,'//div[@class="item"]//a[@class="founder"]'),
28 |                       function(x) xmlGetAttr(x, "href"))
29 |     author_id<-unlist(strsplit(author_id,"/movie/people/|/\\?session"))
30 |     author_id<-author_id[-grep("=",author_id)]
31 |     author_id<-author_id[nchar(author_id)>0]
32 |     n1<-getNodeSet(p,'//div[@class="list"]//div[@class="item"]//span')
33 |     tmp<-sapply(n1,xmlValue)
34 |     tmp<-gsub("[\n ]","",tmp)[1:(length(tmp)-3)] 
35 |     cmt<-tmp[seq(1,length(tmp),2)]
36 |     rating<-tmp[seq(2,length(tmp),2)]
37 |     #cat(rating,'\n')
38 |     rating<-gsub("[^\\(0-5星]","",rating)
39 |     #gsub("[^0-5][^星]","",rating)
40 |     rating<-gsub("[0-5]{1,10}\\(","",rating)
41 |     rating<-gsub("[\\(星]","",rating)
42 |     out0<-cbind(author=author,author_id=author_id,comment=cmt,rating=rating)
43 |     nr2=nrow(out0)
44 |     out[nr:(nr+nr2-1),]<-out0
45 |     nr=nr+nr2
46 |   }
47 |   return(out)
48 | }


--------------------------------------------------------------------------------
/R/user_note_status.R:
--------------------------------------------------------------------------------
 1 | ##################
 2 | #' @param u:日记地址
 3 | #'
 4 | .get_user_note<-function(u){
 5 |   p<-.refreshURL(u)
 6 |   title<-sapply(getNodeSet(p, '//div[@class="note-header"]//h1'),xmlValue)
 7 |   published<-sapply(getNodeSet(p, '//div[@class="note-header"]//span'),xmlValue)
 8 |   liked<-sapply(getNodeSet(p, '//span[@class="fav-num"]//a'),xmlValue)
 9 |   liked<-gsub('[^0-9]','',liked)
10 |   recommend<-sapply(getNodeSet(p, '//span[@class="rec-num"]'),xmlValue)
11 |   recommend<-gsub('[^0-9]','',recommend)
12 |   if(length(recommend)==0)recommend<-NA
13 |   if(length(liked)==0)liked<-NA
14 |   note<-sapply(getNodeSet(p, '//div[@class="note"]'),xmlValue)
15 |   note<-note[nchar(note)>0]
16 |   out<-c(title,published,note,liked,recommend,u)
17 |   return(out)
18 | }
19 | #############################################################
20 | ## 获取用户的日记
21 | #'@param userid
22 | #'@param count
23 | #'@param verbose
24 | 
25 | user_note_status<-function(userid,count=10,verbose=TRUE){
26 |   u<-paste0("http://www.douban.com/people/",userid,"/notes")
27 |   p<-.refreshURL(u)
28 |   pages<-sapply(getNodeSet(p, '//body//div[@class="paginator"]//a'),xmlValue)
29 |   if(length(pages)==0){
30 |     pages=1
31 |   }
32 |   else{
33 |     pages<-as.integer(pages[length(pages)-1])
34 |   }
35 |   cat("\n--------There is a total of ",pages," pages about notes.--------\n")
36 |   href<-c()
37 |   for(pg in 1:pages){
38 |     u = paste0("http://www.douban.com/people/", userid, 
39 |                "/notes?start=", (pg - 1) * count)
40 |     cat("Getting note URLs from page ", pg, ": \n  ", u, " ...\n")
41 |     p<-.refreshURL(u)
42 |     href0<-sapply(getNodeSet(p, '//body//div//span[@class="wrap"]//a'),function(x) xmlGetAttr(x, "href"))
43 |     href0<-unique(href0[grep('/note/',href0)])
44 |     href<-c(href,href0)
45 |   }
46 |   total<-length(href)
47 |   cat("\n--------There is a total of ",total," notes.--------\n")
48 |   df<-data.frame(matrix(nrow=total,ncol=6),stringsAsFactors=F)
49 |   colnames(df)<-c("title","published","note","liked","recommend",'url')
50 |   if(total>0){
51 |     for(i in 1:total){
52 |       u=href[i]
53 |       if(verbose==TRUE){cat("  Getting note from URL: ",i,',', u, " ...\n")}
54 |       df[i,]<-.get_user_note(u)  
55 |     }
56 |   }
57 |   return(df)
58 | }
59 | 
60 | 
61 | 


--------------------------------------------------------------------------------
/R/get_music_discussions.R:
--------------------------------------------------------------------------------
 1 | ##http://music.douban.com/subject/3843530/discussion/
 2 | ##musicid=3843530
 3 | .get_music_discussion0<-function(u,fresh,verbose){
 4 |   .get_movie_discussion0(u,fresh,verbose)
 5 | }
 6 | #################################################################
 7 | get_music_discussions<-function(musicid,results = 100, fresh = 10,count=20, verbose = TRUE,...){
 8 |   u = paste0("http://music.douban.com/subject/", musicid, "/discussion/")
 9 |   p <- .refreshURL(u, fresh, verbose)
10 |   total<-gsub('[^0-9]','',sapply(getNodeSet(p, '//span[@class="count"]'),xmlValue))
11 |   if (length(total)==0)
12 |     stop('There is no discussions about this music.')
13 |   cat('-----There is a tatal of ',total,' music discussions.-----\n')
14 |   pages<-ceiling(min(results,as.integer(total))/count)
15 |   out <- data.frame(matrix(nrow = pages * count, ncol = 8), stringsAsFactors = F)
16 |   colnames(out) <- c("dicussion_uri", "title", "published", "author", "author_uri", 
17 |                      "dicussion", "useful", "unuseful")
18 |   k=1
19 |   if(pages>0){
20 |     for(pg in 1:pages){
21 |       u=paste0('http://music.douban.com/subject/',musicid,
22 |                '/discussion/?start=',(pg-1)*count,'&sort=vote/')
23 |       if(verbose==TRUE) {
24 |         #cat('Getting',(pg-1)*20+1,'--',pg*20,'discussions...\n')
25 |         cat("Getting music discussion URLS from page=",pg,": ",u,"...\n")
26 |       }
27 |       p <- .refreshURL(u, fresh, verbose)
28 |       n1<-getNodeSet(p ,'//table[@class="olt"]//td/a')
29 |       href<-unique(sapply(n1,function(x) xmlGetAttr(x, "href")))
30 |       href<-href[grep('/discussion/',href)]
31 |       href <- href[!href %in% out$dicussion_uri]
32 |       n=length(href)
33 |       if(n>0){
34 |         for(i in 1:n){
35 |           u0<-href[i]
36 |           if(verbose==TRUE){
37 |             cat("   Getting ", k, " movie discussion from URL: ", u0, " ...\n")
38 |           }
39 |           out0<-.get_music_discussion0(u=u0,fresh,verbose)
40 |           if(length(out0)==8){
41 |             out[k,]<-out0
42 |             k=k+1
43 |           }
44 |           else{
45 |             cat("  !!!! Getting  failed at URL: ", u0, " \n")
46 |           }       
47 |         }
48 |       }
49 |     }
50 |   }
51 |   out <- out[!is.na(out[, 1]), ]
52 |   return(out)
53 | }
54 | 
55 | #a<-get_music_discussions(musicid=3843530,results = 100)


--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
 1 | ###################################
 2 | ## @u:url
 3 | ## @fresh:refresh number
 4 | 
 5 | .refreshURL <- function(u, fresh = 15, ssl.verifypeer = TRUE,verbose = TRUE) {
 6 |   # p<-tryCatch(htmlParse(getURL(u)),error = function(e){NULL})
 7 |   p <- tryCatch(getURL(u,ssl.verifypeer=ssl.verifypeer), error = function(e) {
 8 |     print(e)
 9 |     return("NULL")
10 |   })
11 |   n = nchar(p)
12 |   k = 1
13 |   while (n < 500) {
14 |     if (verbose == TRUE) {
15 |       #cat("%%%%% 第 ", k, " 次重新请求URL:", u, ".....\n")
16 |       cat("%%%%% \u7b2c ", k, " \u6b21\u91cd\u65b0\u8bf7\u6c42URL:", u, ".....\n")
17 |     }
18 |     p <- tryCatch(getURL(u,ssl.verifypeer=ssl.verifypeer), error = function(e) {
19 |       print(e)
20 |       return("NULL")
21 |     })
22 |     n = nchar(p)
23 |     if (k > 3) {
24 |       Sys.sleep(rnorm(k, 3))
25 |     }
26 |     if (k > fresh) {
27 |       break
28 |     }
29 |     k = k + 1
30 |   }
31 |   if (verbose == TRUE) {
32 |     if (!is.null(p) & k > 1) {
33 |       ## cat("***** 第 ", k - 1, " 次请求成功！\n")
34 |       cat("***** \u7b2c ", k - 1, " \u6b21\u8bf7\u6c42\u6210\u529f\uff01\n")
35 |     }
36 |   }
37 |   if(ssl.verifypeer==FALSE){
38 |     return(p)
39 |   }
40 |   else{
41 |     p <- tryCatch(htmlParse(p), error = function(e) {
42 |       print(e)
43 |       return(NULL)
44 |     })
45 |     return(p)
46 |   }
47 | }
48 | ###################################
49 | ## @u:url
50 | ## @fresh:refresh number
51 | 
52 | .refreshForm<-function(u,fresh=10,verbose=TRUE){
53 |   p<-tryCatch(htmlParse(postForm(u)),error = function(e){NULL})
54 |   #p<-tryCatch(postForm(u),error = function(e){print(e);return("NULL")})
55 |   n=nchar(p)
56 |   k=1
57 |   while(is.null(p)){
58 |     if(verbose==TRUE){
59 |       ## cat("%%%%% 第 ",k," 次重新请求URL:",u,".....\n")
60 |       cat("%%%%% \u7b2c ",k," \u6b21\u91cd\u65b0\u8bf7\u6c42URL:",u,".....\n")
61 |     }
62 |     p<-tryCatch(htmlParse(postForm(u)),error = function(e){print(e);return(NULL)})
63 |     #saveXML(p,file=paste0(k,gsub("http:|/|\\?","_",u),".html"))
64 |     if(k>3){Sys.sleep(rnorm(k,3))}
65 |     if(k>fresh) {break}
66 |     k=k+1
67 |   }
68 |   if(verbose==TRUE){
69 |     if(!is.null(p) & k>1){
70 |       ##cat("***** 第 ",k-1," 次请求成功！\n")
71 |       cat("***** \u7b2c ",k-1," \u6b21\u8bf7\u6c42\u6210\u529f\uff01\n")
72 |     }
73 |   }
74 |   #saveXML(p,file=paste0(k,gsub("http:|/|\\?","_",u),".html"))
75 |   return(p)
76 | }


--------------------------------------------------------------------------------
/R/get_movie_comments.R:
--------------------------------------------------------------------------------
 1 | 
 2 | get_movie_comments<-function(movieid,results=100,fresh=10,verbose=TRUE,...){
 3 |   
 4 |   u=paste0('http://movie.douban.com/subject/',movieid,'/comments')
 5 |   p<- .refreshURL(u,fresh,verbose)
 6 |   
 7 |   total<-sapply(getNodeSet(p, '//body//span[@class="total"]'),xmlValue)
 8 |   total<-as.integer(gsub("[^0-9]","",total))
 9 |   cat('------------There is a total of ',total,' short comments.--------\n')
10 |   
11 |   results<-min(total,results)
12 |   pages<-ceiling(results/20)
13 |   out<-data.frame(matrix(nrow=pages*20,ncol=6),stringsAsFactors=F)
14 |   colnames(out)<-c("author","author_uri","published" ,"comment" ,"votes", "rating")
15 |   kind=1
16 |   for(pg in 1:pages){
17 |     if(verbose==TRUE){
18 |       cat(' Getting short comments from ',(pg-1)*20+1,'--',pg*20,'...\n')
19 |     }
20 |     u=paste0('http://movie.douban.com/subject/',movieid,'/comments?start=',(pg-1)*20+1,'&limit=20&sort=new_score')
21 |     p<- .refreshURL(u,fresh,verbose)
22 |     ## 短评内容
23 |     comment<-gsub('\n| ','',sapply(getNodeSet(p, '//div[@class="comment"]//p'),xmlValue))
24 |     ## 评论时间
25 |     publised<-sapply(getNodeSet(p, '//div[@class="comment"]//span[@class=""]'),xmlValue)
26 |     publised<-gsub("[\n ]","",publised)
27 |     ##作者及其URI
28 |     n1<-getNodeSet(p, '//div[@class="comment"]//span[@class="comment-info"]//a')
29 |     author<-sapply(n1,xmlValue)
30 |     author_uri<-sapply(n1,function(x) xmlGetAttr(x, "href"))
31 |     ##有用的票数
32 |     votes<-sapply(getNodeSet(p, '//div[@class="comment"]//span[@class="comment-vote"]//span'),xmlValue)
33 |     #votes<-gsub("[^0-9]","",votes)
34 |     ##评分
35 |     #n2<-getNodeSet(p, '//div[@class="comment"]//span[@class="comment-info"]//span')
36 |     n2<-getNodeSet(p, '//div[@class="comment"]//span[@class="comment-info"]')
37 |     mm=length(n2)
38 |     rating<-c()
39 |     for(jj in 1:mm){
40 |       a=xmlSApply(n2[[jj]],xmlAttrs)
41 |       a=gsub('[a-z]','',a$span[['class']])
42 |       rating<-c(rating,a)
43 |       #cat(jj,a,'\n')
44 |     }
45 | 
46 |     out0<-cbind(author=author,author_uri=author_uri,
47 |                publised=publised,comment=comment,
48 |                votes=votes,rating=rating)  
49 |    nr=nrow(out0)
50 |     out[kind:(kind+nr-1),]<-out0
51 |     kind<-kind+nr
52 |   }
53 |   out<-out[!is.na(out[,1]),]
54 |   return(out)
55 | }
56 |   
57 | 
58 | #http://movie.douban.com/subject/5308265/comments
59 | #x=get_movie_comments(movieid=5308265,results=100)
60 | 


--------------------------------------------------------------------------------
/R/get_music_info.R:
--------------------------------------------------------------------------------
 1 | ##http://music.douban.com/subject/3843530/
 2 | ##musicid=3843530
 3 | get_music_info<-function(musicid,...){
 4 |   u=paste0('http://music.douban.com/subject/',musicid,'/')
 5 |   p<-.refreshURL(u)
 6 |   pa<-'[\n ]|\\(\u8c46\u74e3|\\)' ##[\n ]|\\(豆瓣|\\)
 7 |   Encoding(pa)<-"UTF-8"
 8 |   title<-gsub(pa,'',sapply(getNodeSet(p, '//head//title'),xmlValue))
 9 |   attribute<-gsub('[\n]|   ','',sapply(getNodeSet(p, '//div[@id="info"]'),xmlValue))
10 |   author<-sapply(getNodeSet(p, '//div[@id="info"]//span[@class="pl"]//a'),xmlValue)[1]
11 |   ##
12 |   song<-sapply(getNodeSet(p, '//ul[@class="song-items"]//div[@data-title]'),xmlValue)
13 |   track<-sapply(getNodeSet(p, '//ul[@class="song-items"]//span[@class="n_doulists unfoldable"]'),
14 |                      xmlValue)
15 |   songs<-data.frame(title=song,track=track,stringsAsFactors=F)
16 |   ##
17 |   summary <-gsub('[\n ]','',sapply(getNodeSet(p, '//span[@class="all hidden"]'),xmlValue))
18 |   if(length(summary )==0)
19 |     summary <-gsub('[\n ]','',sapply(getNodeSet(p, '//span[@property="v:summary"]'),xmlValue))
20 |   ##
21 |   
22 |   labels<-gsub('[\n ]','',
23 |                sapply(getNodeSet(p, '//div[@id="db-tags-section"]//div'),xmlValue))
24 |   labels<-unlist(strsplit(labels,'\\(|\\)'))
25 |   tags<-data.frame( tag_label=labels[seq(1,length(labels)-1,2)],
26 |                     tag_freq=as.integer(labels[seq(2,length(labels),2)]),
27 |                     stringsAsFactors=F)
28 |  # tags$tag_label<-gsub("<U+00A0>","",tags$tag_label)
29 | 
30 |   rating<-gsub('[\n ]','',sapply(getNodeSet(p, '//div[@id="interest_sectl"]'),xmlValue))
31 |   rating<-gsub("\u4eba\u8bc4\u4ef7",'',rating) #\u4eba\u8bc4\u4ef7 人评价
32 |   rating<-as.numeric(unlist(strsplit(rating,'\\(|\\)|%')))
33 |   rating[-c(1,2)]<-rating[-c(1,2)]/100
34 |   if(length(rating)==7)
35 |     names(rating)<-c('average','votes','stars5','stars4','stars3','stars2','stars1')
36 |   ##
37 |   comments_total<-gsub('[^0-9]','',sapply(getNodeSet(p, '//div//p[@class="pl"]//a'),xmlValue))[1]
38 |   listeners<-sapply(getNodeSet(p, '//div[@id="collector"]//p//a'),xmlValue)
39 |   listeners<-gsub("[^0-9]","",listeners)
40 |   names(listeners)<-c('doings','collections','wishes')
41 | 
42 |   large <- sapply(getNodeSet(p, '//div[@id="mainpic"]//a'), 
43 |                    function(x) xmlGetAttr(x, "href"))
44 |   medium <- sapply(getNodeSet(p, '//div[@id="mainpic"]//img'), 
45 |                   function(x) xmlGetAttr(x, "src"))
46 |   image <- c(medium = medium, large = large)
47 |   list(title=title,
48 |        author=author,rating=rating,
49 |        summary=summary,
50 |        tags=tags, songs=songs,
51 |        href=u,image=image,
52 |        comments_total=comments_total,
53 |        listeners=listeners,attribute=attribute)
54 | }
55 | 
56 | ##x<-get_music_info(musicid=1404439)


--------------------------------------------------------------------------------
/R/get_book_reviews.R:
--------------------------------------------------------------------------------
 1 | 
 2 | .get_book_review0<-function(u,fresh=10,verbose=TRUE,...){
 3 |   p<-.refreshURL(u=u,fresh,verbose)
 4 |   ## title
 5 |   title<-sapply(getNodeSet(p, '//title'), xmlValue)
 6 |   title<-gsub("[\n ]","",title)
 7 |   ## 作者及其URI
 8 |   n1 <- getNodeSet(p, '//div[@class="article"]//div[@class="piir"]//a')[1]
 9 |   author<-sapply(n1, xmlValue)
10 |   author_uri<-sapply(n1, function(x) xmlGetAttr(x, "href"))  
11 |   ##评分
12 |   rating<-sapply(getNodeSet(p, '//span[@property="v:rating"]'), xmlValue)
13 |   ##评论发表时间
14 |   published<-sapply(getNodeSet(p, '//span[@property="v:dtreviewed"]'), xmlValue)
15 |   ##评论内容
16 |   review<-sapply(getNodeSet(p, '//span[@property="v:description"]'), xmlValue)
17 |   review<-gsub("\r","",review)
18 |   ## useful and unuseful
19 |   useful<-sapply(getNodeSet(p ,'//span[@class="useful"]//em'), xmlValue)
20 |   unuseful<-sapply(getNodeSet(p ,'//span[@class="unuseful"]//em'), xmlValue)
21 |   out<-c(review_uri=u,title=title,published=published,author=author,
22 |          author_uri=author_uri,review=review, rating=rating,
23 |          useful=useful,unuseful=unuseful) 
24 |   return(out)
25 | }
26 | 
27 | get_book_reviews<-function(bookid,results=100,fresh=10,count=25,verbose=TRUE,...){
28 |   u=paste0('http://book.douban.com/subject/',bookid,'/reviews')  
29 |   p<-.refreshURL(u,fresh,verbose)
30 |   total<-sapply(getNodeSet(p, '//span[@class="count"]'), xmlValue)
31 |   total<-as.integer(gsub("[^0-9]","",total))
32 |   pages<-ceiling(min(results,total)/count)
33 |   
34 |   cat('\n----------------There is a total of',total,'reviews.----------------\n\n')
35 |   ## 预定义输出dataFrame大小
36 |   out <- data.frame(matrix(nrow = pages * count, ncol = 9), stringsAsFactors = F)
37 |   colnames(out) <- c("review_uri", "title", "published", "author", "author_uri", "review", 
38 |                      "rating", "useful", "unuseful")
39 |   ## out nrow index
40 |   k = 1
41 |   
42 |   for(pg in 1:pages){
43 |     #cat(' Getting',(pg-1)*25+1,'--',pg*25,'comments...\n')
44 |     u=paste0('http://book.douban.com/subject/',bookid,'/reviews?score=&start=',
45 |              (pg-1)*count)
46 |     cat("Getting review URLs from page ", pg, ": ", u, " ...\n")
47 |     p<-.refreshURL(u,fresh,verbose)
48 |     n1<-getNodeSet(p, '//div[@class="ctsh"]//div//a')
49 |     href<-sapply(n1, function(x) xmlGetAttr(x, "href"))
50 |     href<-unique(href[grep("/review/",href)])
51 |     href <- href[!href %in% out$review_uri]
52 |     n<-length(href)
53 |     if(n>0){
54 |       for(i in 1:n){
55 |         u0<-href[i]
56 |         if (verbose == TRUE) {
57 |           cat("   Getting ", k, " movie review from URL: ", u0, " ...\n")
58 |         }
59 |         out0<-.get_book_review0(u=u0,fresh,verbose)
60 |         if(length(out0)==9){
61 |           out[k,]<-out0
62 |           k<-k+1
63 |         } else {
64 |           cat("  !!!! Getting  failed at URL: ", u0, " \n")
65 |         }    
66 |       }
67 |       
68 |     }
69 |   }
70 | 
71 |   out <- out[!is.na(out[, 1]), ]
72 |   return(out)
73 | }
74 | 
75 | 


--------------------------------------------------------------------------------
/R/get_book_notes.R:
--------------------------------------------------------------------------------
 1 | ## get some infomation of a note by note url
 2 | .get_book_note0<-function(u,fresh,verbose,...){
 3 |   p<-.refreshURL(u,fresh, verbose)
 4 |   title<-gsub('[\n ]','',sapply(getNodeSet(p, '//title'), xmlValue))
 5 |   n1<-getNodeSet(p, '//div[@class="article"]//div[@class="info"]//a')
 6 |   author<-sapply(n1,xmlValue)[1]
 7 |   author_uri<-sapply(n1,function(x) xmlGetAttr(x, "href"))[1]
 8 |   published<-sapply(getNodeSet(p, '//span[@class="pubtime"]'), xmlValue)
 9 |   note<-sapply(getNodeSet(p, '//pre[@id="link-report"]'), xmlValue)
10 |   
11 |   rating<-sapply(getNodeSet(p, '//div[@class="mod profile clearfix"]//span[@class]'),
12 |                  function(x) xmlGetAttr(x, "class"))[2]
13 |   rating<-gsub('[^0-9]','',rating)
14 |   
15 |   v2<-sapply(getNodeSet(p, '//p[@class="pl info"]//span'), xmlValue)
16 |   v2<-gsub("[^0-9]","",v2)
17 |   readers<-v2[1]
18 |   collectors<-v2[2]
19 |   
20 |   out<-c(note_uri=u,title=title,published=published,author=author,
21 |          author_uri=author_uri,note=note,rating=rating,
22 |          readers=readers,collectors=collectors)
23 |   return(out)
24 | }
25 | ###############################################################
26 | get_book_notes<-function(bookid,results=100,fresh=10,count=10,verbose=TRUE,...){
27 |   u=paste0('http://book.douban.com/subject/',bookid,'/annotation')
28 |   p<-.refreshURL(u,fresh, verbose)
29 |   total<-sapply(getNodeSet(p, '//title'), xmlValue)
30 |   total<-gsub("^([^)(]*)","",total)
31 |   total<-as.integer(gsub("[^0-9]","",total))
32 |   cat('\n------------There is a total of',total,'notes.------------\n\n')
33 |   pages<-ceiling(min(total,results)/count)
34 |   out <- data.frame(matrix(nrow = pages * count, ncol = 9), stringsAsFactors = F)
35 |   colnames(out) <- c("note_uri", "title", "published", "author", "author_uri", "note", 
36 |                      "rating", "readers", "collectors")
37 |   ## output dataFrame nrow index
38 |   k = 1
39 |   for(pg in 1:pages){
40 |     #cat('Getting',(pg-1)*10+1,'--',pg*10,'notes...\n')
41 |     u=paste0('http://book.douban.com/subject/',bookid,
42 |              '/annotation?sort=rank&start=',(pg-1)*count)
43 |     cat("Getting note URLs from page",pg,": ",u,"...\n")
44 |     p<-.refreshURL(u,fresh, verbose)
45 |     href<-sapply(getNodeSet(p, '//div[@class="nlst"]//a[@href]'),
46 |            function(x) xmlGetAttr(x, "href"))
47 |     href<-unique(href[grep("/annotation/",href)])
48 |     href <- href[!href %in% out$note_uri]
49 |     n=length(href)
50 |     if(n>0){
51 |       for(i in 1:n){
52 |         u0<-href[i]
53 |         if(verbose==TRUE){
54 |           cat("   Getting ", k, " book note from URL: ", u0, " ...\n")
55 |         }
56 |         out0<-.get_book_note0(u=u0,fresh,verbose)
57 |         if(length(out0)==9){
58 |           out[k,]<-out0
59 |           k=k+1
60 |         }
61 |         else{
62 |           cat("  !!!! Getting  failed at URL: ", u0, " \n")
63 |         }
64 |       }
65 |     } 
66 |   }
67 |   out <- out[!is.na(out[, 1]), ]
68 |   return(out)
69 | }
70 | 


--------------------------------------------------------------------------------
/R/get_book_discussions.R:
--------------------------------------------------------------------------------
 1 | ##http://book.douban.com/subject/1291204/discussion/
 2 | ##bookid=1291204
 3 | .get_book_discussion0<-function(u,fresh,verbose,...){
 4 |   p<-.refreshURL(u,fresh, verbose)
 5 |   title<-gsub('[\n ]','',sapply(getNodeSet(p, '//head//title'),xmlValue))
 6 |   published<-sapply(getNodeSet(p, '//div[@class="article"]//span[@class="mn"]'),xmlValue)
 7 |   published<-gsub("\n|  ","", published)
 8 |   n1<-getNodeSet(p, '//div[@class="article"]//span[@class="pl2"]//a')
 9 |   author<-gsub("[\n ]","",sapply(n1,xmlValue)[1])
10 |   author_uri<-sapply(n1,function(x) xmlGetAttr(x, "href"))[1]
11 |   dicussion<-sapply(getNodeSet(p, '//div[@class="article"]//span[@class=""]')[1],xmlValue)
12 |   useful<-sapply(getNodeSet(p, '//span[@class="useful"]//em'),xmlValue)
13 |   unuseful<-sapply(getNodeSet(p, '//span[@class="unuseful"]//em'),xmlValue)
14 |   
15 |   out<-c(dicussion_uri=u,title=title,published=published,author=author,
16 |          author_uri=author_uri,dicussion=dicussion,useful=useful,unuseful=unuseful)
17 |   return(out)
18 | }
19 | ###########################################################################
20 | get_book_discussions<-function(bookid,results=100,fresh=10,count=20,verbose=TRUE){
21 |   u=paste0('http://book.douban.com/subject/',bookid,'/discussion/')
22 |   p<-.refreshURL(u,fresh, verbose)
23 |   total<-gsub("[^0-9]","", sapply(getNodeSet(p ,'//span[@class="count"]'),xmlValue))
24 |   
25 |   if (length(total)==0)
26 |     stop('There is no discussions about this book.')
27 |   cat('\n--------------There is a total of ',total,'discussions.--------------\n\n')
28 |   
29 |   pages<-ceiling(min(results,as.integer(total))/count)
30 |   out <- data.frame(matrix(nrow = pages * count, ncol = 8), stringsAsFactors = F)
31 |   colnames(out) <- c("dicussion_uri", "title", "published", "author", "author_uri", 
32 |                      "dicussion", "useful", "unuseful")
33 |   k=1
34 |   if(pages>0){
35 |     for(pg in 1:pages){
36 |       u=paste0('http://book.douban.com/subject/',bookid,
37 |                '/discussion/?start=',(pg-1)*20,'&sort=vote/')
38 |       if(verbose==TRUE) {cat(' Getting discussion URLs from URL:',u,' ...\n')}
39 |       p<-.refreshURL(u,fresh, verbose)
40 |       
41 |       n1<-getNodeSet(p ,'//table[@class="olt"]//td/a')
42 |       href<-unique(sapply(n1,function(x) xmlGetAttr(x, "href")))
43 |       href<-href[grep('/discussion/',href)]
44 |       href <- href[!href %in% out$dicussion_uri]
45 |       n=length(href)
46 |       if(n<1) next
47 |       for(i in 1:n){
48 |         u0<-href[i]
49 |         if(verbose==TRUE){
50 |           cat("   Getting ", k, " book discussion from URL: ", u0, " ...\n")
51 |         }
52 |         out0<-.get_book_discussion0(u=u0,fresh,verbose)
53 |         if(length(out0)==8){
54 |           out[k,]<-out0
55 |           k=k+1
56 |         }
57 |         else{
58 |           cat("  !!!! Getting  failed at URL: ", u0, " \n")
59 |         }       
60 |       }
61 |     }  
62 |   }
63 |   out <- out[!is.na(out[, 1]), ]
64 |   return(out)
65 | }
66 | # b<-get_book_discussions(bookid=1291204,results=100)


--------------------------------------------------------------------------------
/R/get_movie_info.R:
--------------------------------------------------------------------------------
 1 | get_movie_info<-function(movieid,...){
 2 |   u=paste0('http://movie.douban.com/subject/',movieid,'/')
 3 |   p<-.refreshURL(u)
 4 |   #pagetree<-htmlParse(getURL(strurl))
 5 |   ##base infomation
 6 |   author <-sapply(getNodeSet(p,'//div[@id="info"]//a[@rel="v:directedBy"]'),xmlValue)
 7 |   #stars <-sapply(getNodeSet(p,'//div[@id="info"]//a[@rel="v:starring"]'),xmlValue)
 8 |   #genre <-sapply(getNodeSet(p,'//div[@id="info"]//span[@property="v:genre"]'),xmlValue)
 9 |   #runtime<-sapply(getNodeSet(p,'//div[@id="info"]//span[@property="v:runtime"]'),xmlValue)
10 |   title<-sapply(getNodeSet(p, '//span[@property="v:itemreviewed"]'), xmlValue)
11 |   attribute<-gsub('[\n ]','',sapply(getNodeSet(p,'//div[@id="info"]'), xmlValue))
12 |   ##rating
13 |   rating<-sapply(getNodeSet(p, '//div[@id="interest_sectl"]'), xmlValue)
14 |   rating<-unlist(strsplit(gsub('\n','',rating),' '))
15 |   rating<-rating[nchar(rating)>0]
16 |   rating[2]<-gsub('[^0-9]','',rating[2])
17 |   rating<-as.numeric(gsub('%','',rating))
18 |   if(length(rating)<7){
19 |     rating<-c(rating,rep(0,7-length(rating)))
20 |   }
21 |   rating[3:7]<-rating[3:7]/100
22 |   names(rating)<-c('average','votes','stars5','stars4','stars3','stars2','stars1')
23 |   ##the introduction of movie
24 |   intronode <- getNodeSet(p, '//span[@class="all hidden"]')
25 |   if(length(intronode)==0)
26 |     intronode <- getNodeSet(p, '//span[@property="v:summary"]')
27 |   summary<-gsub("[\n ]","",sapply(intronode, xmlValue))
28 |   
29 |   ## tags
30 |   labelinfo<-sapply(getNodeSet(p, '//div[@id="db-tags-section"]//div'), xmlValue)
31 |   if(length(labelinfo)==0)
32 |     labelinfo<-sapply(getNodeSet(p, '//div[@class="tags-body"]//a'), xmlValue)
33 |   labelinfo<-unlist(strsplit(labelinfo,' |\\(|\\)'))
34 |   tags<-data.frame(tag_label=labelinfo[seq(1,length(labelinfo),2)],
35 |                    tag_freq=labelinfo[seq(2,length(labelinfo),2)],
36 |                    stringsAsFactors=F)
37 |   ## total of reviews
38 |   reviews_total<-gsub("[^0-9]","",sapply(getNodeSet(p, '//div[@class="review-more"]//a'), xmlValue))
39 |   #total of comments
40 |   comments_total<-gsub('[^0-9]','',
41 |                       sapply(getNodeSet(p, '//div[@id="comments-section"]//span//a')[1], xmlValue))
42 |   #if(length(short_comments)==0)short_comments<-0
43 |   #if(length(long_comments)==0)long_comments<-0
44 |   ##audience
45 |   audience<-sapply(getNodeSet(p, '//div[@class="subject-others-interests-ft"]//a'), xmlValue)
46 |   audience<-as.integer(gsub('[^0-9]','',audience))
47 |   if(length(audience)==3)
48 |     names(audience)<-c('doings','collections','wishes')
49 | 
50 |   image<-sapply(getNodeSet(p, '//div[@class="indent"]//div[@id="mainpic"]//img'),
51 |                 function(x) xmlGetAttr(x, "src"))
52 | 
53 |   list(title=title,author=author,
54 |        rating=rating,
55 |        summary=summary,
56 |        tags=tags,
57 |        href=u,image=image,
58 |        reviews_total=reviews_total,
59 |        comments_total=comments_total,
60 |        audience=audience,attribute=attribute)
61 | }
62 | ## get_movie_info(movieid=10527209)
63 | 


--------------------------------------------------------------------------------
/R/get_movie_discussions.R:
--------------------------------------------------------------------------------
 1 | #http://movie.douban.com/subject/5308265/discussion/
 2 | ##x<-get_movie_discussions(movieid=5308265,results = 100)
 3 | .get_movie_discussion0<-function(u,fresh,verbose){
 4 |   p<-.refreshURL(u,fresh, verbose)
 5 |   title<-gsub('[\n ]','',sapply(getNodeSet(p, '//head//title'),xmlValue))
 6 |   published<-sapply(getNodeSet(p, '//div[@class="article"]//span[@class="mn"]'),xmlValue)
 7 |   published<-gsub("\n|  ","", published)
 8 |   n1<-getNodeSet(p, '//div[@class="article"]//span[@class="pl2"]//a')
 9 |   author<-gsub("[\n ]","",sapply(n1,xmlValue)[1])
10 |   author_uri<-sapply(n1,function(x) xmlGetAttr(x, "href"))[1]
11 |   dicussion<-sapply(getNodeSet(p, '//div[@class="article"]//span[@class=""]')[1],xmlValue)
12 |   useful<-sapply(getNodeSet(p, '//span[@class="useful"]//em'),xmlValue)
13 |   unuseful<-sapply(getNodeSet(p, '//span[@class="unuseful"]//em'),xmlValue)
14 |   out<-c(dicussion_uri=u,title=title,published=published,author=author,
15 |          author_uri=author_uri,dicussion=dicussion,useful=useful,unuseful=unuseful)
16 |   return(out)
17 | }
18 | #################################################
19 | get_movie_discussions<-function(movieid,results = 100, fresh = 10,count=20, verbose = TRUE,...){
20 |   u = paste0("http://movie.douban.com/subject/", movieid, "/discussion/")
21 |   p <- .refreshURL(u, fresh, verbose)
22 |   total<-gsub('[^0-9]','',sapply(getNodeSet(p, '//span[@class="count"]'),xmlValue))
23 |   if (length(total)==0)
24 |     stop('There is no discussions about this movie.')
25 |   cat('-----There is a tatal of ',total,' movie discussions.-----\n')
26 |   pages<-ceiling(min(results,as.integer(total))/count)
27 |   out <- data.frame(matrix(nrow = pages * count, ncol = 8), stringsAsFactors = F)
28 |   colnames(out) <- c("dicussion_uri", "title", "published", "author", "author_uri", 
29 |                      "dicussion", "useful", "unuseful")
30 |   k=1
31 |   if(pages>0){
32 |     for(pg in 1:pages){
33 |       u=paste0('http://movie.douban.com/subject/',movieid,
34 |                     '/discussion/?start=',(pg-1)*count,'&sort=vote/')
35 |       if(verbose==TRUE) {
36 |         #cat('Getting',(pg-1)*20+1,'--',pg*20,'discussions...\n')
37 |         cat("Getting movie discussion URLS from page=",pg,": ",u,"...\n")
38 |       }
39 |       p <- .refreshURL(u, fresh, verbose)
40 |       n1<-getNodeSet(p ,'//table[@class="olt"]//td/a')
41 |       href<-unique(sapply(n1,function(x) xmlGetAttr(x, "href")))
42 |       href<-href[grep('/discussion/',href)]
43 |       href <- href[!href %in% out$dicussion_uri]
44 |       n=length(href)
45 |       if(n>0){
46 |         for(i in 1:n){
47 |           u0<-href[i]
48 |           if(verbose==TRUE){
49 |             cat("   Getting ", k, " movie discussion from URL: ", u0, " ...\n")
50 |           }
51 |           out0<-.get_movie_discussion0(u=u0,fresh,verbose)
52 |           if(length(out0)==8){
53 |             out[k,]<-out0
54 |             k=k+1
55 |           }
56 |           else{
57 |             cat("  !!!! Getting  failed at URL: ", u0, " \n")
58 |           }       
59 |         }
60 |       }
61 |     }
62 |   }
63 |   out <- out[!is.na(out[, 1]), ]
64 |   return(out)
65 | }


--------------------------------------------------------------------------------
/R/get.book.review.R:
--------------------------------------------------------------------------------
 1 | get.book.review<-function(bookid,results=100,verbose=TRUE){
 2 |   
 3 |   .get_review<-function(bookid,start,results,verbose){
 4 |     u=paste0("http://api.douban.com/book/subject/",bookid,
 5 |              "/reviews?start-index=",start,"&max-results=",results)
 6 |     p=getURL(u)
 7 |     pagetree <- htmlParse(p,asTree=F)
 8 |     
 9 |     review_id<-sapply(getNodeSet(pagetree, '//entry//id'), xmlValue)
10 |     review_id<-gsub("[^0-9]","",review_id)
11 |     #sapply(getNodeSet(pagetree, '//entry//link[@rel="alternate"]'),function(x) xmlGetAttr(x, "href"))
12 |     title<-sapply(getNodeSet(pagetree, '//entry//title'), xmlValue)
13 |     Encoding(title)<-"UTF-8"
14 |     
15 |     author<-sapply(getNodeSet(pagetree, '//entry//author//name'), xmlValue)
16 |     Encoding(author)<-"UTF-8"
17 |     author_id<-sapply(getNodeSet(pagetree, '//entry//author//uri'), xmlValue)
18 |     author_id<-gsub("[^0-9]","",author_id)
19 |     published<-gsub("T|\\+08:00"," ",sapply(getNodeSet(pagetree, '//entry//published'), xmlValue))
20 |     updated<-gsub("T|\\+08:00"," ",sapply(getNodeSet(pagetree, '//entry//updated'), xmlValue))
21 |     comments<-sapply(getNodeSet(pagetree, '//entry//comments'),function(x) xmlGetAttr(x, "value"))
22 |     useless<-sapply(getNodeSet(pagetree, '//entry//useless'),function(x) xmlGetAttr(x, "value"))
23 |     votes<-sapply(getNodeSet(pagetree, '//entry//votes'),function(x) xmlGetAttr(x, "value"))
24 |     #rating<-sapply(getNodeSet(pagetree, '//entry//rating'),function(x) xmlGetAttr(x, "value"))
25 |     n=length(review_id)
26 |     review<-vector(length=n)
27 |     rating<-vector(length=n)
28 |     for(i in 1:n){
29 |       u<-paste0("http://book.douban.com/review/",review_id[i],"/")
30 |       if(verbose==TRUE){
31 |         cat("  Getting review form: ",i," ",u," ..... \n")
32 |       }
33 |       p=getURL(u)
34 |       p <- htmlParse(p,asTree=F)
35 |       review0<-sapply(getNodeSet(p, '//span[@property="v:description"]'), xmlValue)
36 |       review0<-gsub("\r","",review0)
37 |       rating[i]<-sapply(getNodeSet(p, '//span[@property="v:rating"]'), xmlValue)
38 |       
39 |       if(nchar(review0)<2)review0<-NA
40 |       review[i]<-review0
41 |     }
42 |     out<-data.frame(title=title,review_id=review_id,review=review,author=author,
43 |                     author_uri=author_id,published=published,updated=updated,
44 |                     comments=comments,useless=useless,votes=votes,
45 |                     rating=rating,stringsAsFactors=F)
46 |     return(out)
47 |   }
48 |   ####################################
49 |   u=paste0("http://api.douban.com/book/subject/",bookid,
50 |            "/reviews?start-index=1&max-results=1")
51 |   p=htmlParse(getURL(u))
52 |   totalresults<-sapply(getNodeSet(p, '//totalresults'), xmlValue)
53 |   cat("There are a total of ",totalresults," reviews.\n")
54 |   results<-min(results,as.integer(totalresults))
55 |   starts<-(0:floor(results/50))*50+1
56 |   n=length(starts)
57 |   ends<-rep(50,n)
58 |   ends[n]<-results-50*(n-1)
59 |   if(results%%50==0){
60 |     starts<-starts[1:(n-1)]
61 |     ends<-ends[1:(n-1)]
62 |   }
63 |   n=length(starts)
64 |   out<-c()
65 |   for(i in 1:n ){
66 |     cat('Getting page: ',i,"\n")
67 |     out0<-.get_review(bookid,start=starts[i],results=ends[i],verbose)
68 |     out<-rbind(out0,out)
69 |   }
70 |   
71 |   return(out)
72 | }
73 | 


--------------------------------------------------------------------------------
/R/get_music_reviews.R:
--------------------------------------------------------------------------------
 1 | ##http://music.douban.com/subject/3843530/reviews?sort=time
 2 | #x<-get_music_reviews(musicid=3843530,results = 100,verbosr=T)
 3 | .get_music_review0 <- function(u, fresh = 10, verbose = TRUE, ...) {
 4 |   p <- .refreshURL(u = u, fresh, verbose)
 5 |   ## review_id review_id<-gsub('[^0-9]','',u) title
 6 |   title <- sapply(getNodeSet(p, '//head//title'), xmlValue)
 7 |   ## 发表时间
 8 |   published <- sapply(getNodeSet(p, "//span[@property=\"v:dtreviewed\"]"), xmlValue)
 9 |   ## 作者,URI
10 |   author <- sapply(getNodeSet(p, '//span[@property="v:reviewer"]'), xmlValue)
11 |   author_uri <- sapply(getNodeSet(p, '//div[@id="content"]//span[@class="pl2"]//a'), 
12 |                        function(x) xmlGetAttr(x,  "href"))[1]
13 |   ## 评分
14 |   rating <- sapply(getNodeSet(p, "//span[@property=\"v:rating\"]"), xmlValue)
15 |   ## 影评内容
16 |   review <- sapply(getNodeSet(p, "//span[@property=\"v:description\"]"), xmlValue)
17 |   if (length(review) == 0) {
18 |     review <- sapply(getNodeSet(p, "//div[@property=\"v:description\"]"), xmlValue)
19 |   }
20 |   #review <- gsub("\r", "", review)
21 |   ## 有用 & 没用的次数
22 |   x0 <- sapply(getNodeSet(p, "//div[@class=\"main-panel-useful\"]//em"), xmlValue)
23 |   useful = x0[1]
24 |   unuseful = x0[2]
25 |   if (length(x0) == 0 ) {
26 |     useful <- sapply(getNodeSet(p, "//span[@class=\"useful\"]//em"), xmlValue)
27 |     unuseful <- sapply(getNodeSet(p, "//span[@class=\"unuseful\"]//em"), xmlValue)
28 |   }
29 |   out <- c(review_uri = u, title = title, published = published, author = author, author_uri = author_uri, 
30 |            review = review, rating = rating, useful = useful, unuseful = unuseful)
31 |   return(out)
32 | }
33 | ###########################################################################
34 | get_music_reviews<-function(musicid,results = 100, fresh = 10,count=25, verbose = TRUE,...){
35 |   u = paste0('http://music.douban.com/subject/',musicid,'/reviews?sort=time')
36 |   p <- .refreshURL(u, fresh, verbose)
37 |     
38 |   total<-gsub('[^0-9]','',sapply(getNodeSet(p, '//span[@class="count"]'),xmlValue))
39 |   if (length(total)==0)
40 |     stop('There is no reviews about this music.')
41 |   cat('-----There is a tatal of ',total,' music reviews.-----\n')
42 |   pages<-ceiling(min(results,as.integer(total))/count)
43 |   out <- data.frame(matrix(nrow = pages * count, ncol = 9), stringsAsFactors = F)
44 |   colnames(out) <- c("review_uri", "title", "published", "author", "author_uri", "review", 
45 |                      "rating", "useful", "unuseful")
46 |   ## out nrow index
47 |   k = 1
48 |   if(pages>0){
49 |     for (pg in 1:pages) {
50 |       u=paste0('http://music.douban.com/subject/',musicid,'/reviews?sort=time&start=',
51 |                (pg-1)*count,"&filter=&limit=", count)
52 |       if(verbose==TRUE){
53 |         cat(' Getting music comment URLS from page=',pg-1,': ',u,'...\n')
54 |       }
55 |       p <- .refreshURL(u, fresh, verbose)
56 |       n1 <- getNodeSet(p, '//div[@id="content"]//div[@class="article"]//a[@title]')
57 |       href <- unique(sapply(n1, function(x) xmlGetAttr(x, "href")))
58 |       href <- href[grep("/review/", href)]
59 |       href <- href[!href %in% out$review_uri]
60 |       n = length(href)
61 |       if (n > 0) {
62 |         for (i in 1:n) {
63 |           u0 = href[i]
64 |           if (verbose == TRUE) {
65 |             cat("  %%%% Getting ", k, " music review from URL: ", u0, " ...\n")
66 |           }
67 |           out0 <- tryCatch(.get_music_review0(u = u0, fresh, verbose), error = function(e) {
68 |             NULL
69 |           })
70 |           if (length(out0) == 9) {
71 |             out[k, ] <- out0
72 |             k = k + 1
73 |           }
74 |           if (length(out0) != 9) {
75 |             cat("  !!!! Getting  failed at URL: ", u0, " \n")
76 |           }
77 |         }
78 |       }      
79 |     }
80 |   }
81 |   out <- out[!is.na(out[, 1]), ]
82 |   return(out)
83 | }
84 | 


--------------------------------------------------------------------------------
/R/get_movie_reviews.R:
--------------------------------------------------------------------------------
 1 | ##movieid=5308265
 2 | ## m=get_movie_reviews(movieid=5308265,results=50)
 3 | ########################################################
 4 | ### 获取每篇评论的信息
 5 | .get_movie_review0 <- function(u, fresh = 10, verbose = TRUE, ...) {
 6 |   p <- .refreshURL(u = u, fresh, verbose)
 7 |   ## review_id review_id<-gsub('[^0-9]','',u) title
 8 |   title <- sapply(getNodeSet(p, "//div[@id=\"content\"]//span[@property=\"v:summary\"]"), xmlValue)
 9 |   ## 发表时间
10 |   published <- sapply(getNodeSet(p, "//span[@property=\"v:dtreviewed\"]"), xmlValue)
11 |   ## 作者,URI
12 |   author <- sapply(getNodeSet(p, "//span[@property=\"v:reviewer\"]"), xmlValue)
13 |   author_uri <- sapply(getNodeSet(p, "//div[@class=\"main-hd\"]//a[@href]"), function(x) xmlGetAttr(x, 
14 |                                                                                                     "href"))[1]
15 |   ## 评分
16 |   rating <- sapply(getNodeSet(p, "//span[@property=\"v:rating\"]"), xmlValue)
17 |   ## 影评内容
18 |   review <- sapply(getNodeSet(p, "//span[@property=\"v:description\"]"), xmlValue)
19 |   if (length(review) == 0) {
20 |     review <- sapply(getNodeSet(p, "//div[@property=\"v:description\"]"), xmlValue)
21 |   }
22 |   #review <- gsub("\r", "", review)
23 |   ## 有用 & 没用的次数
24 |   x0 <- sapply(getNodeSet(p, "//div[@class=\"main-panel-useful\"]//em"), xmlValue)
25 |   useful = x0[1]
26 |   unuseful = x0[2]
27 |   if (length(useful) == 0 | length(unuseful) == 0) {
28 |     useful <- sapply(getNodeSet(p, "//span[@class=\"useful\"]//em"), xmlValue)
29 |     unuseful <- sapply(getNodeSet(p, "//span[@class=\"unuseful\"]//em"), xmlValue)
30 |   }
31 |   out <- c(review_uri = u, title = title, published = published, author = author, author_uri = author_uri, 
32 |            review = review, rating = rating, useful = useful, unuseful = unuseful)
33 |   return(out)
34 | }
35 | ###########################################################################
36 | #### 
37 | get_movie_reviews <- function(movieid, results = 100, fresh = 10, count = 10, verbose = TRUE, 
38 |                               ...) {
39 |   u = paste0("http://movie.douban.com/subject/", movieid, "/reviews")
40 |   p <- .refreshURL(u, fresh, verbose)
41 |   ## 总评论数
42 |   total <- sapply(getNodeSet(p, "//head//title"), xmlValue)
43 |   total <- unlist(strsplit(total, " "))[-1]
44 |   total <- as.integer(gsub("[^0-9]", "", total[length(total)]))
45 |   
46 |   cat("--------------There is a total of", total, "reviews.------------\n")
47 |   
48 |   pages <- ceiling(min(results, total) * 2/count)
49 |   ## 预定义输出dataFrame大小
50 |   out <- data.frame(matrix(nrow = pages * count, ncol = 9), stringsAsFactors = F)
51 |   colnames(out) <- c("review_uri", "title", "published", "author", "author_uri", "review", 
52 |                      "rating", "useful", "unuseful")
53 |   ## out nrow index
54 |   k = 1
55 |   
56 |   for (pg in 1:pages) {
57 |     u = paste0("http://movie.douban.com/subject/", movieid, "/reviews?start=", (pg - 1) * 
58 |                  count, "&filter=&limit=", count)
59 |     # cat('Getting',(pg-1)*count+1,'--',pg*count,'reviews:',u,' ...\n')
60 |     cat("Getting review URLs from page ", pg, ": ", u, " ...\n")
61 |     p <- .refreshURL(u, fresh, verbose)
62 |     n1 <- getNodeSet(p, "//div[@class=\"review\"]//div[@class=\"review-hd\"]//a[@title]")
63 |     href <- unique(sapply(n1, function(x) xmlGetAttr(x, "href")))
64 |     href <- href[grep("/review/", href)]
65 |     href <- href[!href %in% out$review_uri]
66 |     n = length(href)
67 |     if (n > 0) {
68 |       for (i in 1:n) {
69 |         u0 = href[i]
70 |         
71 |         if (verbose == TRUE) {
72 |           cat("  %%%% Getting ", k, " movie review from URL: ", u0, " ...\n")
73 |         }
74 |         out0 <- tryCatch(.get_movie_review0(u = u0, fresh, verbose), error = function(e) {
75 |           NULL
76 |         })
77 |         if (length(out0) == 9) {
78 |           out[k, ] <- out0
79 |           k = k + 1
80 |         }
81 |         if (length(out0) != 9) {
82 |           cat("  !!!! Getting  failed at URL: ", u0, " \n")
83 |         }
84 |       }
85 |     }
86 |     
87 |   }
88 |   out <- out[!is.na(out[, 1]), ]
89 |   return(out)
90 | }
91 | 
92 |   
93 | 
94 |   
95 |   
96 | 


--------------------------------------------------------------------------------
/R/get.movie.review.R:
--------------------------------------------------------------------------------
  1 | 
  2 | ##################################################
  3 | ## @u:url
  4 | ## @fresh:refresh number
  5 | .get_movie_review1<-function(u,fresh=10,verbose=TRUE){
  6 |   p<- tryCatch(.refreshForm(u,fresh,verbose),error = function(e){NULL})
  7 |   if(is.null(p)){
  8 |     warning("Getting failed:",u,".\n")
  9 |     out<-NULL
 10 |     return(out)
 11 |   }
 12 |   if(!is.null(p)){
 13 |     ##标题
 14 |     title<-sapply(getNodeSet(p, '//head//title'), xmlValue)
 15 |     #sapply(getNodeSet(p, '//body//div[@class="itm"]//span'), xmlValue)
 16 |     ##作者昵称及URI
 17 |     author<-sapply(getNodeSet(p, '//body//div[@class="itm"]//a[@class="founder"]'), xmlValue)
 18 |     author_uri<-sapply(getNodeSet(p, '//body//div[@class="itm"]//a[@class="founder"]'), 
 19 |                        function(x) xmlGetAttr(x, "href"))
 20 |     author_uri<-paste0("http://m.douban.com",author_uri)
 21 |     ##推荐次数
 22 |     votes<-sapply(getNodeSet(p, '//body//span[@class="forbidden"]'), xmlValue)
 23 |     votes<-as.integer(gsub("[^0-9]","",votes))
 24 |     ####
 25 |     st<-sapply(getNodeSet(p, '//body//div[@class="itm"]//span'), xmlValue)
 26 |     ##发表时间
 27 |     published<-st[grep("\\d{4}\\-[0-1]",st)]
 28 |     ##评分
 29 |     rating<-gsub("[^0-9]","",st[grep("\\d{4}\\-[0-1]",st)-1])
 30 |     ####本评论的显示页数
 31 |     pgs<-sapply(getNodeSet(p, '//body//div[@class="paginator"]//span'), xmlValue)
 32 |     pgs<-as.integer(gsub("1/| ","",pgs))
 33 |     #rev<-vector(length=pgs)
 34 |     rev<-NULL
 35 |     ##评论内容
 36 |     rev0<-paste(sapply(getNodeSet(p, '//body//div//p'), xmlValue),collapse="")
 37 |     pa<-'\\(\u8f6c\u4e0b\u9875\\)' ### \\(转下页\\)
 38 |     rev[1]<-gsub(pa,"",rev0)
 39 |     preU<-unlist(strsplit(u,"\\?id="))[1]
 40 |     review_id<-gsub("[^0-9]","",preU)
 41 |     ##分页读取评论内容
 42 |     if(pgs>1){
 43 |       for(pg in 2:pgs){
 44 |         u0<-paste0(preU,"?page=",pg)
 45 |         #p<-tryCatch(htmlParse(postForm(u0)),error = function(e){NULL})
 46 |         p<- tryCatch(.refreshForm(u0,fresh,verbose),error = function(e){NULL})
 47 |         
 48 |         if(!is.null(p)){
 49 |           rev0<-paste(sapply(getNodeSet(p, '//body//div//p'), xmlValue),collapse="")
 50 |           rev[pg]<-gsub(pa,"",rev0)
 51 |         }
 52 |       }
 53 |     }
 54 |     review<-paste(rev,collapse="")  
 55 |     out<-c(review_id=review_id,title=title,author=author,author_uri=author_uri,
 56 |            review=review,published=published,votes=votes,rating=rating)
 57 |    #cat("---length:",length(out),"\n",
 58 |    #     out[c("title","author","author_uri", "published","votes","rating")],'\n')
 59 |     return(out)
 60 |   }
 61 |   
 62 | }
 63 | #####################################################################
 64 | ## movieid:movie id of douban
 65 | ## results:get the number of reviews
 66 | get.movie.review<-function(movieid,results=100,fresh=20,verbose=TRUE){
 67 |   u=paste0("http://m.douban.com/movie/subject/",movieid,"/reviews")
 68 |   p<-htmlParse(postForm(u))
 69 |   ## 评论总数
 70 |   total<-sapply(getNodeSet(p, '//div[@class="title"]//span'), xmlValue)
 71 |   total<-as.integer(gsub("[^0-9]","",total))
 72 |   cat("----------There are a total of ",total," reviews.----------\n")
 73 |   results<-min(total,results)
 74 |   ## 评论屏数
 75 |   pages<-ceiling(results/40)
 76 |   
 77 |   #n1<-getNodeSet(p, '//div[@class="list"]//div[@class="item"]//a')
 78 |   #session<-sapply(n1,function(x) xmlGetAttr(x, "href"))[1]
 79 |   #session<-unlist(strsplit(session,"="))
 80 |   #session<-session[length(session)]
 81 |   ##预定义输出大小
 82 |   out<-data.frame(matrix(nrow=pages*40,ncol=8),stringsAsFactors=F)
 83 |   colnames(out)<-c("review_id","title","author","author_uri","review", 
 84 |                    "published","votes","rating")
 85 |   ## k 为out行标
 86 |   k=1
 87 |   er=1
 88 |   for(i in 1:pages){
 89 |     #http://m.douban.com/movie/subject/3530403/reviews?page=1&session=368f6186
 90 |     #u=paste0("http://m.douban.com/movie/subject/",movieid,
 91 |     #         "/reviews?page=",i,"&session=",session)
 92 |     u=paste0("http://m.douban.com/movie/subject/",movieid,
 93 |              "/reviews?page=",i)
 94 |     if(verbose==TRUE){
 95 |       cat("Retrieving URLs of page ",i, " of reviews from: ",u," .....\n")
 96 |     }
 97 |     #p<-tryCatch(htmlParse(postForm(u)),error = function(e){NULL})
 98 |     p<- tryCatch(.refreshForm(u,fresh,verbose),error = function(e){NULL})
 99 |     if(is.null(p)){
100 |       cat("!!!!!!  Host Forbidden:",u,' .\n')
101 |       next
102 |     }
103 |     n1<-getNodeSet(p, '//div[@class="list"]//div[@class="item"]//a')
104 |     href<-sapply(n1,function(x) xmlGetAttr(x, "href"))
105 |     href<-href[grep("/movie/review/",href)]
106 |     n=length(href)
107 |     for(j in 1:n){
108 |       u0=paste0("http://m.douban.com",href[j])
109 |       if(verbose==TRUE){
110 |         cat("  Retrieving ",k, "reviews from: ",u0," .....\n")
111 |       }
112 |       ### 无法打开页面返回NULL，通常是网络异常,Forbidden
113 |       out0<-tryCatch(.get_movie_review1(u=u0,fresh,verbose),error = function(e){NULL})
114 |       if(length(out0)<8){
115 |         cat(er," ------Getting failed:",u0,".\n")
116 |         #write(out0,file=paste0(er,".txt"))
117 |         er=er+1
118 |       }
119 |       if(length(out0)==8){
120 |         out[k,]<-out0
121 |         k=k+1
122 |       }
123 |       
124 |     }
125 |   }
126 |   out<-out[!is.na(out[,1]),]
127 |   return(out)
128 | }
129 | 


--------------------------------------------------------------------------------
/R/get_book_info.R:
--------------------------------------------------------------------------------
  1 | 
  2 | get_book_info<-function(bookid,...){
  3 |   u=paste0('http://book.douban.com/subject/',bookid,'/')
  4 |   p<-.refreshURL(u)
  5 |   ###title
  6 |   title<-sapply(getNodeSet(p, '//span[@property="v:itemreviewed"]'), xmlValue)
  7 |   ## author ISBN ...
  8 |   author<-gsub('[\n ]','',
  9 |                   sapply(getNodeSet(p, '//div[@id="info"]//span//a'), xmlValue))
 10 |   author<-paste0(author,collapse=' ')
 11 |   att1<-sapply(getNodeSet(p, '//div[@id="info"]//span[@class="pl"]'), xmlValue)
 12 |   att1<-gsub(' ','',att1)
 13 |   att2<-gsub('\n','',sapply(getNodeSet(p, '//div[@id="info"]'), xmlValue))
 14 |   attribute<-list()
 15 |   ind=which(att1=='\u4f5c\u8005')
 16 |   a1=unlist(strsplit(att2,paste0(att1[ind],'|',att1[ind+1])))
 17 |  
 18 |   attribute$author<-a1[2]
 19 |   ind=which(att1=='\u51fa\u7248\u793e:')
 20 |   a1=unlist(strsplit(att2,paste0(att1[ind],'|',att1[ind+1])))
 21 |   attribute$publisher<-a1[2]
 22 |   ind=which(att1=='\u51fa\u7248\u5e74:')
 23 |   a1=unlist(strsplit(att2,paste0(att1[ind],'|',att1[ind+1])))
 24 |   attribute$pubdate<-a1[2]
 25 |   ind=which(att1=='\u9875\u6570:')
 26 |   a1=unlist(strsplit(att2,paste0(att1[ind],'|',att1[ind+1])))
 27 |   attribute$pages<-a1[2]
 28 |   ind=which(att1=='\u5b9a\u4ef7:')
 29 |   a1=unlist(strsplit(att2,paste0(att1[ind],'|',att1[ind+1])))
 30 |   attribute$price<-gsub(' ','',a1[2])
 31 |   ind=which(att1=='\u88c5\u5e27:')
 32 |   a1=unlist(strsplit(att2,paste0(att1[ind],'|',att1[ind+1])))
 33 |   attribute$binding<-gsub(' ','',a1[2])
 34 |   
 35 |   ind=which(att1=='\u526f\u6807\u9898:')
 36 |   if(length(ind)>0){
 37 |     a1=unlist(strsplit(att2,paste0(att1[ind],'|',att1[ind+1])))
 38 |     attribute$subtitle<-gsub(' ','',a1[2])
 39 |   }
 40 |   ind=which(att1=='\u8bd1\u8005')
 41 |   if(length(ind)>0){
 42 |     a1=unlist(strsplit(att2,paste0(att1[ind],'|',att1[ind+1])))
 43 |     attribute$translator<-gsub(' |:','',a1[2])
 44 |   }
 45 |   ind=which(att1=='\u539f\u4f5c\u540d:')
 46 |   if(length(ind)>0){
 47 |     a1=unlist(strsplit(att2,paste0(att1[ind],'|',att1[ind+1])))
 48 |     attribute$origin_title<-gsub(':|  ','',a1[2])
 49 |   }
 50 |   ## rating
 51 |   score<-as.numeric(gsub('[\n ]','',
 52 |                          sapply(getNodeSet(p, '//strong[@property="v:average"]'), xmlValue)))
 53 |   votes<-as.integer(sapply(getNodeSet(p, '//span[@property="v:votes"]'), xmlValue))
 54 |   if(length(votes)==0)votes<-NA
 55 |   rating<-sapply(getNodeSet(p, '//div[@id="interest_sectl"]//div'), xmlValue)
 56 |   rating<-gsub('[\n%]|\\([^\\(\\)]*\\)','',rating)
 57 |   rating<-unlist(strsplit(rating,' '))
 58 |   rating<-rating[nchar(rating)>0]
 59 |   rating<-as.numeric(rating[nchar(rating)>0])[-1]/100
 60 |   if(length(rating)<5)rating<-c(rating,rep(NA,5-length(rating)))
 61 |   rating<-c(score,votes,rating)
 62 |   names(rating)<-c('average','votes','stars5','stars4','stars3','stars2','stars1')
 63 |   
 64 |   medium<-sapply(getNodeSet(p, '//div[@class="indent"]//div[@id="mainpic"]//a'),
 65 |                  function(x) xmlGetAttr(x, "href"))
 66 |   large<-sapply(getNodeSet(p, '//div[@class="indent"]//div[@id="mainpic"]//img'),
 67 |                 function(x) xmlGetAttr(x, "src"))
 68 |   image<-c(medium =medium ,large=large)
 69 |   ##content introduction,author introduction
 70 |   contentinfo<-gsub('[\n ]','',sapply(getNodeSet(p, '//div[@class="intro"]'),
 71 |                                      xmlValue))
 72 |   if(length(contentinfo)>2){
 73 |     pa<-"" ##...\\(展开全部\\)
 74 |     contentinfo<-contentinfo[-grep("...\\(\u5c55\u5f00\u5168\u90e8\\)",contentinfo)]
 75 |   }
 76 |     
 77 |   #clen=length(contentinfo)
 78 |   summary<-contentinfo[1]
 79 |   author_intro<-contentinfo[2]
 80 |   
 81 |   ##labels
 82 |   #labels_amount <- gsub('[^0-9]','',sapply(
 83 |   #  getNodeSet(p, '//div[@id="db-tags-section"]//h2'),xmlValue))
 84 |   labelinfo<-gsub('\n','',
 85 |                   sapply(getNodeSet(p, '//div[@id="db-tags-section"]//div'), xmlValue))
 86 |   labelinfo<-unlist(strsplit(labelinfo,' '))
 87 |   #labelinfo<-gsub('<[^<>]*>|\n','',labelinfo)
 88 |   labelinfo<-labelinfo[nchar(labelinfo)>1]
 89 |  
 90 |   .label_tran<-function(x){
 91 |     x=strsplit(x,'[\\(\\)]')
 92 |     n=length(x)
 93 |     tag_label<-tag_freq<-c()
 94 |     for(i in 1:n){
 95 |       temp<-x[[i]]
 96 |       temp<-temp[nchar(temp)>0]
 97 |       tag_label[i]<-temp[length(temp)-1]
 98 |       tag_freq[i]<-temp[length(temp)]
 99 |     }
100 |    data.frame(tag_label,tag_freq,stringsAsFactors=F) 
101 |   }
102 |   tags<-.label_tran(labelinfo)
103 | 
104 |   ## reader info
105 |   readnode <- getNodeSet(p, '//div[@class="indent"]//p[@class="pl"]//a')
106 |   readers<-as.integer(gsub('[^0-9]','',sapply(readnode, xmlValue)))
107 |   if(length(readers)<3)readers<-c(readers,rep(NA,3-length(readers)))
108 |   names(readers)<-c('doings','collections','wishes')
109 |   ##
110 |   comments_total<-gsub('[^0-9]','',
111 |                        sapply(getNodeSet(p, '//div[@id="reviews"]//h2'), xmlValue))
112 |   notes_total<-gsub('[^0-9]','',sapply(getNodeSet(p, '//div[@class="hd"]'), xmlValue))
113 |   attribute$author_intro<-author_intro
114 |   list(title=title,
115 |        author=author,
116 |        rating=rating,
117 |        href=u,
118 |        summary=summary,
119 |        author_intro=author_intro,
120 |        image=image,
121 |        tags=tags,
122 |        comments_total=comments_total,
123 |        notes_total=notes_total,
124 |        readers=readers,
125 |        attribute=attribute)
126 | }
127 | 


--------------------------------------------------------------------------------
/R/user_movie_status.R:
--------------------------------------------------------------------------------
  1 | ## extract some infomation about a movie
  2 | .extract_movie<-function(x){
  3 |   b=unlist(xmlToList(x))
  4 |   Encoding(b)<-"UTF-8"
  5 |   movieid<-gsub("[^0-9]","",b["div.a..attrs.href"])
  6 |   user_rating<-b[b %in% c("rating1-t","rating2-t",
  7 |                           "rating3-t","rating4-t","rating5-t")]
  8 |   user_rating<-gsub("[^0-9]","",user_rating)
  9 |   if(length(user_rating)==0){user_rating<-NA}
 10 | 
 11 |   watching_date<-b[which(b=="date")-1]
 12 |   #if(length(watching_date)==0)watching_date<-NA
 13 |   tagg<-"\u6807\u7b7e: " ;Encoding(tagg)<-"UTF-8"  ##标签: 
 14 |   user_tags<-gsub(tagg,"",b[which(b=="tags")-1])
 15 |   if(length(user_tags)==0){user_tags<-NA}
 16 |   comment<-gsub("[\n ]","",b[which(b=="comment")-1])
 17 |   if(length(comment)==0){comment<-NA}
 18 |   
 19 |   tmp<-get.movie.info(movieid)
 20 |   title<-tmp$title
 21 |   summary<-tmp$summary
 22 |   author<-paste(tmp$author,collapse=",")
 23 |   average_rating<-tmp$rating[2]
 24 |   image<-tmp$image
 25 |   movie_type<-paste(tmp$attribute$movie_type,collapse=' ')
 26 |   year<-tmp$attribute$year
 27 |   cast<-paste(tmp$attribute$cast,collapse=',')
 28 |   tags<-paste(tmp$tags[,1],collapse=' ')
 29 |   duration<-tmp$attribute$movie_duration[1]
 30 |   if(length(duration)==0){duration<-NA}
 31 |   out<-c(movieid,title,author,user_rating,user_tags,
 32 |          comment,summary,average_rating,image,cast,tags,duration)
 33 |   names(out)<-NULL
 34 |   return(out)
 35 | }
 36 | ########################################################
 37 | ##coll<-.user_movie_what0(userid="",count=15,what="collect")
 38 | ##@what:do,collect,wish 
 39 | .user_movie_what0<-function(userid,count=15,what="collect",verbose=TRUE){
 40 |   u<-paste0('http://movie.douban.com/people/',userid,'/',what)
 41 |   p<-.refreshURL(u)
 42 |   tag_label<-sapply(getNodeSet(p, '//ul[@class="tag-list mb10"]//a'),xmlValue)
 43 |   tag_freq<-as.integer(sapply(getNodeSet(p, '//ul[@class="tag-list mb10"]//span'),
 44 |                               xmlValue))
 45 |   tags<-data.frame(tag_label=tag_label,
 46 |                    tag_freq=tag_freq, stringsAsFactors=F)
 47 |   #####
 48 |   total<-sapply(getNodeSet(p, '//head//title'),xmlValue)
 49 |   total<-substr(total,regexpr("\\([^)]+\\)",total),nchar(total))
 50 |   total<-gsub("[^0-9]","",total)
 51 |   cat("\n--------There is a total of ",total," ",what,"movies.----------\n")
 52 |   pages<-ceiling(as.integer(total)/count)
 53 |   df<-data.frame(matrix(nrow=pages*count,ncol=13),stringsAsFactors=F)
 54 |   colnames(df)<-c("movieid","title","author","user_rating","user_tags",
 55 |                   "comment","summary","average_rating","image",
 56 |                   "cast","tags","duration","watching_date")
 57 |   k=1
 58 |   if(pages>0){
 59 |     for(pg in 1:pages){
 60 |       u<-paste0('http://movie.douban.com/people/',userid,'/',what,
 61 |                 "?start=",(pg-1)*15,"&sort=time&rating=all&filter=all&mode=grid")
 62 |       if(verbose==TRUE){
 63 |         cat("Getting ",what," movies infomation from:",u,"...\n")
 64 |       }
 65 |       p<-.refreshURL(u)
 66 |       node<-getNodeSet(p, '//div[@class="item"]')
 67 |       watching_date<-sapply(getNodeSet(p, '//li//span[@class="date"]'),xmlValue)
 68 |       
 69 |       df0<-t(sapply(node,.extract_movie))
 70 |       df0<-cbind(df0,watching_date)
 71 |       nr=nrow(df0)  
 72 |       df[k:(k+nr-1),]<-df0
 73 |       k=k+nr
 74 |     }
 75 |   }
 76 |   df<- df[!is.na(df[, 1]), ]
 77 |   return(list(tags=tags,df=df))
 78 | }
 79 | 
 80 | ########################################
 81 | ## 获取所有发表过的影评
 82 | .user_movie.review0<-function(userid,count=5,verbose=TRUE){
 83 |   u<-paste0("http://movie.douban.com/people/",userid,"/reviews")
 84 |   p<-.refreshURL(u)
 85 |   total<-sapply(getNodeSet(p, '//head//title'),xmlValue)
 86 |   total<-substr(total,regexpr("\\([^)]+\\)",total),nchar(total))
 87 |   total<-as.integer(gsub("[^0-9]","",total))
 88 |   cat("\n--------There is a total of ",total,"reviews.--------\n")
 89 |   pages<-ceiling(total/count)
 90 |   out <- data.frame(matrix(nrow = pages * count, ncol = 9), 
 91 |                     stringsAsFactors = F)
 92 |   colnames(out) <- c("review_uri", "title", "published", "author", 
 93 |                      "author_uri", "review", "rating", "useful", "unuseful")
 94 |   k = 1
 95 |   if(pages>0){
 96 |     for(pg in 1:pages){
 97 |       u = paste0("http://movie.douban.com/people/", userid, 
 98 |                  "/reviews?start=", (pg - 1) * count)
 99 |       cat("Getting review URLs from page ", pg, ": ", u, " ...\n")
100 |       p <- .refreshURL(u, verbose)
101 |       href <- sapply(getNodeSet(p, '//li[@class="nlst"]//a'), function(x) xmlGetAttr(x, "href"))
102 |       href <- unique(href[grep("/review/", href)])
103 |       href <- href[!href %in% out$review_uri]
104 |       n <- length(href)
105 |       if (n > 0) {
106 |         for (i in 1:n) {
107 |           u0 <- href[i]
108 |           if (verbose == TRUE) {
109 |             cat("   Getting ", k, " movie review from URL: ", u0, " ...\n")
110 |           }
111 |           out0 <- .get_movie_review0(u = u0, verbose)
112 |           if (length(out0) == 9) {
113 |             out[k, ] <- out0
114 |             k <- k + 1
115 |           }
116 |           else {
117 |             cat("  !!!! Getting  failed at URL: ", u0," \n")
118 |           }
119 |         }
120 |       }
121 |     }
122 |   }
123 |   out <- out[!is.na(out[, 1]), ]
124 |   return(out)
125 | }
126 | 
127 | ##############################################
128 | ##@userid
129 | ##@front  
130 | user_movie_status<-function(userid,verbose=TRUE,front=TRUE){
131 |   cat("--------Retrieving information about read movies.--------\n")
132 |   tmp<-.user_movie_what0(userid,count=15,what="collect",verbose)
133 |   collect_tags<-tmp$tags
134 |   collect_df<-tmp$df
135 |   cat("--------Retrieving information about reading movies.--------\n")
136 |   tmp<-.user_movie_what0(userid,count=15,what="do",verbose)
137 |   do_tags<-tmp$tags
138 |   do_df<-tmp$df
139 |   cat("--------Retrieving information about the movies you want to read.--------\n")
140 |   tmp<-.user_movie_what0(userid,count=15,what="wish",verbose)
141 |   wish_tags<-tmp$tags
142 |   wish_df<-tmp$df
143 |   cat("--------Retrieving your published movie reviews. --------\n")
144 |   reviews<-.user_movie.review0(userid,verbose)
145 | 
146 |   collect_images<-list()
147 |   if(front==TRUE){
148 |     if(!require(EBImage)){
149 |       source("http://bioconductor.org/biocLite.R")
150 |       biocLite("EBImage")
151 |       require(EBImage)
152 |     }
153 |     ##cat("--------正在获取已观电影的海报图片--------...\n")
154 |     cat("--------\u6b63\u5728\u83b7\u53d6\u5df2\u89c2\u7535\u5f71\u7684\u6d77\u62a5\u56fe\u7247--------...\n")
155 |     images<-collect_df$image
156 |     m<-length(images)
157 |     for(i in 1:m){
158 |       collect_images[[i]]<-resize(readImage(images[i]),w=60,h=80) 
159 |     }
160 |   }
161 |   list(collect_tags=collect_tags,
162 |        collect_df=collect_df,
163 |        do_tags=do_tags,
164 |        do_df=do_df,
165 |        wish_tags=wish_tags,
166 |        wish_df=wish_df,
167 |        reviews=reviews,
168 |        collect_images=collect_images)
169 | }
170 | 


--------------------------------------------------------------------------------
/R/user_book_status.R:
--------------------------------------------------------------------------------
  1 | ## extract some infomation about a book
  2 | .extract_book<-function(x){
  3 |   b=unlist(xmlToList(x))
  4 |   bookid<-gsub("[^0-9]","",b["h2.a..attrs.href"])
  5 |   pub<-gsub("\n|  ","",b[which(b=="pub")-1])
  6 |   user_rating<-b[b %in% c("rating1-t","rating2-t",
  7 |                           "rating3-t","rating4-t","rating5-t")]
  8 |   user_rating<-gsub("[^0-9]","",user_rating)
  9 |   if(length(user_rating)==0){user_rating<-NA}
 10 |   pa<-"[\n ]|\u8bfb|\u8fc7|\u60f3|\u5728" ##[\n ]|读|过|想|在
 11 |   Encoding(pa)<-"UTF-8"
 12 |   reading_date<-gsub(pa,"",b[which(b=="date")-1])
 13 |   tagg<-"\u6807\u7b7e: " ;Encoding(tagg)<-"UTF-8"  ##标签: 
 14 |   user_tags<-gsub(tagg,"",b[which(b=="tags")-1])
 15 |   if(length(user_tags)==0){user_tags<-NA}
 16 |   comment<-gsub("[\n ]","",b[which(b=="comment")-1])
 17 |   if(length(comment)==0){comment<-NA}
 18 |   tmp<-get_book_info(bookid)
 19 |   title<-tmp$title
 20 |   summary<-tmp$summary
 21 |   author<-paste(tmp$author,collapse=" ")
 22 |   average_rating<-tmp$rating[3]
 23 |   image<-tmp$image["medium"]
 24 |   pages<-tmp$attribute$pages
 25 |   price<-gsub("\u5143","",tmp$attribute$price) #元
 26 |   out<-c(bookid,title,author,user_rating,reading_date,user_tags,pub,
 27 |          comment,summary,average_rating,image,pages,price)
 28 |   names(out)<-NULL
 29 |   return(out)
 30 | }
 31 | ########################################################
 32 | ##coll<-.user_book_what0(userid="qxde01",count=15,what="collect")
 33 | ##@what:do,collect,wish 
 34 | .user_book_what0<-function(userid,count=15,what="collect",verbose=TRUE){
 35 |   u<-paste0('http://book.douban.com/people/',userid,'/',what)
 36 |   p<-.refreshURL(u)
 37 |   tag_label<-sapply(getNodeSet(p, '//ul[@class="tag-list mb10"]//a'),xmlValue)
 38 |   tag_freq<-as.integer(sapply(getNodeSet(p, '//ul[@class="tag-list mb10"]//span'),
 39 |                               xmlValue))
 40 |   tags<-data.frame(tag_label=tag_label,
 41 |                    tag_freq=tag_freq, stringsAsFactors=F)
 42 |   #####
 43 |   ##\\((.*?)\\)  <[^<>]*> \\([^)]+\\) "(\{[^}]+\})(/[^/{}]*/?)?"
 44 |   total<-sapply(getNodeSet(p, '//head//title'),xmlValue)
 45 |   total<-substr(total,regexpr("\\([^)]+\\)",total),nchar(total))
 46 |   total<-gsub("[^0-9]","",total)
 47 |   cat("\n--------There is a total of ",total," ",what,"books.----------\n")
 48 |   pages<-ceiling(as.integer(total)/count)
 49 |   df<-data.frame(matrix(nrow=pages*count,ncol=13),stringsAsFactors=F)
 50 |   colnames(df)<-c("bookid","title","author","user_rating","reading_date",
 51 |                   "user_tags","pub","comment","summary","average_rating",
 52 |                   "image","pages","price")
 53 |   k=1
 54 |   if(pages>0){
 55 |     for(pg in 1:pages){
 56 |       u<-paste0('http://book.douban.com/people/',userid,'/',what,
 57 |                 "?start=",(pg-1)*15,"&sort=time&rating=all&filter=all&mode=grid")
 58 |       if(verbose==TRUE){
 59 |         cat("Getting ",what," book infomation from:",u,"...\n")
 60 |       }
 61 |       p<-.refreshURL(u)
 62 |       node<-getNodeSet(p, '//li[@class="subject-item"]//div[@class="info"]')
 63 |       df0<-t(sapply(node,.extract_book))
 64 |       nr=nrow(df0)  
 65 |       df[k:(k+nr-1),]<-df0
 66 |       k=k+nr
 67 |     }
 68 |   }
 69 |   df<- df[!is.na(df[, 1]), ]
 70 |   return(list(tags=tags,df=df))
 71 | }
 72 | ########################################
 73 | ## 获取所有发表过的书评
 74 | ##myrev<-.user_book.review0(userid="qxde01")
 75 | .user_book.review0<-function(userid,count=5,verbose=TRUE){
 76 |   u<-paste0("http://book.douban.com/people/",userid,"/reviews")
 77 |   p<-.refreshURL(u)
 78 |   total<-sapply(getNodeSet(p, '//head//title'),xmlValue)
 79 |   total<-substr(total,regexpr("\\([^)]+\\)",total),nchar(total))
 80 |   total<-as.integer(gsub("[^0-9]","",total))
 81 |   cat("\n--------There is a total of ",total,"reviews.--------\n")
 82 |   pages<-ceiling(total/count)
 83 |   out <- data.frame(matrix(nrow = pages * count, ncol = 9), 
 84 |                     stringsAsFactors = F)
 85 |   colnames(out) <- c("review_uri", "title", "published", "author", 
 86 |                      "author_uri", "review", "rating", "useful", "unuseful")
 87 |   k = 1
 88 |   if(pages>0){
 89 |     for(pg in 1:pages){
 90 |       u = paste0("http://book.douban.com/people/", userid, 
 91 |                  "/reviews?start=", (pg - 1) * count)
 92 |       cat("Getting review URLs from page ", pg, ": ", u, " ...\n")
 93 |       p <- .refreshURL(u, verbose)
 94 |       href <- sapply(getNodeSet(p, '//div[@class="tlst"]//a'), function(x) xmlGetAttr(x, "href"))
 95 |       href <- unique(href[grep("/review/", href)])
 96 |       href <- href[!href %in% out$review_uri]
 97 |       n <- length(href)
 98 |       if (n > 0) {
 99 |         for (i in 1:n) {
100 |           u0 <- href[i]
101 |           if (verbose == TRUE) {
102 |             cat("   Getting ", k, " movie review from URL: ", u0, " ...\n")
103 |           }
104 |           out0 <- .get_book_review0(u = u0, verbose)
105 |           if (length(out0) == 9) {
106 |             out[k, ] <- out0
107 |             k <- k + 1
108 |           }
109 |           else {
110 |             cat("  !!!! Getting  failed at URL: ", u0," \n")
111 |           }
112 |         }
113 |       }
114 |     }
115 |   }
116 |   out <- out[!is.na(out[, 1]), ]
117 |   return(out)
118 | }
119 | ###########################################
120 | # myann<-.user_book.annotation0(userid="qxde01")
121 | .user_book.annotation0<-function(userid,count=5,verbose=TRUE){
122 |   u<-paste0("http://book.douban.com/people/",userid,"/annotation/")
123 |   p<-.refreshURL(u)
124 |   total<-sapply(getNodeSet(p, '//head//title'),xmlValue)
125 |   total<-substr(total,regexpr("\\([^)]+\\)",total),nchar(total))
126 |   total<-as.integer(gsub("[^0-9]","",total))
127 |   cat("\n--------There is a total of ",total,"annotation.--------\n")
128 |   pages<-ceiling(total/count)
129 |   out <- data.frame(matrix(nrow = pages * count, ncol = 9), 
130 |                     stringsAsFactors = F)
131 |   colnames(out) <- c("note_uri", "title", "published", "author", 
132 |                      "author_uri", "note", "rating", "readers", "collectors")
133 |   k=1
134 |   if(pages>0){
135 |     for(pg in 1:pages){
136 |       #http://book.douban.com/people/Quantum_Panda/annotation/?start=5
137 |       u = paste0("http://book.douban.com/people/", userid, 
138 |                  "/annotation/?start=", (pg - 1) * count)
139 |       cat("Getting note URLs from page", pg, ": ", u, "...\n")
140 |       p <- .refreshURL(u, verbose)
141 |       href <- sapply(getNodeSet(p,'//div[@class="annotations-context"]//a[@href]'), 
142 |                      function(x) xmlGetAttr(x, "href"))
143 |       href <- unique(href[grep(".com/annotation/", href)])
144 |       href <- href[-grep("#comments", href)]
145 |       href <- href[!href %in% out$note_uri]
146 |       n = length(href)
147 |       if (n > 0) {
148 |         for (i in 1:n) {
149 |           u0 <- href[i]
150 |           if (verbose == TRUE) {
151 |             cat("   Getting ", k, " book note from URL: ",  u0, " ...\n")
152 |           }
153 |           out0 <- .get_book_note0(u = u0, fresh, verbose)
154 |           if (length(out0) == 9) {
155 |             out[k, ] <- out0
156 |             k = k + 1
157 |           }
158 |           else {
159 |             cat("  !!!! Getting  failed at URL: ", u0," \n")
160 |           }
161 |         }
162 |       }
163 |     }
164 |   }
165 |   out <- out[!is.na(out[, 1]), ]
166 |   return(out)
167 | }
168 | ##############################################
169 | ##@userid
170 | ##@front  
171 | ## qxde<-user_book_status(userid="qxde01")
172 | ## im<-qxde[[9]]
173 | #png("front.png",width=60*9,height=80*8)
174 | #display(combine(im),method="raster",all=T)
175 | #dev.off()
176 | user_book_status<-function(userid,verbose=TRUE,front=TRUE){
177 |   cat("--------Retrieving information about read books.--------\n")
178 |   tmp<-.user_book_what0(userid,count=15,what="collect",verbose)
179 |   collect_tags<-tmp$tags
180 |   collect_df<-tmp$df
181 |   cat("--------Retrieving information about reading books.--------\n")
182 |   tmp<-.user_book_what0(userid,count=15,what="do",verbose)
183 |   do_tags<-tmp$tags
184 |   do_df<-tmp$df
185 |   cat("--------Retrieving information about the books you want to read.--------\n")
186 |   tmp<-.user_book_what0(userid,count=15,what="wish",verbose)
187 |   wish_tags<-tmp$tags
188 |   wish_df<-tmp$df
189 |   cat("--------Retrieving your published book reviews. --------\n")
190 |   reviews<-.user_book.review0(userid,verbose)
191 |   cat("--------Retrieving your reading notes.--------\n")
192 |   notes<-.user_book.annotation0(userid,verbose)
193 |   
194 |   collect_images<-list()
195 |   if(front==TRUE){
196 |     if(!require(EBImage)){
197 |       source("http://bioconductor.org/biocLite.R")
198 |       biocLite("EBImage")
199 |       require(EBImage)
200 |     }
201 |     ##cat("--------正在获取已读书籍的封面图片--------...\n")
202 |     cat("--------\u6b63\u5728\u83b7\u53d6\u5df2\u8bfb\u4e66\u7c4d\u7684\u5c01\u9762\u56fe\u7247--------...\n")
203 |     images<-collect_df$image
204 |     m<-length(images)
205 |     for(i in 1:m){
206 |      collect_images[[i]]<-resize(readImage(images[i]),w=60,h=80) 
207 |     }
208 |   }
209 |   list(collect_tags=collect_tags,
210 |        collect_df=collect_df,
211 |        do_tags=do_tags,
212 |        do_df=do_df,
213 |        wish_tags=wish_tags,
214 |        wish_df=wish_df,
215 |        reviews=reviews,
216 |        notes=notes,
217 |        collect_images=collect_images)
218 | }
219 | 


--------------------------------------------------------------------------------
/R/user_movie_viz.R:
--------------------------------------------------------------------------------
  1 | ##' 观影综合统计
  2 | MovieStatSummary<-function(x,YEAR=NULL,NR_REVIEW=0){
  3 |   cat(" \u89c2\u5f71\u6982\u8981\u7edf\u8ba1 ......\n")
  4 |   n=nrow(x) #number
  5 |   star5num<-nrow(x[x$user_rating=="5",]) # star5 number
  6 |   amount<-table(x$month)
  7 |   maxMonth<-names(amount[amount==max(amount)])[1]
  8 |   maxAmount<-max(amount) ##观看最多的月份
  9 |   averageAmount<-n/(length(amount))##每月几部
 10 |   duration<-as.double(x$duration)
 11 |   ## 没有时长数据的用平均值代替
 12 |   durationTotal<-n*sum(duration,na.rm = T)/length(duration[!is.na(duration)])##总时长
 13 |   durationAmount<-tapply(as.integer(x$duration),x$month,sum,na.rm = T)
 14 |   maxMonthduration<-names(durationAmount[durationAmount==max(durationAmount)])[1]
 15 |   maxAmountduration<-max(durationAmount) ##时长最大
 16 |   averageAmountduration<-sum(durationAmount)/n ##每月平均观影时长
 17 | 
 18 |   str1<-paste(YEAR,"\u5171\u89c2\u770b\u7535\u5f71",n, 
 19 |               "\u90e8,\u5408\u8ba1\u65f6\u957f",
 20 |               round(durationTotal,digits=0),"\u5206\u949f")
 21 |   str2<-paste("\u4e94\u661f\u597d\u7535\u5f71",star5num,"\u90e8")
 22 |   str3<-paste(maxMonth,"\u6708\u6700\u591a\u89c2\u770b\u4e86",maxAmount,"\u90e8")
 23 |   str4<-paste("\u5e73\u5747\u6bcf\u6708\u89c2\u770b",round(averageAmount,digits=2),"\u90e8")
 24 |   str5<-paste(maxMonthduration,"\u6708\u6700\u591a\u89c2\u770b", 
 25 |               maxAmountduration,"\u5206\u949f,\u5e73\u5747\u6bcf\u6708\u89c2\u770b",
 26 |               round(averageAmountduration, digits = 2),"\u5206\u949f")
 27 |   str6<-paste("\u5171\u53d1\u8868\u5f71\u8bc4",NR_REVIEW, "\u7bc7")
 28 |   #Encoding(str1)<-"UTF-8";Encoding(str2)<-"UTF-8";Encoding(str3)<-"UTF-8"
 29 |   #Encoding(str4)<-"UTF-8";Encoding(str5)<-"UTF-8";Encoding(str6)<-"UTF-8"
 30 |   
 31 |   #png('MovieStatSummary.png',width=720,height=540)
 32 |   op<-par(mar=c(0,0,0,0),bty="n",yaxt="n",xaxt="n")
 33 |   cols<-rainbow(50,start = 0.5)
 34 |   plot(c(0, 1), c(0, 1),xlab="",ylab="",type="n")
 35 |   rect(0, 0,   1, 1, col = c(NA,0),border = NA, lwd = 2)
 36 |   ##我的观影统计 
 37 |   text(0.5,0.95,"\u6211\u7684\u89c2\u5f71\u7edf\u8ba1",cex=4,col="blue",,font = 2)
 38 |   text(0.5,0.8,str1,cex=2,col=cols[1])
 39 |   text(0.5,0.7,str2,cex=2,col=cols[9])
 40 |   text(0.5,0.6,str3,cex=2,col=cols[18])
 41 |   text(0.5,0.5,str4,cex=2,col=cols[27])
 42 |   text(0.5,0.4,str5,cex=2,col=cols[36])
 43 |   if(NR_REVIEW>0){
 44 |     text(0.5,0.3,str6,cex=2,col=cols[45])
 45 |   }
 46 |   ##生活不是电影,但电影是生活,是镜子
 47 |   ##梦想始于剧本,而终结于电影
 48 |   text(0.5,0.2,"\u751f\u6d3b\u4e0d\u662f\u7535\u5f71,\u4f46\u7535\u5f71\u662f\u751f\u6d3b,\u662f\u955c\u5b50",cex=4,col=cols[30],font = 2)
 49 |   text(0.5,0.1,"\u68a6\u60f3\u59cb\u4e8e\u5267\u672c,\u800c\u7ec8\u7ed3\u4e8e\u7535\u5f71",
 50 |        cex=4,col=cols[30], font = 2)
 51 |   par(op)
 52 |   #dev.off()
 53 |   rm(x)
 54 | }
 55 | ####################################
 56 | ##按月统计
 57 | ##'  @x:collect_df$month
 58 | ##statByMonth(collect_df$month)
 59 | MovieStatByMonth<-function(x){
 60 |   ## 根据月份统计观影数量
 61 |   cat(" \u6839\u636e\u6708\u4efd\u7edf\u8ba1\u89c2\u5f71\u6570\u91cf......\n")
 62 |   amount<-table(x)
 63 |   #png('barByMonthMovie.png',width=720,height=540)
 64 |   op<-par(mar=c(2.5,0,2,0))
 65 |   cols<-rainbow(5*length(amount),start = 0.5)[1:length(amount)]
 66 |   barplot(amount,space=0,ylim=c(0,max(amount)+1),col=cols,yaxt="n")
 67 |   text((1:length(amount))-0.5,amount+0.3,labels=amount,col=cols,cex=2,font=2)
 68 |   ## 观影数量by月 
 69 |   title(main=list("\u89c2\u5f71\u6570\u91cfby\u6708",cex=2,col="blue"))
 70 |   par(op)
 71 |   #dev.off()
 72 |   rm(x)
 73 | }
 74 | #####################################
 75 | ## 按时长统计
 76 | ##'  @x:collect_df
 77 | ## statByPage(collect_df[,c("month","pages")])
 78 | MovieStatByPage<-function(x){
 79 |   ## 统计观影时长by月
 80 |   cat(" \u6309\u6708\u7edf\u8ba1\u89c2\u5f71\u65f6\u957f......\n")
 81 |   aver=sum(x$duration,na.rm = T)/length(x$duration[!is.na(x$duration)])
 82 |   x$duration[is.na(x$duration)]<-aver
 83 |   amount<-tapply(as.integer(x$duration),x$month,sum,na.rm = T)
 84 |   amount2<-amount/max(amount)
 85 |   #png('barByPageMovie.png',width=720,height=540)
 86 |   op<-par(mar=c(2.5,0,3,0))
 87 |   cols<-topo.colors(5*length(amount),alpha=0.5)[1:length(amount)]
 88 |   barplot(amount2,space=0,ylim=c(0,1.1),col=cols,yaxt="n")
 89 |   text((1:length(amount))-0.5,amount2+0.05,labels=amount,col=cols,cex=1.5,font=2)
 90 |   title(main=list("\u89c2\u5f71\u65f6\u957fby\u6708(\u5206\u949f)",cex=2,col="blue"))
 91 |   par(op)
 92 |   #dev.off()
 93 |   rm(x)
 94 | }
 95 | ###################################################
 96 | ##' 根据标签的TF进行层次聚类
 97 | ##' clusterCloudByTags(x=qxde$collect_df,k=8)
 98 | MovieclusterCloudByTags<-function(x,k=10,max.words=100){
 99 |   if(!require(wordcloud)){
100 |     install.packages("wordcloud")
101 |     require(wordcloud)
102 |   }
103 |   tagsDtm<-.df2dtm(df=x,content="tags",word.min=1,type='movie')
104 | 
105 |   cat(" \u6839\u636e\u7535\u5f71\u6807\u7b7e\u8fdb\u884c\u5c42\u6b21\u805a\u7c7b......\n")
106 |   diss<-dissimilarity(tagsDtm,method='cosine')
107 |   hc <- hclust(diss, method = "ward")
108 |   #png('hclustByTagsMovie.png',width=720,height=540)
109 |   op<-par(mar=c(0,0,3,0))
110 |   if(k>nrow(x))k=nrow(x)-1
111 |   ## 这些书自动归类 \u8fd9\u4e9b\u4e66\u81ea\u52a8\u5f52\u7c7b
112 |   plot(hc,frame.plot=F,ann=T,hang=0.2,col=4,cex=2,
113 |        main=list('\u7535\u5f71\u5f52\u7c7b',
114 |                  cex=1.5,col="blue"),ylab='',xlab='',sub='')
115 |   if(k>1){
116 |     rect.hclust(hc,k, border = rainbow(k))
117 |   }
118 |   par(op)
119 |   #dev.off()
120 |   
121 |   tagsmat<-as.matrix(tagsDtm)
122 |   freq<-colSums(tagsmat)
123 |   word<-colnames(tagsDtm)
124 |   word<-gsub("<U\\+30FB>","",word)
125 |   df<-data.frame(word=word,freq=freq)
126 |   df<-df[order(df$freq,decreasing=T),]
127 |   row.names(df)<-NULL
128 |   ##cat(" #### 对我的标签绘制wordcloud......\n")
129 |   cat(" \u5bf9\u7535\u5f71\u6807\u7b7e\u7ed8\u5236wordcloud......\n")
130 |   #png('wordcloudByTagsMovie.png',width=720,height=720)
131 |   wordcloud(words=df$word,
132 |             freq=df$freq,
133 |             min.freq=1,scale=c(5,1.4),
134 |             max.words=max.words,
135 |             random.order=F,
136 |             ordered.colors=F,
137 |             colors=topo.colors(length(table(df$freq))))
138 |   ## 我的口味\u6211\u7684\u53e3\u5473
139 |   title(main=list('\u6211\u7684\u53e3\u5473',cex=2,col="blue",font = 2))
140 |   #dev.off()
141 |   rm(x,df,diss,tagsDtm)
142 | }
143 | #########################################
144 | ##' 将数据转化为关系形式
145 | ##'  @x:collect_df[,c("title","user_tags")]
146 | MovieGraphByTag<-function(x){
147 |   if(!require(igraph)){
148 |     install.packages("igraph")
149 |     require(igraph)
150 |   }
151 |   ##cat(" #### 绘制电影标签之间的关系图.....\n")
152 |   cat(" \u7ed8\u5236\u7535\u5f71\u6807\u7b7e\u4e4b\u95f4\u7684\u5173\u7cfb\u56fe.....\n")
153 |   .todf<-function(x){
154 |     x<-as.character(x)
155 |     ## 标签:\u6807\u7b7e:
156 |     taggg<-'\u6807\u7b7e:';Encoding(taggg)<-"UTF-8"
157 |     x<-gsub(taggg,'',c(x[1],x[2]))
158 |     left=x[1]
159 |     right<-unlist(strsplit(x[2],' '))
160 |     right<-right[nchar(right)>0]
161 |     right<-right[right!=left]
162 |     left<-rep(left,length(right))
163 |     cbind(left,right)
164 |   }
165 |   tag_list<-apply(x[,c("title","tags")],1,.todf)
166 |   n=length(tag_list)
167 |   tag<-c()
168 |   for(i in 1:n){
169 |     tag<-rbind(tag,tag_list[[i]])
170 |   }
171 |   g<-graph.data.frame(tag,directed=F)
172 |   title<-unique(x$title)
173 |   col=rep(2,length(V(g)$name))
174 |   col[match(V(g)$name,title)]<-4
175 |   vcex=col;vcex[vcex==4]=2;vcex[vcex==2]=1.5
176 |  # png('MovieGraphByTag.png',width=720,height=720)
177 |   plot(g, layout=layout.fruchterman.reingold, vertex.size=col,
178 |        vertex.label.dist=0, vertex.color=col+1, 
179 |        edge.arrow.size=0.5,vertex.label=V(g)$name,vertex.label.cex=vcex,
180 |        vertex.label.color=col+2)
181 |   ## 电影与标签之间的关系图
182 |   title(main=list("\u7535\u5f71\u4e0e\u6807\u7b7e\u4e4b\u95f4\u7684\u5173\u7cfb\u56fe",
183 |                   font=2,cex=2,col="blue"))
184 |   #dev.off()
185 |   rm(x,tag,g)
186 | }
187 | ####################################
188 | ##
189 | ##collect_df[,c('author','cast')]
190 | ##导演和演员的合作关系
191 | actor2df<-function(x){
192 |   f_split<-function(x){
193 |     left<-unlist(strsplit(as.character(x[1]),split=','))
194 |     right<-unlist(strsplit(as.character(x[2]),','))
195 |     n1=length(left)
196 |     n2=length(right)
197 |     df<-c()
198 |     for(i in 1:n1){
199 |       tmp<-cbind(left=rep(left[i],n2),right=right)
200 |       df<-rbind(df,tmp)
201 |     }
202 |     df
203 |   }
204 |   x[,2]<-gsub('\u7279\u9080\u987e\u95ee\uff1a|\u5236\u7247\u4eba\uff1a|\u7279\u9080\u987e\u95ee\uff1a','',x[,2])
205 |   x<-x[nchar(x[,2])>0,]
206 |   x<-x[nchar(x[,1])>0,]
207 |   n=nrow(x)
208 |   df<-c()
209 |   for (i in 1:n){
210 |     tmp<-f_split(x[i,])
211 |     #cat(i,'\n')
212 |     df<-rbind(df,tmp)
213 |   }
214 |   df<-df[df[,1]!=df[,2],]
215 |   unique(df)
216 | }
217 | ##################################
218 | #' @x:collect_df[,c('author','cast')]
219 | #' actorGraph(x=collect_df[,c('author','cast')])
220 | actorGraph<-function(x){
221 |   cat(' \u5bfc\u6f14\u548c\u6f14\u5458\u7684\u5408\u4f5c\u5173\u7cfb.....\n')
222 |   df<-actor2df(x)
223 |   g<-graph.data.frame(df,directed=F)
224 |   dirctor<-unique(df[,1])
225 |   actor<-unique(df[,2])
226 |   col=rep(2,length(V(g)$name))
227 |   col[match(V(g)$name,dirctor)]<-4
228 |   vcex=col;vcex[vcex==4]=1.2;vcex[vcex==2]=1
229 |   png('actorGraphg.png',width=900,height=900)
230 |   plot(g, layout=layout.fruchterman.reingold, vertex.size=col,
231 |        vertex.label.dist=0, vertex.color=col+1, 
232 |        edge.arrow.size=0.5,vertex.label=V(g)$name,vertex.label.cex=vcex,
233 |        vertex.label.color=col+2)
234 |   title(main=list("\u5bfc\u6f14\u548c\u6f14\u5458\u7684\u5408\u4f5c\u5173\u7cfb",
235 |                   font=2,cex=2,col="blue"))
236 |   dev.off()
237 | }
238 | 
239 | #actorGraph(x)
240 | 
241 | #######################################################
242 | ##电影统计可视化函数
243 | ##'  @x:user_movie_status的输出
244 | ##'  @YEAR:统计年份
245 | ##' load("qxde.rda")
246 | ##' data(stopwords)
247 | ##' user_movie_viz(x=qxde,YEAR="2012",stopwords=stopwords)
248 | user_movie_viz<-function(x,YEAR="2013",stopwords=stopwords){
249 |   ## 正在预处理数据
250 |   cat("\u6b63\u5728\u9884\u5904\u7406\u6570\u636e......\n")
251 |   collect_df<-x$collect_df
252 | 
253 |   collect_df$duration<-as.integer(gsub('\\([^)]+\\)|[^0-9]','',collect_df$duration))
254 |   year<-substr(collect_df$watching_date,1,4)
255 |   month<-substr(collect_df$watching_date,1,7)
256 |   collect_df<-data.frame(collect_df,
257 |                          month=month,year=year,stringsAsFactors=F)
258 |   reviews<-x$reviews
259 |   collect_images<-x$collect_images
260 |   if(!is.null(YEAR)){
261 |     collect_df<-collect_df[year==YEAR,]
262 |     collect_images<-collect_images[year==YEAR]
263 |     r_year<-substr(reviews$published,1,4)
264 |     reviews<-reviews[r_year==YEAR,]
265 |   }
266 |   NR_REVIEW=nrow(reviews)
267 |   
268 |   ## 拼图大小
269 |   n<-length(collect_images)^0.5
270 |   n1<-ceiling(n);n2=n1
271 |   if((n1-1)^2+floor(n)>n^2){n2=floor(n)}
272 |   require(EBImage)
273 |   cat("\u6b63\u5728\u7ed8\u5236\u7edf\u8ba1\u56fe......\n") ##正在绘制统计图
274 |   cat(" \u7ed8\u5236\u7535\u5f71\u6d77\u62a5\u62fc\u56fe......\n")
275 |   front<-combine(x=collect_images)
276 |   png("Moviefront.png",width=64*n1,height=80*n2)
277 |   display(x=front,method="raster",all=T)
278 |   dev.off()
279 |   png("Movie.png",width=640,height=3600)
280 |   op<-par(mfrow=c(6,1),mar=c(2,0,1.5,0))
281 |   ##基本统计
282 |   MovieStatSummary(x=collect_df,YEAR=YEAR,NR_REVIEW)
283 |   ##按月统计
284 |   MovieStatByMonth(x=collect_df$month)
285 |   ##按页数统计
286 |   MovieStatByPage(x=collect_df[,c("month","duration")])
287 |   ## 层次聚类与wordcloud
288 |   MovieclusterCloudByTags(x=collect_df,k=8)
289 |   ##绘制书籍标签之间的关系图
290 |   MovieGraphByTag(x=collect_df[,c("title","tags")])
291 |   dev.off()
292 |   ## 评论关键词wordcloud 
293 |   xx<-collect_df[!is.na(collect_df$comment),]
294 |   if (length(nrow(xx))>0){
295 |     wordcloudByComment(x=xx,stopwords=stopwords,filename='MoviewordcloudByComment')
296 |   }
297 |   ## 导演和演员的合作关系
298 |   actorGraph(x=collect_df[,c('author','cast')])
299 |   ## 信息图存放位置：\u4fe1\u606f\u56fe\u5b58\u653e\u4f4d\u7f6e
300 |   cat("\u4fe1\u606f\u56fe\u5b58\u653e\u4f4d\u7f6e:",getwd(),"\n")
301 |   cat("\u751f\u6210\u7684\u56fe\u7247\u4e3a:\n",
302 |       dir(getwd(),"png"),"\n") #生成的图片为:
303 | }
304 | 


--------------------------------------------------------------------------------
/R/user_book_viz.R:
--------------------------------------------------------------------------------
  1 | ##' source('user_book_viz.R',encoding='UTF-8')
  2 | ##' 阅读综合统计
  3 | ##' @x:collect_df
  4 | ##' @YEAR:统计年份
  5 | ##' @NR_REVIEW:书评篇数
  6 | ##' @NR_NOTE:笔记篇数
  7 | ##' statSummary(x=collect_df2)
  8 | statSummary<-function(x,YEAR=NULL,NR_REVIEW=0,NR_NOTE=0){
  9 |   #cat(" #### 阅读综合统计......\n")
 10 |   cat(" #### \u9605\u8bfb\u7efc\u5408\u7edf\u8ba1......\n")
 11 |   n=nrow(x) #number
 12 |   star5num<-nrow(x[x$user_rating=="5",]) # star5 number
 13 |   amount<-table(x$month)
 14 |   maxMonth<-names(amount[amount==max(amount)])
 15 |   maxAmount<-max(amount)
 16 |   averageAmount<-(length(amount)*30.42)/sum(amount)
 17 |   priceTotal<-sum(as.double(x$price),na.rm = T)
 18 |   pageAmount<-tapply(as.integer(x$pages),x$month,sum,na.rm = T)
 19 |   maxMonthPage<-names(pageAmount[pageAmount==max(pageAmount)])[1]
 20 |   maxAmountPage<-max(pageAmount)[1]
 21 |   averageAmountPage<-sum(pageAmount)/(length(pageAmount)*30.42)
 22 |   ###############################################
 23 |   ##' 共阅读过,\u5171\u9605\u8bfb\u8fc7
 24 |   ##' 本书,总价值 \u672c\u4e66,\u603b\u4ef7\u503c
 25 |   ##'str1<-paste(YEAR,"共阅读过",n,"本书,总价值",priceTotal,"￥")
 26 |   ##' 五星好书 \u4e94\u661f\u597d\u4e66,本 \u672c
 27 |   ##' str2<-paste("五星好书",star5num,"本")
 28 |   ##' 月最勤快读了,\u6708\u6700\u52e4\u5feb\u8bfb\u4e86
 29 |   ##' str3<-paste(maxMonth,"月最勤快读了",maxAmount,"本")
 30 |   ##' str4<-paste("平均",round(averageAmount,digits=2),"天读一本书")
 31 |   ##' str5<-paste(maxMonthPage,"月最多读了",maxAmountPage, "页,平均每天读",
 32 |   ##'             round(averageAmountPage, digits = 2),"页,总共",
 33 |   ##'             sum(pageAmount),"页")
 34 |   ##' str6<-paste("发表书评",NR_REVIEW,"篇,读书笔记",NR_NOTE,"篇")
 35 |   ###############################################
 36 |   str1<-paste(YEAR,"\u5171\u9605\u8bfb\u8fc7",n,
 37 |               "\u672c\u4e66,\u603b\u4ef7\u503c",priceTotal,"\uffe5")
 38 |   str2<-paste("\u4e94\u661f\u597d\u4e66",star5num,"\u672c")
 39 |   str3<-paste(maxMonth,"\u6708\u6700\u52e4\u5feb\u8bfb\u4e86",maxAmount,"\u672c")
 40 |   str4<-paste("\u5e73\u5747",round(averageAmount,digits=2),"\u5929\u8bfb\u4e00\u672c\u4e66")
 41 |   str5<-paste(maxMonthPage,"\u6708\u6700\u591a\u8bfb\u4e86",maxAmountPage,
 42 |               "\u9875,\u5e73\u5747\u6bcf\u5929\u8bfb",
 43 |               round(averageAmountPage, digits = 2),"\u9875,\u603b\u5171",
 44 |               sum(pageAmount),"\u9875")
 45 |   str6<-paste("\u53d1\u8868\u4e66\u8bc4",NR_REVIEW,
 46 |               "\u7bc7,\u8bfb\u4e66\u7b14\u8bb0",NR_NOTE,"\u7bc7")
 47 |   Encoding(str1)<-"UTF-8";Encoding(str2)<-"UTF-8";Encoding(str3)<-"UTF-8"
 48 |   Encoding(str4)<-"UTF-8";Encoding(str5)<-"UTF-8";Encoding(str6)<-"UTF-8"
 49 | 
 50 |   png('statSummary.png',width=720,height=540)
 51 |   op<-par(mar=c(0,0,0,0),bty="n",yaxt="n",xaxt="n")
 52 |   cols<-rainbow(50,start = 0.5)
 53 |   plot(c(0, 1), c(0, 1),xlab="",ylab="",type="n")
 54 |   rect(0, 0,   1, 1, col = c(NA,0),border = NA, lwd = 2)
 55 |   ##我的阅读统计 \u6211\u7684\u9605\u8bfb\u7edf\u8ba1
 56 |   text(0.5,0.95,"\u6211\u7684\u9605\u8bfb\u7edf\u8ba1",cex=4,col="blue",,font = 2)
 57 |   text(0.5,0.8,str1,cex=2,col=cols[1])
 58 |   text(0.5,0.7,str2,cex=2,col=cols[5])
 59 |   text(0.5,0.6,str3,cex=2,col=cols[10])
 60 |   text(0.5,0.5,str4,cex=2,col=cols[15])
 61 |   text(0.5,0.4,str5,cex=2,col=cols[20])
 62 |   if(NR_REVIEW+NR_NOTE>0){
 63 |     text(0.5,0.3,str6,cex=2,col=cols[25])
 64 |   }
 65 |   ##不积跬步,无以至千里\u4e0d\u79ef\u8dec\u6b65,\u65e0\u4ee5\u81f3\u5343\u91cc
 66 |   ##不积小流,无以成江海 \u4e0d\u79ef\u5c0f\u6d41,\u65e0\u4ee5\u6210\u6c5f\u6d77
 67 |   text(0.5,0.2,"\u4e0d\u79ef\u8dec\u6b65,\u65e0\u4ee5\u81f3\u5343\u91cc",cex=4,col=cols[30],font = 2)
 68 |   text(0.5,0.1,"\u4e0d\u79ef\u5c0f\u6d41,\u65e0\u4ee5\u6210\u6c5f\u6d77",
 69 |        cex=4,col=cols[30], font = 2)
 70 |   par(op)
 71 |   dev.off()
 72 |   rm(x)
 73 | }
 74 | ####################################
 75 | ##按月统计
 76 | ##'  @x:collect_df$month
 77 | ##statByMonth(collect_df$month)
 78 | statByMonth<-function(x){
 79 |   ## 根据月份统计阅读量
 80 |   cat(" #### \u6839\u636e\u6708\u4efd\u7edf\u8ba1\u9605\u8bfb\u91cf......\n")
 81 |   amount<-table(x)
 82 |   png('barByMonth.png',width=720,height=540)
 83 |   op<-par(mar=c(2.5,0,2,0))
 84 |   cols<-rainbow(5*length(amount),start = 0.5)[1:length(amount)]
 85 |   barplot(amount,space=0,ylim=c(0,max(amount)+1),col=cols,yaxt="n")
 86 |   text((1:length(amount))-0.5,amount+0.3,labels=amount,col=cols,cex=2,font=2)
 87 |   ## 阅读书籍数量by月 \u9605\u8bfb\u4e66\u7c4d\u6570\u91cfby\u6708
 88 |   title(main=list("\u9605\u8bfb\u4e66\u7c4d\u6570\u91cfby\u6708",cex=2,col="blue"))
 89 |   par(op)
 90 |   dev.off()
 91 |   rm(x)
 92 | }
 93 | #####################################
 94 | ## 按页数统计
 95 | ##'  @x:collect_df
 96 | ## statByPage(collect_df[,c("month","pages")])
 97 | statByPage<-function(x){
 98 |   ## 统计阅读页数
 99 |   cat(" #### \u7edf\u8ba1\u9605\u8bfb\u9875\u6570......\n")
100 |   amount<-tapply(as.integer(x$pages),x$month,sum,na.rm = T)
101 |   amount2<-amount/max(amount)
102 |   png('barByPage.png',width=720,height=540)
103 |   op<-par(mar=c(2.5,0,3,0))
104 |   cols<-topo.colors(5*length(amount),alpha=0.5)[1:length(amount)]
105 |   barplot(amount2,space=0,ylim=c(0,1.1),col=cols,yaxt="n")
106 |   text((1:length(amount))-0.5,amount2+0.05,labels=amount,col=cols,cex=1.5,font=2)
107 |   ## 阅读书籍页数by月
108 |   title(main=list("\u9605\u8bfb\u4e66\u7c4d\u9875\u6570by\u6708",cex=2,col="blue"))
109 |   par(op)
110 |   dev.off()
111 |   rm(x)
112 | }
113 | 
114 | #############################################
115 | ##' 将df转化为DocumentTermMatrix
116 | .df2dtm<-function(df,content='word',word.min=2,type='book'){
117 |   if(!require(tm)){
118 |     install.packages("tm")
119 |     require(tm)
120 |   }
121 |   if(type=='book'){
122 |     df <- data.frame(contents = as.character(df[,content]), 
123 |                      id = as.character(df$bookid), 
124 |                      heading = as.character(df$author),
125 |                      origin=as.character(df$title),
126 |                      stringsAsFactors = F)
127 |   }
128 |   else if(type=='movie'){
129 |     df <- data.frame(contents = as.character(df[,content]), 
130 |                      id = as.character(df$movieid), 
131 |                      heading = as.character(df$author),
132 |                      origin=as.character(df$title),
133 |                      stringsAsFactors = F)
134 |   }
135 | 
136 |   m <- list(Content = "contents", Heading = "heading",
137 |             ID = "origin",Origin="id")
138 |   myReader <- readTabular(mapping = m, language = "Zh_cn")
139 |   corpus <- Corpus(DataframeSource(df), 
140 |                    readerControl = list(reader = myReader,language = "Zh_cn"))
141 |   dtm <- DocumentTermMatrix(corpus, control = list(wordLengths = c(word.min, Inf)))
142 |   rowTotals <- apply(dtm , 1, sum) 
143 |   dtm <- dtm[rowTotals> 0]
144 |   return(dtm)
145 | }
146 | ###################################################
147 | ##' 根据标签的TF进行层次聚类
148 | ##' clusterCloudByTags(x=qxde$collect_df,k=8)
149 | clusterCloudByTags<-function(x,k=8,max.words=100){
150 |   if(!require(wordcloud)){
151 |     install.packages("wordcloud")
152 |     require(wordcloud)
153 |   }
154 |   tagsDtm<-.df2dtm(df=x,content="user_tags",word.min=1)
155 |   ## tagsDtm<-weightTfIdf(tagsDtm)
156 |   ##cat(" #### 根据我的标签对书籍进行层次聚类......\n")
157 |   cat(" #### \u6839\u636e\u6211\u7684\u6807\u7b7e\u5bf9\u4e66\u7c4d\u8fdb\u884c\u5c42\u6b21\u805a\u7c7b......\n")
158 |   diss<-dissimilarity(tagsDtm,method='cosine')
159 |   hc <- hclust(diss, method = "ward")
160 |   png('hclustByTags.png',width=720,height=540)
161 |   op<-par(mar=c(0,0,3,0))
162 |   if(k>nrow(x))k=nrow(x)-1
163 |   ## 这些书自动归类 \u8fd9\u4e9b\u4e66\u81ea\u52a8\u5f52\u7c7b
164 |   plot(hc,frame.plot=F,ann=T,hang=0.2,col=4,
165 |        main=list('\u8fd9\u4e9b\u4e66\u81ea\u52a8\u5f52\u7c7b',
166 |                  cex=2,col="blue"),ylab='',xlab='',sub='')
167 |   if(k>1){
168 |     rect.hclust(hc,k, border = rainbow(k))
169 |   }
170 |   
171 |   par(op)
172 |   dev.off()
173 |   
174 |   tagsmat<-as.matrix(tagsDtm)
175 |   freq<-colSums(tagsmat)
176 |   word<-colnames(tagsDtm)
177 |   word<-gsub("<U\\+30FB>","",word)
178 |   df<-data.frame(word=word,freq=freq)
179 |   df<-df[order(df$freq,decreasing=T),]
180 |   row.names(df)<-NULL
181 |   ##cat(" #### 对我的标签绘制wordcloud......\n")
182 |   cat(" #### \u5bf9\u6211\u7684\u6807\u7b7e\u7ed8\u5236wordcloud......\n")
183 |   png('wordcloudByTags.png',width=720,height=720)
184 |   wordcloud(words=df$word,
185 |             freq=df$freq,
186 |             min.freq=1,scale=c(5,1.4),
187 |             max.words=max.words,
188 |             random.order=F,
189 |             ordered.colors=F,
190 |             colors=topo.colors(length(table(df$freq))))
191 |   ## 我的口味\u6211\u7684\u53e3\u5473
192 |   title(main=list('\u6211\u7684\u53e3\u5473',cex=2,col="blue",font = 2))
193 |   dev.off()
194 |   rm(x,df,diss,tagsDtm)
195 | }
196 | #########################################
197 | ##' 将数据转化为关系形式
198 | ##'  @x:collect_df[,c("title","user_tags")]
199 | graphByTag<-function(x){
200 |   if(!require(igraph)){
201 |     install.packages("igraph")
202 |     require(igraph)
203 |   }
204 |   ##cat(" #### 绘制书籍标签之间的关系图.....\n")
205 |   cat(" #### \u7ed8\u5236\u4e66\u7c4d\u6807\u7b7e\u4e4b\u95f4\u7684\u5173\u7cfb\u56fe.....\n")
206 |   .todf<-function(x){
207 |     x<-as.character(x)
208 |     ## 标签:\u6807\u7b7e:
209 |     taggg<-'\u6807\u7b7e:';Encoding(taggg)<-"UTF-8"
210 |     x<-gsub(taggg,'',c(x[1],x[2]))
211 |     left=x[1]
212 |     right<-unlist(strsplit(x[2],' '))
213 |     right<-right[nchar(right)>0]
214 |     right<-right[right!=left]
215 |     left<-rep(left,length(right))
216 |     cbind(left,right)
217 |   }
218 |   tag_list<-apply(x[,c("title","user_tags")],1,.todf)
219 |   n=length(tag_list)
220 |   tag<-c()
221 |   for(i in 1:n){
222 |     tag<-rbind(tag,tag_list[[i]])
223 |   }
224 |   g<-graph.data.frame(tag,directed=F)
225 |   title<-unique(x$title)
226 |   col=rep(2,length(V(g)$name))
227 |   col[match(V(g)$name,title)]<-4
228 |   vcex=col;vcex[vcex==4]=1.2;vcex[vcex==2]=1
229 |   png('graphByTag.png',width=720,height=720)
230 |   plot(g, layout=layout.fruchterman.reingold, vertex.size=col,
231 |        vertex.label.dist=0, vertex.color=col+1, 
232 |        edge.arrow.size=0.5,vertex.label=V(g)$name,vertex.label.cex=vcex,
233 |        vertex.label.color=col+2)
234 |   ## 书籍与标签之间的关系图
235 |   title(main=list("\u4e66\u7c4d\u4e0e\u6807\u7b7e\u4e4b\u95f4\u7684\u5173\u7cfb\u56fe",
236 |                   font=2,col="blue"))
237 |   dev.off()
238 |   rm(x,tag,g)
239 | }
240 | 
241 | #######################################
242 | ##' 分词函数
243 | .f_seg<-function(x,stopwords=NULL){
244 |   if(!require(Rwordseg)){
245 |     install.packages("Rwordseg", repos = "http://R-Forge.R-project.org", type = "source")
246 |     require(Rwordseg)
247 |   }
248 |   seg<-segmentCN(x)
249 |   if(!is.null(stopwords)){
250 |     seg<-seg[!seg %in% stopwords]
251 |   }
252 |   seg<-seg[nchar(seg)>1]
253 |   word<-paste(seg,collapse=' ')
254 |   word
255 | }
256 | 
257 | #########################################
258 | ##' 对评论进行分词，绘制wordcloud
259 | ##' wordcloudByComment(x=qxde$collect_df)
260 | ## load("stopwords.rda")
261 | wordcloudByComment<-function(x,max.words=100,stopwords,filename='wordcloudByComment'){
262 |   if(!require(wordcloud)){
263 |     install.packages("wordcloud")
264 |     require(wordcloud)
265 |   }
266 |   ##cat(" #### 对评论数据进行分词......\n")
267 |   cat(" #### \u5bf9\u8bc4\u8bba\u6570\u636e\u8fdb\u884c\u5206\u8bcd......\n")
268 |   x<-x[!is.na(x$comment),]
269 |   word<-sapply(x$comment,.f_seg,stopwords)
270 |   names(word)<-NULL
271 |   word<-unlist(strsplit(word," "))
272 |   tmp<-sort(table(word),decreasing=T)
273 |   df<-data.frame(word=names(tmp),freq=tmp[])
274 |   #gsub(':','-',substr(Sys.time(),12,20))
275 |   png(paste0(filename,'.png',sep=''),width=720,height=720)
276 |   #par(bg='gray90')
277 |   ##cat(" #### 对评论关键词绘制wordcloud......\n")
278 |   cat(" #### \u5bf9\u8bc4\u8bba\u5173\u952e\u8bcd\u7ed8\u5236wordcloud......\n")
279 |   wordcloud(words=df$word,
280 |             freq=df$freq,
281 |             min.freq=1,scale=c(5,1.4),
282 |             max.words=max.words,
283 |             random.order=F,
284 |             ordered.colors=F,
285 |             colors=rainbow(length(table(df$freq))))
286 |   ## 我的评论关键词\u6211\u7684\u8bc4\u8bba\u5173\u952e\u8bcd 
287 |   title(main=list('\u6211\u7684\u5410\u69fd',
288 |                   cex=2,col="blue",font = 2))
289 |   dev.off()
290 |   rm(x,df,word)
291 | }
292 | #######################################################
293 | ##阅读统计可视化函数
294 | ##'  @x:user_book_status的输出
295 | ##'  @YEAR:统计年份
296 | ##' load("qxde.rda")
297 | ##' data(stopwords)
298 | ##' user_book_viz(x=qxde,YEAR="2012",stopwords=stopwords,back=TRUE)
299 | user_book_viz<-function(x,YEAR="2013",stopwords=stopwords,back=FALSE){
300 |   ## 正在预处理数据
301 |   cat("\u6b63\u5728\u9884\u5904\u7406\u6570\u636e......\n")
302 |   collect_df<-x$collect_df
303 |   collect_df$reading_date<-gsub("[\n \u8bfb\u8fc7\u60f3\u5728]","",collect_df$reading_date)
304 |   collect_df$price<-gsub("\u5143","",collect_df$price)
305 |   ## 标签: \u6807\u7b7e:
306 |   collect_df$user_tags<-gsub("\u6807\u7b7e: ","",collect_df$user_tags)
307 |   year<-substr(collect_df$reading_date,1,4)
308 |   month<-substr(collect_df$reading_date,1,7)
309 |   collect_df<-data.frame(collect_df,
310 |                          month=month,year=year,stringsAsFactors=F)
311 |   reviews<-x$reviews
312 |   notes<-x$notes
313 |   collect_images<-x$collect_images
314 |   if(!is.null(YEAR)){
315 |     collect_df<-collect_df[year==YEAR,]
316 |     collect_images<-collect_images[year==YEAR]
317 |     r_year<-substr(reviews$published,1,4)
318 |     n_year<-substr(notes$published,1,4)
319 |     reviews<-reviews[r_year==YEAR,]
320 |     notes<-notes[n_year==YEAR,]
321 |   }
322 |   NR_NOTE<-nrow(notes)
323 |   NR_REVIEW=nrow(reviews)
324 |   
325 |   ## 拼图大小
326 |   n<-length(collect_images)^0.5
327 |   n1<-ceiling(n);n2=n1
328 |   if((n1-1)^2+floor(n)>n^2){n2=floor(n)}
329 |   require(EBImage)
330 |   cat("\u6b63\u5728\u7ed8\u5236\u7edf\u8ba1\u56fe......\n") ##正在绘制统计图
331 |   ##cat(" #### 绘制书籍封面拼图......\n")
332 |   cat(" #### \u7ed8\u5236\u4e66\u7c4d\u5c01\u9762\u62fc\u56fe......\n")
333 |   front<-combine(x=collect_images)
334 |   png("front.png",width=64*n1,height=80*n2)
335 |   display(x=front,method="raster",all=T)
336 |   dev.off()
337 |   ##基本统计
338 |   statSummary(x=collect_df,YEAR=YEAR,NR_REVIEW,NR_NOTE)
339 |   ##按月统计
340 |   statByMonth(x=collect_df$month)
341 |   ##按页数统计
342 |   statByPage(x=collect_df[,c("month","pages")])
343 |   ## 层次聚类与wordcloud
344 |   clusterCloudByTags(x=collect_df,k=8)
345 |   ##绘制书籍标签之间的关系图
346 |   graphByTag(x=collect_df[,c("title","user_tags")])
347 |   ## 评论关键词wordcloud 
348 |   wordcloudByComment(x=collect_df,stopwords=stopwords)
349 |   stat_base<-readImage("statSummary.png");h_base<-ncol(stat_base)
350 |   stat_month<-readImage("barByMonth.png");h_month<-ncol(stat_month)
351 |   stat_page<-readImage("barByPage.png");h_page<-ncol(stat_page)
352 |   tag_clust<-readImage("hclustByTags.png");h_clust<-ncol(tag_clust)
353 |   tag_cloud<-readImage("wordcloudByTags.png");h_tagcloud<-ncol(tag_cloud)
354 |   tag_graph<-readImage('graphByTag.png');h_graph<-ncol(tag_graph)
355 |   comment_cloud<-readImage("wordcloudByComment.png");h_cmt<-ncol(comment_cloud)
356 |   im_front<-readImage("front.png")
357 |   im_front<-resize(im_front,720,860);h_front<-ncol(im_front)
358 |   #bigImage<-array(dim=c(720,h1+h2+h3+h4+h5+h6+h7+h8,3))
359 |   bigImage<-array(dim=c(720,h_base+h_month+h_page+h_clust+
360 |                           h_tagcloud,3))
361 |   ### 添加背景色
362 |   if(back==TRUE){
363 |     ## 添加背景色
364 |     cat("\u6dfb\u52a0\u80cc\u666f\u8272......\n")
365 |     plum = readImage(system.file("images", "plum.jpg", package="Rdouban"))
366 |      plum<-resize(plum,720,h_base)
367 |      stat_base<-0.8*stat_base+0.2*plum
368 |      writeImage(stat_base,"statSummary2.png") 
369 |       
370 |      orchid<-readImage(system.file("images", "orchid.jpg", package="Rdouban"))
371 |      orchid<-resize(orchid,720,h_month)
372 |      stat_month<-0.8*stat_month+0.2*orchid
373 |      writeImage(stat_month,"stat_month2.png") 
374 |      ######
375 |      peony<-readImage(system.file("images", "peony.jpg", package="Rdouban"))
376 |      peony<-resize(peony,720,h_page)
377 |      stat_page<-0.9*stat_page+0.1*peony
378 |      writeImage(stat_page,"stat_page2.png") 
379 |       
380 |      peach<-readImage(system.file("images", "peach.jpg", package="Rdouban"))
381 |      peach<-resize(peach,720,h_clust)
382 |      tag_clust<-0.8*tag_clust+0.2*peach
383 |      writeImage(tag_clust,"tag_clust2.png") 
384 |       
385 |      #bamboo<-readImage(system.file("images", "bamboo.jpg", package="Rdouban"))
386 |      im_front<-resize(im_front,720,h_front)
387 |      #im_front<-im_front*0.8+0.2*bamboo
388 |      #writeImage(im_front,"front2.png")
389 |     
390 |     fan<-readImage(system.file("images", "fan.jpg", package="Rdouban"))
391 |     fan<-resize(fan,720,h_graph)
392 |     tag_graph<-0.8*tag_graph+0.2*fan
393 |     writeImage(tag_graph,"graphByTag2.png")
394 |     
395 |     rm(plum,orchid,peony,peach,fan)
396 |     gc()
397 |   }
398 |   cat("\u5408\u5e76\u4e3a\u5927\u56fe......\n")##合并为大图
399 |   #h_base+h_month+h_page+h_clust+h_tagcloud
400 |   bigImage<-as.Image(bigImage);colorMode(bigImage)<-2
401 |   bigImage[,1:h_base,]<-stat_base
402 |   bigImage[,(h_base+1):(h_month+h_base),]<-stat_month
403 |   bigImage[,(h_base+h_month+1):(h_page+h_month+h_base),]<-stat_page
404 |   bigImage[,(h_base+h_month+h_page+1):(h_clust+h_page+h_month+h_base),]<-tag_clust
405 |   bigImage[,(h_base+h_month+h_page+h_clust+1):(h_tagcloud+h_clust+h_page+h_month+h_base),]<-tag_cloud
406 |   #bigImage[,(h1+h2+h3+h4+h5+1):(h6+h5+h4+h3+h2+h1),]<-tag_graph
407 |   #bigImage[,(h1+h2+h3+h4+h5+h6+1):(h7+h6+h5+h4+h3+h2+h1),]<-comment_cloud
408 |   #bigImage[,(h1+h2+h3+h4+h5+h6+h7+1):(h8+h7+h6+h5+h4+h3+h2+h1),]<-im_front
409 |   rm(stat_base,stat_month,stat_page,tag_cloud,tag_clust)
410 |   gc()
411 |   writeImage(bigImage, files="bigImage.png", quality = 85)
412 |   rm(bigImage)
413 |   bigImage2<-array(dim=c(720,h_cmt+h_graph+h_front,3))
414 |   bigImage2<-as.Image(bigImage2);colorMode(bigImage2)<-2
415 |   bigImage2[,1:h_cmt,]<-comment_cloud
416 |   bigImage2[,(1+h_cmt):(h_cmt+h_graph),]<-tag_graph
417 |   bigImage2[,(1+h_cmt+h_graph):(h_cmt+h_graph+h_front),]<-im_front
418 |   writeImage(bigImage2, files="bigImage2.png", quality = 85)
419 |   rm(tag_graph,comment_cloud,im_front,bigImage2)
420 |   ## 信息图存放位置：\u4fe1\u606f\u56fe\u5b58\u653e\u4f4d\u7f6e
421 |   cat("\u4fe1\u606f\u56fe\u5b58\u653e\u4f4d\u7f6e:",getwd(),"\n")
422 |   cat("\u751f\u6210\u7684\u56fe\u7247\u4e3a:\n",
423 |       dir(getwd(),"png"),"\n") #生成的图片为:
424 | }
425 | 


--------------------------------------------------------------------------------