/**
 * Shared crawler settings: the Douban entry points plus two numeric limits.
 */
public class Constant {
    /** Root URL of the Douban movie site. */
    public static final String MAIN_URL = "https://movie.douban.com";

    /** Prefix shared by every subject URL; derived from {@link #MAIN_URL}. */
    public static final String BASE_URL = MAIN_URL + "/subject";

    /** Upper bound exposed to callers as {@code MAX_COUNT}. */
    public static final Integer MAX_COUNT = Integer.valueOf(10000);

    /** Default length limit exposed to callers as {@code DEFAULT_DATA_LENGTH}. */
    public static final Integer DEFAULT_DATA_LENGTH = Integer.valueOf(255);
}
13 | */ 14 | @Repository 15 | public interface MovieDAO extends JpaRepository{ 16 | Movie findBySubjectId(String subjectId); 17 | } 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | > 借鉴 @杰锅锅(Jackie)大神 [《Java豆瓣电影爬虫——小爬虫成长记(附源码)》](http://www.cnblogs.com/bigdataZJ/p/doubanmovie3.html) 中的爬取思路和代码 [JewelCrawler](https://github.com/DMinerJackie/JewelCrawler) ,采用自己熟悉的Spring-data-jpa进行改写,简化数据库操作和事务。 2 | 3 | 爬取核心代码200行左右。 4 | 1. persistence.xml 5 | - 配置数据库连接 6 | - 根据实体 POJO 类自动建表。如不需要改为 false. 7 | ``` 8 | 9 | ``` 10 | 11 | 2. Entrance 12 | - 入口文件 13 | 14 | 3. log4j.properties 15 | - 日志文件默认存放在F盘 16 | ``` 17 | log4j.appender.logfile.File=F:/movieCrawler.log 18 | ``` 19 | 20 | 4. 思路 21 | - 从一个电影详情页面,抓取其中的电影详情。并把该页面中链接到其他电影页的URL记录下来用于抓取 -------------------------------------------------------------------------------- /src/main/resources/persistence.xml: -------------------------------------------------------------------------------- 1 | 2 | 6 | 7 | org.hibernate.jpa.HibernatePersistenceProvider 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /src/main/java/tango/crawler/entity/Record.java: -------------------------------------------------------------------------------- 1 | package tango.crawler.entity; 2 | 3 | import javax.persistence.*; 4 | 5 | /** 6 | * Created by TANQX3 on 2017-3-22. 
7 | */ 8 | @Entity 9 | @Table(name = "record") 10 | public class Record { 11 | public static final String TYPE_MOVIE = "Movie"; 12 | public static final String TYPE_COMMENT = "Comment"; 13 | public static final String TYPE_OTHER = "Other"; 14 | 15 | public static final int STATUS_UNCRAWLED = 0; 16 | public static final int STATUS_CRAWLED = 1; 17 | public static final int STATUS_ERROR = 2; 18 | 19 | @Id 20 | @GeneratedValue(strategy = GenerationType.IDENTITY) 21 | private Integer id; 22 | 23 | private String url; 24 | private Integer crawled; 25 | private String type; 26 | 27 | public Integer getId() { 28 | return id; 29 | } 30 | 31 | public void setId(Integer id) { 32 | this.id = id; 33 | } 34 | 35 | public String getUrl() { 36 | return url; 37 | } 38 | 39 | public void setUrl(String url) { 40 | this.url = url; 41 | } 42 | 43 | public Integer getCrawled() { 44 | return crawled; 45 | } 46 | 47 | public void setCrawled(Integer crawled) { 48 | this.crawled = crawled; 49 | } 50 | 51 | public String getType() { 52 | return type; 53 | } 54 | 55 | public void setType(String type) { 56 | this.type = type; 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/java/tango/crawler/Entrance.java: -------------------------------------------------------------------------------- 1 | package tango.crawler; 2 | 3 | import org.springframework.beans.factory.annotation.Autowired; 4 | import org.springframework.context.ApplicationContext; 5 | import org.springframework.context.support.ClassPathXmlApplicationContext; 6 | import org.springframework.stereotype.Service; 7 | import tango.crawler.service.CrawlerService; 8 | import tango.crawler.util.Constant; 9 | 10 | import java.util.Random; 11 | 12 | @Service 13 | public class Entrance { 14 | @Autowired 15 | private CrawlerService crawlerService; 16 | 17 | public static void main(String[] args) { 18 | ApplicationContext ac = new ClassPathXmlApplicationContext("app-context.xml"); 19 | 
System.out.println("Context loaded."); 20 | 21 | Entrance entrance = ac.getBean(Entrance.class); 22 | 23 | System.out.println("Start"); 24 | 25 | //因为想「每次抓取」作为「一个事务」,故循环放到service外面; 26 | for (int i = 0; i < Constant.MAX_COUNT; i++) { 27 | try { 28 | sleepAwhile(); 29 | entrance.crawlerService.crawlOnePage(); 30 | } catch (Exception e) { 31 | e.printStackTrace(); 32 | } 33 | } 34 | 35 | System.out.println("End"); 36 | } 37 | 38 | public static void sleepAwhile() throws InterruptedException { 39 | long a = (new Random().nextInt(2) + 1) * 1000; 40 | System.out.println("Sleep " + a + " ms"); 41 | Thread.sleep(a); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/resources/app-context.xml: -------------------------------------------------------------------------------- 1 | 2 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /src/main/java/tango/crawler/util/CommonUtil.java: -------------------------------------------------------------------------------- 1 | package tango.crawler.util; 2 | 3 | import tango.crawler.entity.Record; 4 | 5 | import java.util.regex.Matcher; 6 | import java.util.regex.Pattern; 7 | 8 | /** 9 | * Created by TANQX3 on 2017-3-25. 
10 | */ 11 | public class CommonUtil { 12 | public static final String USEFUL_RE = "https://movie.douban.com/subject/\\d+(.*)"; 13 | 14 | //public static final String COMMENT_RE = "https://movie.douban.com/subject/\\d{8}/comments"; 15 | 16 | public static String whichType(String url) { 17 | Pattern p = Pattern.compile(USEFUL_RE); 18 | Matcher m = p.matcher(url); 19 | if (m.find()) { 20 | String suffix = m.group(1); 21 | if (suffix.equals("") || suffix.equals("/") || suffix.startsWith("/?from=")) { 22 | return Record.TYPE_MOVIE; 23 | } 24 | if (suffix.startsWith("/comments")) { 25 | return Record.TYPE_COMMENT; 26 | } 27 | return Record.TYPE_OTHER; 28 | } 29 | return null; 30 | } 31 | 32 | public static String delUtf8mb4Chars(String param) { 33 | if (param == null) { 34 | return null; 35 | } 36 | return param.replaceAll("[\\ud800\\udc00-\\udbff\\udfff\\ud800-\\udfff]", ""); 37 | } 38 | 39 | public static String extractSubjectId(String url) { 40 | return url.split("/")[4]; 41 | } 42 | 43 | public static String truncateString(String origin) { 44 | return truncateString(origin,Constant.DEFAULT_DATA_LENGTH); 45 | } 46 | 47 | public static String truncateString(String origin, int length) { 48 | if (origin.length() <= length) { 49 | return origin; 50 | }else { 51 | return origin.substring(0,length); 52 | } 53 | 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/main/java/tango/crawler/entity/Comment.java: -------------------------------------------------------------------------------- 1 | package tango.crawler.entity; 2 | 3 | import org.springframework.stereotype.Component; 4 | 5 | import javax.persistence.*; 6 | 7 | /** 8 | * Created by Jackie on 2016/9/24 0024. 
9 | */ 10 | @Entity 11 | @Table(name = "comment") 12 | public class Comment { 13 | public static final Integer ContentLength = 2000; 14 | 15 | @Id 16 | @GeneratedValue(strategy = GenerationType.IDENTITY) 17 | private Integer id; 18 | 19 | @Column(name = "content", length = 2000) 20 | private String content; 21 | private String author; 22 | private String authorImg; 23 | private Integer vote; 24 | private String movie; 25 | private Integer movieId; 26 | private String subjectId; 27 | private Integer recordId; 28 | 29 | public String getSubjectId() { 30 | return subjectId; 31 | } 32 | 33 | public void setSubjectId(String subjectId) { 34 | this.subjectId = subjectId; 35 | } 36 | 37 | public Integer getRecordId() { 38 | return recordId; 39 | } 40 | 41 | public void setRecordId(Integer recordId) { 42 | this.recordId = recordId; 43 | } 44 | 45 | public Integer getId() { 46 | return id; 47 | } 48 | 49 | public void setId(Integer id) { 50 | this.id = id; 51 | } 52 | 53 | public String getContent() { 54 | return content; 55 | } 56 | 57 | public void setContent(String content) { 58 | this.content = content; 59 | } 60 | 61 | public String getAuthor() { 62 | return author; 63 | } 64 | 65 | public void setAuthor(String author) { 66 | this.author = author; 67 | } 68 | 69 | public String getAuthorImg() { 70 | return authorImg; 71 | } 72 | 73 | public void setAuthorImg(String authorImg) { 74 | this.authorImg = authorImg; 75 | } 76 | 77 | public Integer getVote() { 78 | return vote; 79 | } 80 | 81 | public void setVote(Integer vote) { 82 | this.vote = vote; 83 | } 84 | 85 | public String getMovie() { 86 | return movie; 87 | } 88 | 89 | public void setMovie(String movie) { 90 | this.movie = movie; 91 | } 92 | 93 | public Integer getMovieId() { 94 | return movieId; 95 | } 96 | 97 | public void setMovieId(Integer movieId) { 98 | this.movieId = movieId; 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /pom.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | tango 8 | crawler 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 4.3.2.RELEASE 13 | 14 | 15 | 16 | 17 | org.slf4j 18 | slf4j-log4j12 19 | 1.7.7 20 | 21 | 22 | log4j 23 | log4j 24 | 1.2.17 25 | 26 | 27 | org.springframework 28 | spring-core 29 | ${org.springframework.version} 30 | 31 | 32 | org.springframework 33 | spring-beans 34 | ${org.springframework.version} 35 | 36 | 37 | org.springframework 38 | spring-context 39 | ${org.springframework.version} 40 | 41 | 42 | org.hibernate 43 | hibernate-entitymanager 44 | 4.3.10.Final 45 | 46 | 47 | org.springframework.data 48 | spring-data-jpa 49 | 1.7.0.RELEASE 50 | 51 | 52 | mysql 53 | mysql-connector-java 54 | 5.1.36 55 | 56 | 57 | 58 | 59 | crawler 60 | src/main/java 61 | 62 | 63 | src/main/resources 64 | 65 | 66 | 67 | 68 | org.apache.maven.plugins 69 | maven-compiler-plugin 70 | 71 | 1.8 72 | 1.8 73 | 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /src/main/java/tango/crawler/entity/Movie.java: -------------------------------------------------------------------------------- 1 | package tango.crawler.entity; 2 | 3 | import javax.persistence.*; 4 | 5 | /** 6 | * Created by TANQX3 on 2017-3-21. 
7 | */ 8 | @Entity 9 | @Table(name = "movie") 10 | public class Movie { 11 | @Id 12 | @GeneratedValue(strategy = GenerationType.IDENTITY) 13 | private Integer id; 14 | 15 | private String name; 16 | private String director; 17 | private String scenarist; 18 | 19 | @Column(name = "actors", length = 1000) 20 | private String actors; 21 | 22 | private String type; 23 | private String country; 24 | private String language; 25 | 26 | @Column(name = "release_date", length = 500) 27 | private String releaseDate; 28 | 29 | private String runtime; 30 | private String ratingNum; 31 | private String tags; 32 | private Integer recordId; 33 | private String url; 34 | private String subjectId; 35 | 36 | @Column(name = "summary", length = 2000) 37 | private String summary; 38 | 39 | public String getSummary() { 40 | return summary; 41 | } 42 | 43 | public void setSummary(String summary) { 44 | this.summary = summary; 45 | } 46 | 47 | public String getSubjectId() { 48 | return subjectId; 49 | } 50 | 51 | public void setSubjectId(String subjectId) { 52 | this.subjectId = subjectId; 53 | } 54 | 55 | public String getName() { 56 | return name; 57 | } 58 | 59 | public void setName(String name) { 60 | this.name = name; 61 | } 62 | 63 | public Integer getId() { 64 | return id; 65 | } 66 | 67 | public void setId(Integer id) { 68 | this.id = id; 69 | } 70 | 71 | public String getDirector() { 72 | return director; 73 | } 74 | 75 | public void setDirector(String director) { 76 | this.director = director; 77 | } 78 | 79 | public String getScenarist() { 80 | return scenarist; 81 | } 82 | 83 | public void setScenarist(String scenarist) { 84 | this.scenarist = scenarist; 85 | } 86 | 87 | public String getActors() { 88 | return actors; 89 | } 90 | 91 | public void setActors(String actors) { 92 | this.actors = actors; 93 | } 94 | 95 | public String getType() { 96 | return type; 97 | } 98 | 99 | public void setType(String type) { 100 | this.type = type; 101 | } 102 | 103 | public String 
getCountry() { 104 | return country; 105 | } 106 | 107 | public void setCountry(String country) { 108 | this.country = country; 109 | } 110 | 111 | public String getLanguage() { 112 | return language; 113 | } 114 | 115 | public void setLanguage(String language) { 116 | this.language = language; 117 | } 118 | 119 | public String getReleaseDate() { 120 | return releaseDate; 121 | } 122 | 123 | public void setReleaseDate(String releaseDate) { 124 | this.releaseDate = releaseDate; 125 | } 126 | 127 | public String getRuntime() { 128 | return runtime; 129 | } 130 | 131 | public void setRuntime(String runtime) { 132 | this.runtime = runtime; 133 | } 134 | 135 | public String getRatingNum() { 136 | return ratingNum; 137 | } 138 | 139 | public void setRatingNum(String ratingNum) { 140 | this.ratingNum = ratingNum; 141 | } 142 | 143 | public String getTags() { 144 | return tags; 145 | } 146 | 147 | public void setTags(String tags) { 148 | this.tags = tags; 149 | } 150 | 151 | public Integer getRecordId() { 152 | return recordId; 153 | } 154 | 155 | public void setRecordId(Integer recordId) { 156 | this.recordId = recordId; 157 | } 158 | 159 | public String getUrl() { 160 | return url; 161 | } 162 | 163 | public void setUrl(String url) { 164 | this.url = url; 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /src/main/java/tango/crawler/service/CrawlerService.java: -------------------------------------------------------------------------------- 1 | package tango.crawler.service; 2 | 3 | import org.jsoup.HttpStatusException; 4 | import org.jsoup.Jsoup; 5 | import org.jsoup.nodes.Document; 6 | import org.jsoup.nodes.Element; 7 | import org.jsoup.select.Elements; 8 | import org.slf4j.Logger; 9 | import org.slf4j.LoggerFactory; 10 | import org.springframework.beans.factory.annotation.Autowired; 11 | import org.springframework.orm.jpa.JpaSystemException; 12 | import org.springframework.stereotype.Service; 13 | import 
org.springframework.transaction.annotation.Propagation; 14 | import org.springframework.transaction.annotation.Transactional; 15 | import tango.crawler.dao.CommentDAO; 16 | import tango.crawler.dao.MovieDAO; 17 | import tango.crawler.dao.RecordDAO; 18 | import tango.crawler.entity.Comment; 19 | import tango.crawler.entity.Movie; 20 | import tango.crawler.entity.Record; 21 | import tango.crawler.util.CommonUtil; 22 | import tango.crawler.util.Constant; 23 | 24 | import java.util.regex.Matcher; 25 | import java.util.regex.Pattern; 26 | 27 | /** 28 | * Created by TANQX3 on 2017-3-21. 29 | */ 30 | @Service 31 | public class CrawlerService { 32 | @Autowired 33 | private MovieDAO movieDAO; 34 | @Autowired 35 | private CommentDAO commentDAO; 36 | @Autowired 37 | private RecordDAO recordDAO; 38 | 39 | private Logger LOG = LoggerFactory.getLogger("CommonLog"); 40 | 41 | @Transactional 42 | public void crawlOnePage() throws Exception { 43 | Record record = getOneRecordToCrawl(); 44 | String url = record.getUrl(); 45 | LOG.info("Crawling url:" + url); 46 | try { 47 | Document doc = Jsoup.connect(url).get(); 48 | 49 | //1、抓取该page里是有用href的地址存到record里 50 | crawlValuableRecordInPage(doc); 51 | 52 | //2、抓取该page里的电影或影评 53 | if (Record.TYPE_MOVIE.equals(record.getType())) { 54 | crawlMovieInfo(doc, record); 55 | } else if (Record.TYPE_COMMENT.equals(record.getType())) { 56 | crawlCommentInfo(doc, record); 57 | } 58 | record.setCrawled(Record.STATUS_CRAWLED); 59 | 60 | } catch (HttpStatusException e) { 61 | record.setCrawled(Record.STATUS_ERROR); 62 | LOG.info(e.getMessage()); 63 | } catch(Exception e){ 64 | newTxnToSave(record); 65 | LOG.info(e.getMessage()); 66 | throw e; 67 | } 68 | recordDAO.save(record); 69 | } 70 | 71 | @Transactional(propagation = Propagation.REQUIRES_NEW) 72 | private void newTxnToSave(Record r){ 73 | r.setCrawled(Record.STATUS_ERROR); 74 | recordDAO.save(r); 75 | } 76 | 77 | private Record getOneRecordToCrawl() { 78 | Record r = 
recordDAO.getFirstByCrawled(Record.STATUS_UNCRAWLED); 79 | if (r == null) { 80 | r = new Record(); 81 | r.setUrl(Constant.MAIN_URL); 82 | r.setCrawled(Record.STATUS_UNCRAWLED); 83 | r.setType(Record.TYPE_OTHER); 84 | recordDAO.save(r); 85 | } 86 | return r; 87 | } 88 | 89 | @Transactional 90 | private void crawlValuableRecordInPage(Document doc) { 91 | Elements hrefs = doc.select("a[href^='" + Constant.BASE_URL + "']"); 92 | for (Element e : hrefs) { 93 | String href = e.attr("href").trim(); 94 | String type = CommonUtil.whichType(href); 95 | if (type == null || Record.TYPE_OTHER.equals(type) || recordDAO.getByUrl(href) != null) { 96 | continue; 97 | } 98 | Record record = new Record(); 99 | record.setUrl(href); 100 | record.setCrawled(Record.STATUS_UNCRAWLED); 101 | record.setType(type); 102 | recordDAO.save(record); 103 | } 104 | } 105 | 106 | @Transactional 107 | private void crawlMovieInfo(Document doc, Record record) { 108 | Element infoDiv = doc.getElementById("info"); 109 | if (infoDiv == null) { 110 | return; 111 | } 112 | String subjectId = CommonUtil.extractSubjectId(record.getUrl()); 113 | if (movieDAO.findBySubjectId(subjectId) != null) { 114 | return; 115 | } 116 | 117 | Elements subInfos = infoDiv.children(); 118 | Movie movie = new Movie(); 119 | for (Element subInfo : subInfos) { 120 | if (subInfo.childNodeSize() > 0) { 121 | String key = subInfo.getElementsByAttributeValue("class", "pl").text(); 122 | if (key == null || "".equals(key)) { 123 | continue; 124 | } 125 | if ("导演".equals(key)) { 126 | String director = subInfo.getElementsByAttributeValue("class", "attrs").text(); 127 | movie.setDirector(CommonUtil.truncateString(director)); 128 | } else if ("编剧".equals(key)) { 129 | movie.setScenarist(CommonUtil.truncateString(subInfo.getElementsByAttributeValue("class", "attrs").text())); 130 | } else if ("主演".equals(key)) { 131 | String actors = subInfo.getElementsByAttributeValue("class", "attrs").text(); 132 | 
movie.setActors(CommonUtil.truncateString(actors,1000)); 133 | } 134 | } 135 | } 136 | Pattern pattern = Pattern.compile("制片国家/地区:(.*?)\n"); 137 | Matcher matcher = pattern.matcher(infoDiv.html()); 138 | if (matcher.find()) { 139 | movie.setCountry(matcher.group(1).trim()); 140 | } 141 | pattern = pattern.compile("语言:(.*?)\n"); 142 | matcher = pattern.matcher(infoDiv.html()); 143 | if (matcher.find()) { 144 | movie.setLanguage(matcher.group(1).trim()); 145 | } 146 | movie.setType(infoDiv.getElementsByAttributeValue("property", "v:genre").text()); 147 | movie.setReleaseDate(infoDiv.getElementsByAttributeValue("property", "v:initialReleaseDate").text()); 148 | movie.setRuntime(infoDiv.getElementsByAttributeValue("property", "v:runtime").text()); 149 | movie.setTags(doc.getElementsByClass("tags-body").text()); 150 | movie.setName(doc.getElementsByAttributeValue("property", "v:itemreviewed").text()); 151 | movie.setRatingNum(doc.getElementsByAttributeValue("property", "v:average").text()); 152 | movie.setSubjectId(subjectId); 153 | movie.setRecordId(record.getId()); 154 | movie.setSummary(doc.getElementsByAttributeValue("property", "v:summary").text().trim()); 155 | 156 | LOG.info("Movie :《" + movie.getName() + "》 Points: " + movie.getRatingNum() + "\n" + "Summary:" + movie.getSummary()); 157 | movieDAO.save(movie); 158 | } 159 | 160 | @Transactional 161 | private void crawlCommentInfo(Document doc, Record record) { 162 | Element el = doc.getElementById("comments"); 163 | if (el != null) { 164 | String[] movies = doc.getElementsByTag("h1").text().replace(" ", "").split("短评"); 165 | String movieName = movies[0]; 166 | 167 | Elements items = el.select(".comment-item"); 168 | for (Element item : items) { 169 | if (item.getElementsByClass("fold-bd").size() < 1 && item.children().get(1).getElementsByTag("p").size() > 0) { 170 | // to make sure the current item is the comment item rather than other info item && 检测fold-bd是查看是否有折叠,如果是折叠的评论则有fold-bd,折叠评论是指账号有异常的 171 | Comment 
comm = new Comment(); 172 | comm.setMovie(movieName); 173 | //对评论内容去除4字节utf-8字符(包括Emoji表情),因为mysql utf-8编码不支持 174 | //(另一方式:mysql 改用 utf8mb4) 175 | String content = item.children().get(1).getElementsByTag("p").text().trim(); 176 | content = CommonUtil.delUtf8mb4Chars(CommonUtil.truncateString(content,Comment.ContentLength)); 177 | comm.setContent(content);//use "comment.children().get(1).text()" can get all commentInfo like "1819 有用 桃桃淘电影 2016-10-29 即便评分再高也完全喜欢不来。我们还是太热衷主题与意义了,以至于忽视了传递主题的方式与合理性。影片为了所谓的人性深度,而刻意设计剧情和人物转折,忽视基本的人物行为轨迹,都非常让人不舒服。喜欢有深度的电影,但希望能以更巧妙的方式讲出来,而不该是现在这样。以及形式上,这不就是舞台搬演么" 178 | 179 | comm.setVote(Integer.parseInt(item.getElementsByAttributeValue("class", "votes").text())); 180 | String author = item.getElementsByAttribute("href").get(2).text(); 181 | comm.setAuthor(CommonUtil.delUtf8mb4Chars(author)); 182 | comm.setAuthorImg(item.getElementsByAttribute("href").get(2).attr("href")); 183 | comm.setRecordId(record.getId()); 184 | comm.setSubjectId(CommonUtil.extractSubjectId(record.getUrl())); 185 | 186 | LOG.info("Comment for 《" + movieName + "》:" + comm.getContent()); 187 | commentDAO.save(comm); 188 | } 189 | } 190 | } 191 | } 192 | } 193 | --------------------------------------------------------------------------------