├── info.log ├── error.log ├── bin ├── start.sh ├── PTTCrawler.jar ├── start.bat └── log4j.properties ├── target └── classes │ ├── crawler │ ├── base │ │ ├── Entry.class │ │ ├── Post.class │ │ ├── Reply.class │ │ ├── PostAnalysiser.class │ │ └── Reply$ReplyType.class │ ├── main │ │ └── Main.class │ └── client │ │ ├── PTTClient.class │ │ ├── PTTClient$1.class │ │ ├── PTTClient$2.class │ │ ├── PTTClient$3.class │ │ ├── PTTClient$Screen.class │ │ └── PTTClient$Protocol.class │ ├── META-INF │ ├── MANIFEST.MF │ └── maven │ │ └── PTTCrawler │ │ └── PTTCrawler │ │ ├── pom.properties │ │ └── pom.xml │ ├── quartz.properties │ └── log4j.properties ├── .gitignore ├── src └── main │ └── java │ ├── quartz.properties │ ├── crawler │ ├── base │ │ ├── Entry.java │ │ ├── Reply.java │ │ ├── Post.java │ │ └── PostAnalysiser.java │ ├── main │ │ └── Main.java │ └── client │ │ └── PTTClient.java │ └── log4j.properties ├── .gitattributes ├── .project ├── README.md ├── log4j.properties └── pom.xml /info.log: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /error.log: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bin/start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | java -jar PTTCrawler.jar -u XXX -p YYY -b Gossiping 3 | -------------------------------------------------------------------------------- /bin/PTTCrawler.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/g21589/PTTCrawler/HEAD/bin/PTTCrawler.jar -------------------------------------------------------------------------------- /bin/start.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | java -jar PTTCrawler.jar -u 
XXX -p YYY -b Gossiping 3 | pause 4 | -------------------------------------------------------------------------------- /target/classes/crawler/base/Entry.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/g21589/PTTCrawler/HEAD/target/classes/crawler/base/Entry.class -------------------------------------------------------------------------------- /target/classes/crawler/base/Post.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/g21589/PTTCrawler/HEAD/target/classes/crawler/base/Post.class -------------------------------------------------------------------------------- /target/classes/crawler/base/Reply.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/g21589/PTTCrawler/HEAD/target/classes/crawler/base/Reply.class -------------------------------------------------------------------------------- /target/classes/crawler/main/Main.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/g21589/PTTCrawler/HEAD/target/classes/crawler/main/Main.class -------------------------------------------------------------------------------- /target/classes/crawler/client/PTTClient.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/g21589/PTTCrawler/HEAD/target/classes/crawler/client/PTTClient.class -------------------------------------------------------------------------------- /target/classes/crawler/client/PTTClient$1.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/g21589/PTTCrawler/HEAD/target/classes/crawler/client/PTTClient$1.class -------------------------------------------------------------------------------- /target/classes/crawler/client/PTTClient$2.class: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/g21589/PTTCrawler/HEAD/target/classes/crawler/client/PTTClient$2.class -------------------------------------------------------------------------------- /target/classes/crawler/client/PTTClient$3.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/g21589/PTTCrawler/HEAD/target/classes/crawler/client/PTTClient$3.class -------------------------------------------------------------------------------- /target/classes/crawler/base/PostAnalysiser.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/g21589/PTTCrawler/HEAD/target/classes/crawler/base/PostAnalysiser.class -------------------------------------------------------------------------------- /target/classes/crawler/base/Reply$ReplyType.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/g21589/PTTCrawler/HEAD/target/classes/crawler/base/Reply$ReplyType.class -------------------------------------------------------------------------------- /target/classes/META-INF/MANIFEST.MF: -------------------------------------------------------------------------------- 1 | Manifest-Version: 1.0 2 | Built-By: User 3 | Build-Jdk: 1.8.0_65 4 | Created-By: Maven Integration for Eclipse 5 | 6 | -------------------------------------------------------------------------------- /target/classes/crawler/client/PTTClient$Screen.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/g21589/PTTCrawler/HEAD/target/classes/crawler/client/PTTClient$Screen.class -------------------------------------------------------------------------------- /target/classes/crawler/client/PTTClient$Protocol.class: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/g21589/PTTCrawler/HEAD/target/classes/crawler/client/PTTClient$Protocol.class -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .classpath 2 | .settings/org.eclipse.jdt.core.prefs 3 | *.prefs 4 | *.log 5 | /target/ 6 | /Result/ 7 | /Results/ 8 | /bin/Result/ 9 | /bin/Results/ 10 | -------------------------------------------------------------------------------- /src/main/java/quartz.properties: -------------------------------------------------------------------------------- 1 | # Configure ThreadPool 2 | org.quartz.threadPool.class = org.quartz.simpl.SimpleThreadPool 3 | org.quartz.threadPool.threadCount = 5 4 | org.quartz.threadPool.threadPriority = 4 5 | 6 | org.quartz.scheduler.skipUpdateCheck = true 7 | -------------------------------------------------------------------------------- /target/classes/quartz.properties: -------------------------------------------------------------------------------- 1 | # Configure ThreadPool 2 | org.quartz.threadPool.class = org.quartz.simpl.SimpleThreadPool 3 | org.quartz.threadPool.threadCount = 5 4 | org.quartz.threadPool.threadPriority = 4 5 | 6 | org.quartz.scheduler.skipUpdateCheck = true 7 | -------------------------------------------------------------------------------- /target/classes/META-INF/maven/PTTCrawler/PTTCrawler/pom.properties: -------------------------------------------------------------------------------- 1 | #Generated by Maven Integration for Eclipse 2 | #Fri Mar 04 11:27:12 CST 2016 3 | version=0.0.1-SNAPSHOT 4 | groupId=PTTCrawler 5 | m2e.projectName=PTTCrawler 6 | m2e.projectLocation=D\:\\Projects\\PTTCrawler 7 | artifactId=PTTCrawler 8 | -------------------------------------------------------------------------------- /.gitattributes: 
/**
 * Plain data holder for one row of a board's post list.
 *
 * <p>All fields are public, mutable strings and default to {@code null};
 * the class performs no validation. Two renderings are offered:
 * {@link #toString()} (compact) and {@link #toFullString()} (every field
 * except {@code url}).
 */
public class Entry {

    /** Post identifier; used by callers as part of the output file name. */
    public String id;
    /** Value of the number column of the list row, kept as text. */
    public String number;
    /** Status marker of the row. */
    public String status;
    /** Karma column of the row, kept as text. */
    public String karma;
    /** Date column of the row, kept as text. */
    public String date;
    /** Author column of the row. */
    public String author;
    /** URL of the post, if known. */
    public String url;
    /** Title of the post. */
    public String title;

    /** Creates an entry with every field left {@code null}. */
    public Entry() {
        // Intentionally empty: fields are filled in by the caller.
    }

    /**
     * Creates a fully populated entry.
     *
     * <p>Note the parameter order: {@code title} comes before {@code url},
     * which is the reverse of the field declaration order.
     *
     * @param id     post identifier
     * @param number number column text
     * @param status status marker
     * @param karma  karma column text
     * @param date   date column text
     * @param author author column text
     * @param title  post title
     * @param url    post URL
     */
    public Entry(String id, String number, String status, String karma,
                 String date, String author, String title, String url) {
        this.id = id;
        this.number = number;
        this.status = status;
        this.karma = karma;
        this.date = date;
        this.author = author;
        this.title = title;
        this.url = url;
    }

    /**
     * Renders every field except {@code url} as one labelled, column-aligned line.
     *
     * @return the labelled representation; {@code null} fields print as "null"
     */
    public String toFullString() {
        return String.format(
                "ID: %8s Num: %-5s Status: %1s Karma: %-2s Date: %5s Author: %-13s Title: %s",
                id, number, status, karma, date, author, title);
    }

    /**
     * Renders the compact form: id, number, date, author and title.
     *
     * @return the compact representation; {@code null} fields print as "null"
     */
    @Override
    public String toString() {
        return String.format("#%8s %5s %5s %-13s %s", id, number, date, author, title);
    }

}
33 | * Support multi boards list 34 | 35 | License 36 | ---- 37 | 38 | MIT 39 | 40 | [license-image]: http://img.shields.io/badge/license-MIT-blue.svg?style=flat 41 | [license-url]: LICENSE 42 | -------------------------------------------------------------------------------- /log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger = INFO, stdout, D, E 2 | 3 | # stdout 4 | log4j.appender.stdout = org.apache.log4j.ConsoleAppender 5 | log4j.appender.stdout.Threshold = INFO 6 | log4j.appender.stdout.layout = org.apache.log4j.PatternLayout 7 | log4j.appender.stdout.layout.ConversionPattern = %d{yyyy-MM-dd HH:mm:ss} [%-5p] %m%n 8 | #log4j.appender.stdout.layout.ConversionPattern = %d{yyyy-MM-dd HH:mm:ss} %-5p [%t] (%F:%L) %c - %m%n 9 | 10 | # INFO log file output 11 | log4j.appender.D = org.apache.log4j.RollingFileAppender 12 | log4j.appender.D.File = info.log 13 | log4j.appender.D.MaxFileSize = 100MB 14 | log4j.appender.D.MaxBackupIndex = 5 15 | log4j.appender.D.encoding = UTF-8 16 | log4j.appender.D.Threshold = INFO 17 | log4j.appender.D.layout = org.apache.log4j.PatternLayout 18 | log4j.appender.D.layout.ConversionPattern = %d{yyyy-MM-dd HH:mm:ss} %-5p [%t] %c - %m%n 19 | 20 | # Error log file output 21 | log4j.appender.E = org.apache.log4j.RollingFileAppender 22 | log4j.appender.E.File = error.log 23 | log4j.appender.E.MaxFileSize = 100MB 24 | log4j.appender.E.MaxBackupIndex = 5 25 | log4j.appender.E.encoding = UTF-8 26 | log4j.appender.E.Threshold = ERROR 27 | log4j.appender.E.layout = org.apache.log4j.PatternLayout 28 | log4j.appender.E.layout.ConversionPattern = %d{yyyy-MM-dd HH:mm:ss} %-5p [%t] %c - %m%n 29 | -------------------------------------------------------------------------------- /bin/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger = INFO, stdout, D, E 2 | 3 | # stdout 4 | log4j.appender.stdout = 
org.apache.log4j.ConsoleAppender 5 | log4j.appender.stdout.Threshold = INFO 6 | # For Windows 7 | #log4j.appender.stdout.encoding = Big5 8 | # For Unix like system 9 | log4j.appender.stdout.encoding = UTF-8 10 | log4j.appender.stdout.layout = org.apache.log4j.PatternLayout 11 | log4j.appender.stdout.layout.ConversionPattern = %d{yyyy-MM-dd HH:mm:ss} [%-5p] %m%n 12 | 13 | # INFO log file output 14 | log4j.appender.D = org.apache.log4j.RollingFileAppender 15 | log4j.appender.D.File = info.log 16 | log4j.appender.D.MaxFileSize = 100MB 17 | log4j.appender.D.MaxBackupIndex = 5 18 | log4j.appender.D.encoding = UTF-8 19 | log4j.appender.D.Threshold = INFO 20 | log4j.appender.D.layout = org.apache.log4j.PatternLayout 21 | log4j.appender.D.layout.ConversionPattern = %d{yyyy-MM-dd HH:mm:ss} [%-5p] %m%n 22 | 23 | # Error log file output 24 | log4j.appender.E = org.apache.log4j.RollingFileAppender 25 | log4j.appender.E.File = error.log 26 | log4j.appender.E.MaxFileSize = 100MB 27 | log4j.appender.E.MaxBackupIndex = 5 28 | log4j.appender.E.encoding = UTF-8 29 | log4j.appender.E.Threshold = ERROR 30 | log4j.appender.E.layout = org.apache.log4j.PatternLayout 31 | log4j.appender.E.layout.ConversionPattern = %d{yyyy-MM-dd HH:mm:ss} %-5p [%t] %c - %m%n 32 | -------------------------------------------------------------------------------- /src/main/java/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger = INFO, stdout, D, E 2 | 3 | # Turn off quartz log 4 | log4j.logger.org.quartz = OFF 5 | 6 | # stdout 7 | log4j.appender.stdout = org.apache.log4j.ConsoleAppender 8 | log4j.appender.stdout.Threshold = INFO 9 | log4j.appender.stdout.layout = org.apache.log4j.PatternLayout 10 | #log4j.appender.stdout.layout.ConversionPattern = %d{yyyy-MM-dd HH:mm:ss} [%-5p] %m%n 11 | log4j.appender.stdout.layout.ConversionPattern = %d{yyyy-MM-dd HH:mm:ss} %-5p [%t] (%F:%L) %c - %m%n 12 | 13 | # INFO log file output 14 | log4j.appender.D 
= org.apache.log4j.RollingFileAppender 15 | log4j.appender.D.File = info.log 16 | log4j.appender.D.MaxFileSize = 100MB 17 | log4j.appender.D.MaxBackupIndex = 5 18 | log4j.appender.D.encoding = UTF-8 19 | log4j.appender.D.Threshold = INFO 20 | log4j.appender.D.layout = org.apache.log4j.PatternLayout 21 | log4j.appender.D.layout.ConversionPattern = %d{yyyy-MM-dd HH:mm:ss} %-5p [%t] %c - %m%n 22 | 23 | # Error log file output 24 | log4j.appender.E = org.apache.log4j.RollingFileAppender 25 | log4j.appender.E.File = error.log 26 | log4j.appender.E.MaxFileSize = 100MB 27 | log4j.appender.E.MaxBackupIndex = 5 28 | log4j.appender.E.encoding = UTF-8 29 | log4j.appender.E.Threshold = ERROR 30 | log4j.appender.E.layout = org.apache.log4j.PatternLayout 31 | log4j.appender.E.layout.ConversionPattern = %d{yyyy-MM-dd HH:mm:ss} %-5p [%t] %c - %m%n 32 | -------------------------------------------------------------------------------- /target/classes/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger = INFO, stdout, D, E 2 | 3 | # Turn off quartz log 4 | log4j.logger.org.quartz = OFF 5 | 6 | # stdout 7 | log4j.appender.stdout = org.apache.log4j.ConsoleAppender 8 | log4j.appender.stdout.Threshold = INFO 9 | log4j.appender.stdout.layout = org.apache.log4j.PatternLayout 10 | #log4j.appender.stdout.layout.ConversionPattern = %d{yyyy-MM-dd HH:mm:ss} [%-5p] %m%n 11 | log4j.appender.stdout.layout.ConversionPattern = %d{yyyy-MM-dd HH:mm:ss} %-5p [%t] (%F:%L) %c - %m%n 12 | 13 | # INFO log file output 14 | log4j.appender.D = org.apache.log4j.RollingFileAppender 15 | log4j.appender.D.File = info.log 16 | log4j.appender.D.MaxFileSize = 100MB 17 | log4j.appender.D.MaxBackupIndex = 5 18 | log4j.appender.D.encoding = UTF-8 19 | log4j.appender.D.Threshold = INFO 20 | log4j.appender.D.layout = org.apache.log4j.PatternLayout 21 | log4j.appender.D.layout.ConversionPattern = %d{yyyy-MM-dd HH:mm:ss} %-5p [%t] %c - %m%n 22 | 23 | # 
Error log file output 24 | log4j.appender.E = org.apache.log4j.RollingFileAppender 25 | log4j.appender.E.File = error.log 26 | log4j.appender.E.MaxFileSize = 100MB 27 | log4j.appender.E.MaxBackupIndex = 5 28 | log4j.appender.E.encoding = UTF-8 29 | log4j.appender.E.Threshold = ERROR 30 | log4j.appender.E.layout = org.apache.log4j.PatternLayout 31 | log4j.appender.E.layout.ConversionPattern = %d{yyyy-MM-dd HH:mm:ss} %-5p [%t] %c - %m%n 32 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | PTTCrawler 4 | PTTCrawler 5 | 0.0.1-SNAPSHOT 6 | 7 | 8 | com.jcraft 9 | jsch 10 | 0.1.53 11 | 12 | 13 | commons-net 14 | commons-net 15 | 3.3 16 | 17 | 18 | org.slf4j 19 | slf4j-api 20 | 1.7.14 21 | 22 | 23 | org.slf4j 24 | slf4j-log4j12 25 | 1.7.14 26 | 27 | 28 | org.jsoup 29 | jsoup 30 | 1.8.3 31 | 32 | 33 | org.mongodb 34 | mongo-java-driver 35 | 2.14.1 36 | 37 | 38 | org.json 39 | json 40 | 20141113 41 | 42 | 43 | -------------------------------------------------------------------------------- /target/classes/META-INF/maven/PTTCrawler/PTTCrawler/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | PTTCrawler 4 | PTTCrawler 5 | 0.0.1-SNAPSHOT 6 | 7 | 8 | com.jcraft 9 | jsch 10 | 0.1.53 11 | 12 | 13 | commons-net 14 | commons-net 15 | 3.3 16 | 17 | 18 | org.slf4j 19 | slf4j-api 20 | 1.7.14 21 | 22 | 23 | org.slf4j 24 | slf4j-log4j12 25 | 1.7.14 26 | 27 | 28 | org.jsoup 29 | jsoup 30 | 1.8.3 31 | 32 | 33 | org.mongodb 34 | mongo-java-driver 35 | 2.14.1 36 | 37 | 38 | org.json 39 | json 40 | 20141113 41 | 42 | 43 | -------------------------------------------------------------------------------- /src/main/java/crawler/base/Reply.java: -------------------------------------------------------------------------------- 1 | package crawler.base; 2 | 3 | import java.util.Date; 4 | 5 | import 
com.mongodb.BasicDBObject; 6 | import com.mongodb.DBObject; 7 | 8 | public class Reply { 9 | 10 | public enum ReplyType { 11 | Positive, Negative, Normal 12 | }; 13 | 14 | private String ID = null; 15 | private Date postTime = null; 16 | private ReplyType type = null; 17 | private String content = null; 18 | 19 | public Reply() { 20 | 21 | } 22 | 23 | public Reply(DBObject obj) { 24 | this.ID = (String) obj.get("ReplyID"); 25 | this.type = (ReplyType) obj.get("ReplyType"); 26 | this.content = (String) obj.get("ReplyContent"); 27 | this.postTime = (Date) obj.get("ReplyDate"); 28 | } 29 | 30 | public String getID() { 31 | return ID; 32 | } 33 | 34 | public Reply setID(String ID) { 35 | this.ID = ID; 36 | return this; 37 | } 38 | 39 | public Date getPostTime() { 40 | return postTime; 41 | } 42 | 43 | public Reply setPostTime(Date postTime) { 44 | this.postTime = postTime; 45 | return this; 46 | } 47 | 48 | public ReplyType getType() { 49 | return type; 50 | } 51 | 52 | public Reply setType(ReplyType type) { 53 | this.type = type; 54 | return this; 55 | } 56 | 57 | public String getContent() { 58 | return content; 59 | } 60 | 61 | public Reply setContent(String content) { 62 | this.content = content; 63 | return this; 64 | } 65 | 66 | public DBObject toMongoDBObject() { 67 | DBObject docReply = new BasicDBObject(); 68 | docReply.put("ReplyID", ID); 69 | String rt = null; 70 | if (type == ReplyType.Positive) { 71 | rt = "UpvoteCount"; 72 | } else if (type == ReplyType.Negative) { 73 | rt = "DownvoteCount"; 74 | } else if (type == ReplyType.Normal) { 75 | rt = "NeutralCount"; 76 | } 77 | docReply.put("ReplyType", rt); 78 | docReply.put("ReplyContent", content); 79 | docReply.put("ReplyDate", postTime); 80 | return docReply; 81 | } 82 | 83 | public static Reply fromMongoDBObject(DBObject obj) { 84 | Reply reply = new Reply(); 85 | reply.ID = (String) obj.get("ReplyID"); 86 | reply.type = (ReplyType) obj.get("ReplyType"); 87 | reply.content = (String) 
obj.get("ReplyContent"); 88 | reply.postTime = (Date) obj.get("ReplyDate"); 89 | return reply; 90 | } 91 | 92 | } 93 | -------------------------------------------------------------------------------- /src/main/java/crawler/main/Main.java: -------------------------------------------------------------------------------- 1 | package crawler.main; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.io.PrintWriter; 6 | import java.text.SimpleDateFormat; 7 | import java.util.Date; 8 | import java.util.concurrent.ExecutorService; 9 | import java.util.concurrent.Executors; 10 | import java.util.concurrent.TimeUnit; 11 | 12 | import org.apache.log4j.Logger; 13 | import org.apache.log4j.PropertyConfigurator; 14 | 15 | import crawler.base.Entry; 16 | import crawler.client.PTTClient; 17 | import crawler.client.PTTClient.Protocol; 18 | 19 | public class Main { 20 | 21 | private static final Logger log = Logger.getLogger(Main.class); 22 | private static final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); 23 | 24 | public static String username = null; 25 | public static String password = null; 26 | public static String boardname = null; 27 | public static boolean isMultiThread = false; 28 | 29 | static { 30 | PropertyConfigurator.configure("log4j.properties"); 31 | } 32 | 33 | /** 34 | * 解析輸入參數 35 | * @param args 輸入的參數 36 | */ 37 | private static void parseArgs(String[] args) { 38 | 39 | isMultiThread = false; 40 | 41 | for (int i=0; i crawlPostsByRange(from, to)); 160 | } 161 | } 162 | 163 | try { 164 | executor.shutdown(); 165 | executor.awaitTermination(1, TimeUnit.DAYS); 166 | } catch (InterruptedException e) { 167 | e.printStackTrace(); 168 | } 169 | 170 | } 171 | 172 | /** 173 | * 抓取看板內指定範圍的文章 174 | * @param fromNum 175 | * @param toNum 176 | * @throws Exception 177 | */ 178 | public static void crawlPostsByRange(int fromNum, int toNum) { 179 | 180 | final String savePath = "Results/" + boardname + "_" + sdf.format(new Date()); 181 | 
new File(savePath).mkdirs(); 182 | PTTClient ptt = new PTTClient(); 183 | 184 | try { 185 | 186 | ptt.connect(Protocol.SSH); 187 | ptt.login(username, password, true); 188 | ptt.toBoard(boardname); 189 | Entry entry = ptt.toEntryByNum(boardname, fromNum); 190 | 191 | for (;;) { 192 | if (!entry.author.equals("-")) { 193 | String postContent = ptt.downloadCurrentPost(); 194 | log.info(entry.toString()); 195 | PrintWriter pw = new PrintWriter(savePath + "/#" + entry.id + ".txt"); 196 | pw.print(postContent); 197 | pw.close(); 198 | } 199 | if (entry.number.equals(Integer.toString(toNum)) || entry.number.equals("★")) { 200 | break; 201 | } 202 | entry = ptt.moveDownEntry(boardname); 203 | } 204 | 205 | } catch (Exception e) { 206 | e.printStackTrace(); 207 | } finally { 208 | try { 209 | ptt.logout(); 210 | ptt.close(); 211 | Thread.sleep(10 * 1000); 212 | } catch (Exception e) { 213 | e.printStackTrace(); 214 | } 215 | } 216 | 217 | } 218 | 219 | public static void main(String[] args) { 220 | 221 | parseArgs(args); 222 | 223 | if (isMultiThread) { 224 | crawlAllPostsMultiThread(); 225 | } else { 226 | crawlAllPosts(); 227 | } 228 | 229 | } 230 | 231 | } 232 | -------------------------------------------------------------------------------- /src/main/java/crawler/base/Post.java: -------------------------------------------------------------------------------- 1 | package crawler.base; 2 | 3 | import java.text.SimpleDateFormat; 4 | import java.util.ArrayList; 5 | import java.util.Date; 6 | import java.util.Set; 7 | 8 | import org.slf4j.Logger; 9 | import org.slf4j.LoggerFactory; 10 | 11 | import com.mongodb.BasicDBObject; 12 | import com.mongodb.DBObject; 13 | 14 | public class Post { 15 | 16 | private static final Logger log = LoggerFactory.getLogger(Post.class); 17 | private static final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); 18 | 19 | private String ID = null; 20 | private String title = null; 21 | private String author = null; 22 | 
private String status = null; 23 | private String karma = null; 24 | private Date postTime = null; 25 | private String url = null; 26 | 27 | private int upVoteCount = 0; 28 | private int downVoteCount = 0; 29 | private int neutralCount = 0; 30 | 31 | private String content = null; 32 | 33 | private ArrayList replies = null; 34 | 35 | public Post() { 36 | replies = new ArrayList(); 37 | } 38 | 39 | public Post(DBObject obj) { 40 | this.ID = (String) obj.get("ID"); 41 | this.author = (String) obj.get("Artist"); 42 | this.title = (String) obj.get("Title"); 43 | this.postTime = (Date) obj.get("postTime"); 44 | this.url = (String) obj.get("Url"); 45 | this.content = (String) obj.get("Content"); 46 | Set keys = obj.keySet(); 47 | int replyCount = keys.size() - 7; 48 | this.replies = new ArrayList(replyCount); 49 | for (int i=0; i < replyCount; i++) { 50 | DBObject replyObj = (DBObject) obj.get(String.format("reply %d/%d", i+1, replyCount)); 51 | this.replies.add(new Reply(replyObj)); 52 | } 53 | } 54 | 55 | public DBObject toMongoDBObject() { 56 | DBObject docPost = new BasicDBObject(); 57 | docPost.put("ID", ID); 58 | docPost.put("Artist", author); 59 | docPost.put("Title", title); 60 | docPost.put("postTime", postTime); 61 | docPost.put("Url", url); 62 | docPost.put("UpvoteCount", upVoteCount); 63 | docPost.put("DownvoteCount", downVoteCount); 64 | docPost.put("NeutralCount", neutralCount); 65 | docPost.put("catchTime", new Date()); 66 | docPost.put("Content", content); 67 | for (int i=0; i keys = obj.keySet(); 86 | int replyCount = keys.size() - 12; 87 | post.replies = new ArrayList(replyCount); 88 | for (int i=0; i < replyCount; i++) { 89 | DBObject replyObj = (DBObject) obj.get(String.format("reply %d/%d", i+1, replyCount)); 90 | post.replies.add(new Reply(replyObj)); 91 | } 92 | 93 | return post; 94 | } 95 | 96 | /** 97 | * 依據時間只截取出更新部分 98 | * @param fromDate 99 | * @param toDate 100 | * @return 101 | */ 102 | public Post extractUpdatePost(Date fromDate, Date 
toDate) { 103 | // Content設為null 104 | this.setContent(null); 105 | 106 | // 重新統計回應的種類數 107 | int diffUpvoteCount = 0, diffDownvoteCount = 0, diffNeutralCount = 0; 108 | ArrayList allReplies = this.getReplies(); 109 | ArrayList newReplies = new ArrayList(); 110 | for (Reply reply : allReplies) { 111 | Date pt = reply.getPostTime(); 112 | if (pt != null && !pt.before(fromDate) && pt.before(toDate)) { 113 | switch (reply.getType()) { 114 | case Positive: 115 | ++diffUpvoteCount; 116 | break; 117 | case Negative: 118 | ++diffDownvoteCount; 119 | break; 120 | case Normal: 121 | ++diffNeutralCount; 122 | break; 123 | default: 124 | break; 125 | } 126 | newReplies.add(reply); 127 | } 128 | } 129 | this.setUpVoteCount(diffUpvoteCount); 130 | this.setDownVoteCount(diffDownvoteCount); 131 | this.setNeutralCount(diffNeutralCount); 132 | this.setReplies(newReplies); 133 | 134 | if (newReplies.size() > 0) { 135 | log.info(String.format("Update post (+: %d, -: %d, ~: %d)", diffUpvoteCount, diffDownvoteCount, diffNeutralCount)); 136 | } 137 | 138 | return this; 139 | } 140 | 141 | public String toString() { 142 | return String.format("#%s %s %-13s %s", this.ID, sdf.format(this.postTime), this.author, this.title); 143 | } 144 | 145 | public String getID() { 146 | return ID; 147 | } 148 | 149 | public Post setID(String ID) { 150 | this.ID = ID; 151 | return this; 152 | } 153 | 154 | public String getTitle() { 155 | return title; 156 | } 157 | 158 | public Post setTitle(String title) { 159 | this.title = title; 160 | return this; 161 | } 162 | 163 | public String getAuthor() { 164 | return author; 165 | } 166 | 167 | public Post setAuthor(String author) { 168 | this.author = author; 169 | return this; 170 | } 171 | 172 | public String getStatus() { 173 | return status; 174 | } 175 | 176 | public Post setStatus(String status) { 177 | this.status = status; 178 | return this; 179 | } 180 | 181 | public String getKarma() { 182 | return karma; 183 | } 184 | 185 | public Post 
setKarma(String karma) { 186 | this.karma = karma; 187 | return this; 188 | } 189 | 190 | public Date getPostTime() { 191 | return postTime; 192 | } 193 | 194 | public Post setPostTime(Date postTime) { 195 | this.postTime = postTime; 196 | return this; 197 | } 198 | 199 | public String getUrl() { 200 | return url; 201 | } 202 | 203 | public Post setUrl(String url) { 204 | this.url = url; 205 | return this; 206 | } 207 | 208 | public String getContent() { 209 | return content; 210 | } 211 | 212 | public Post setContent(String content) { 213 | this.content = content; 214 | return this; 215 | } 216 | 217 | public ArrayList getReplies() { 218 | return replies; 219 | } 220 | 221 | public Post setReplies(ArrayList replies) { 222 | this.replies = replies; 223 | return this; 224 | } 225 | 226 | public void addReply(Reply reply) { 227 | replies.add(reply); 228 | } 229 | 230 | public Reply getReply(int index) { 231 | return replies.get(index); 232 | } 233 | 234 | public int getUpVoteCount() { 235 | return upVoteCount; 236 | } 237 | 238 | public Post setUpVoteCount(int upVoteCount) { 239 | this.upVoteCount = upVoteCount; 240 | return this; 241 | } 242 | 243 | public int getDownVoteCount() { 244 | return downVoteCount; 245 | } 246 | 247 | public Post setDownVoteCount(int downVoteCount) { 248 | this.downVoteCount = downVoteCount; 249 | return this; 250 | } 251 | 252 | public int getNeutralCount() { 253 | return neutralCount; 254 | } 255 | 256 | public Post setNeutralCount(int neutralCount) { 257 | this.neutralCount = neutralCount; 258 | return this; 259 | } 260 | 261 | } 262 | -------------------------------------------------------------------------------- /src/main/java/crawler/base/PostAnalysiser.java: -------------------------------------------------------------------------------- 1 | package crawler.base; 2 | import java.text.ParseException; 3 | import java.text.SimpleDateFormat; 4 | import java.util.Calendar; 5 | import java.util.Date; 6 | import java.util.Locale; 7 | 
import java.util.regex.Matcher; 8 | import java.util.regex.Pattern; 9 | 10 | import org.apache.log4j.Level; 11 | import org.apache.log4j.Logger; 12 | import org.jsoup.Jsoup; 13 | import org.jsoup.nodes.Document; 14 | import org.jsoup.nodes.Element; 15 | import org.jsoup.safety.Whitelist; 16 | import org.jsoup.select.Elements; 17 | 18 | import crawler.base.Reply.ReplyType; 19 | 20 | public class PostAnalysiser { 21 | 22 | static final Logger log = Logger.getLogger(PostAnalysiser.class); 23 | 24 | static { 25 | log.setLevel(Level.INFO); 26 | } 27 | 28 | private static final Pattern PostHeaderPattern = Pattern.compile( 29 | "作者:*\\s+(?.*?)\\s+((看板|站內):*\\s+(?.*?))*" + 30 | "\\s*標題:*\\s+(?.*?)[\\r\\n]+" + 31 | "\\s*時間:*\\s+(?<time>.*?)[\\r\\n]+"+ 32 | "[-─]*\\s*" 33 | ); 34 | 35 | private static final Pattern PostFooterPattern = Pattern.compile( 36 | "(" + 37 | "[-]*\\s+※\\s+發信站:.*來自:\\s+(?<ip>[\\d\\.]+)\\s+" + 38 | "※\\s+文章網址:\\s+(?<url>.*?)[\\r\\n]+" + 39 | ")|(" + 40 | "[-]*\\s+※\\s+發信站.*?\\s+"+ 41 | "◆\\s+From:\\s+(?<ip2>[\\d\\.]+)[\\r\\n]+" + 42 | ")|(" + 43 | "[-]*\\s+※\\s+發信站:.*(來自:\\s+(?<ip3>[\\d\\.]+))*\\s+" + 44 | ")|(" + 45 | "[-]*※\\s+文章網址:\\s+(?<url2>.*?)[\\r\\n]+" + 46 | //")|(" + 47 | // "\\s+[-]+\\s+" + 48 | ")" 49 | ); 50 | 51 | private static final Pattern PostReplyPattern = Pattern.compile( 52 | "(?<type>[→推噓])\\s*(?<author>.*?):\\s*(?<content>.*?)[\\r\\n]+" 53 | ); 54 | 55 | private static final Pattern ReplyDatePattern = Pattern.compile( 56 | "\\d+/\\d+[ ]\\d+:\\d+" 57 | ); 58 | 59 | private static final Pattern URLTimePattern = Pattern.compile( 60 | "M\\.(?<timestamp>\\d+)\\." 
61 | ); 62 | 63 | // Example: Sun Mar 22 23:43:00 2015 64 | private static final SimpleDateFormat postSDF = new SimpleDateFormat("EEE MMM d HH:mm:ss yyyy", Locale.ENGLISH); 65 | private static final SimpleDateFormat postSDF2 = new SimpleDateFormat("EEE MMMd HH:mm:ss yyyy", Locale.ENGLISH); 66 | private static final SimpleDateFormat replySDF = new SimpleDateFormat("yyyy/MM/dd HH:mm"); 67 | 68 | public static Post parsePost(Entry entry, String rawText) { 69 | 70 | Post post = new Post(); 71 | 72 | String content = rawText; 73 | 74 | post.setID(entry.id); 75 | post.setAuthor(entry.author); 76 | post.setUrl(entry.url); 77 | 78 | boolean contentFlag = false; 79 | 80 | try { 81 | 82 | Matcher matcher = PostHeaderPattern.matcher(rawText); 83 | if (!matcher.find()) { 84 | //log.info(rawText); 85 | throw new Exception("The post not match \"PostHeaderPattern\" format."); 86 | } 87 | 88 | post.setTitle(matcher.group("title").trim()); 89 | 90 | String timeStr = matcher.group("time").trim(); 91 | Date postTime = null; 92 | try { 93 | postTime = postSDF.parse(timeStr); 94 | } catch (ParseException e) { 95 | try { 96 | postTime = postSDF2.parse(timeStr); 97 | } catch (ParseException e2) { 98 | 99 | } 100 | } 101 | 102 | Calendar cal = Calendar.getInstance(); 103 | cal.setTime(postTime); 104 | int year = cal.get(Calendar.YEAR); 105 | 106 | post.setPostTime(postTime); 107 | 108 | String remainText = rawText.substring(matcher.end()); 109 | content = remainText; 110 | 111 | matcher = PostFooterPattern.matcher(remainText); 112 | if (matcher.find()) { 113 | if (matcher.group(1) != null) { 114 | //System.out.println(matcher.group("ip").trim()); 115 | //System.out.println(matcher.group("url").trim()); 116 | post.setUrl(matcher.group("url").trim()); 117 | //} else if (matcher.group(4) != null) { 118 | //System.out.println(matcher.group("ip2").trim()); 119 | //} else if (matcher.group(6) != null) { 120 | //System.out.println(matcher.group("ip3").trim()); 121 | } else if (matcher.group(9) 
!= null) { 122 | post.setUrl(matcher.group("url2").trim()); 123 | } 124 | 125 | //System.out.println(); 126 | contentFlag = true; 127 | content = remainText.substring(0, matcher.start()).trim(); 128 | remainText = rawText.substring(matcher.end()); 129 | post.setContent(content); 130 | } else { 131 | //throw new Exception("The post not match \"PostFooterPattern\" format."); 132 | post.setContent(content); 133 | } 134 | 135 | matcher = PostReplyPattern.matcher(remainText); 136 | int upVoteCount = 0, downVoteCount = 0, neutralCount = 0; 137 | while (matcher.find()) { 138 | 139 | if (!contentFlag) { 140 | contentFlag = true; 141 | post.setContent(remainText.substring(0, matcher.start())); 142 | } 143 | 144 | //DBObject docReply = new BasicDBObject(); 145 | Reply reply = new Reply(); 146 | 147 | String type = matcher.group("type"); 148 | String author = matcher.group("author"); 149 | String temp = matcher.group("content"); 150 | String replyContent = temp; 151 | Date ReplyDate = null; 152 | 153 | ReplyType rt = null; 154 | if (type.equals("推")) { 155 | upVoteCount++; 156 | rt = ReplyType.Positive; 157 | } else if (type.equals("噓")) { 158 | downVoteCount++; 159 | rt = ReplyType.Negative; 160 | } else if (type.equals("→")) { 161 | neutralCount++; 162 | rt = ReplyType.Normal; 163 | } 164 | 165 | if (temp.length() > 11) { // Time pattern is 11 characters (MM/DD HH:mm) 166 | 167 | Matcher m = ReplyDatePattern.matcher(temp); 168 | if (m.find()) { 169 | 170 | replyContent = temp.substring(0, m.start()); 171 | String rdstr = m.group(); 172 | try { 173 | rdstr = year + "/" + rdstr; 174 | ReplyDate = replySDF.parse(rdstr); 175 | } catch (ParseException e) { 176 | log.warn("Fail to parae reply date. 
(\"" + temp + "\")"); 177 | } 178 | 179 | } 180 | } 181 | 182 | reply.setID(author); 183 | reply.setType(rt); 184 | reply.setContent(replyContent); 185 | reply.setPostTime(ReplyDate); 186 | 187 | post.addReply(reply); 188 | 189 | } 190 | 191 | post.setUpVoteCount(upVoteCount); 192 | post.setDownVoteCount(downVoteCount); 193 | post.setNeutralCount(neutralCount); 194 | 195 | //post.setParsed(true); 196 | 197 | } catch (Exception e) { 198 | 199 | log.warn("Fail to parse the post. (PostID: " + entry.id + ") " + e.toString()); 200 | //e.printStackTrace(); 201 | 202 | post.setContent(content); 203 | //post.setParsed(false); 204 | 205 | } 206 | 207 | return post; 208 | 209 | } 210 | 211 | public static Post parsePost(Document doc, String url) { 212 | 213 | Post post = new Post(); 214 | 215 | post.setUrl(url); 216 | 217 | Element mainContent = doc.getElementById("main-content"); 218 | 219 | // 標頭 220 | int year = 2015; 221 | Elements metas = mainContent.select(".article-metaline, .article-metaline-right"); 222 | for (Element meta : metas) { 223 | String tag = meta.select(".article-meta-tag").text(); 224 | String value = meta.select(".article-meta-value").text(); 225 | if (tag.equals("作者")) { 226 | Matcher matcher = Pattern.compile("(?<author>.*?)\\s+").matcher(value); 227 | if (matcher.find()) { 228 | value = matcher.group(); 229 | } 230 | post.setAuthor(value.trim()); 231 | } else if (tag.equals("標題")) { 232 | post.setTitle(value.trim()); 233 | } else if (tag.equals("時間")) { 234 | Date postTime = null; 235 | try { 236 | postTime = getTimeFromPttURL(url); 237 | postTime = postSDF.parse(value.trim()); 238 | Calendar cal = Calendar.getInstance(); 239 | cal.setTime(postTime); 240 | year = cal.get(Calendar.YEAR); 241 | if (year <= 1980) { 242 | postTime = getTimeFromPttURL(url); 243 | } 244 | } catch (Exception e) { 245 | log.info("Cannot parse the post time."); 246 | } finally { 247 | post.setPostTime(postTime); 248 | } 249 | } else if (tag.equals("看板")) { 250 | 251 | } else 
{ 252 | 253 | } 254 | 255 | meta.remove(); 256 | } 257 | 258 | if (post.getPostTime() == null) { 259 | try { post.setPostTime(getTimeFromPttURL(url)); } catch (Exception e) {} 260 | } 261 | 262 | // 推文 263 | Elements pushs = mainContent.select("div.push"); 264 | int upVoteCount = 0, downVoteCount = 0, neutralCount = 0; 265 | for (Element push : pushs) { 266 | 267 | Reply reply = new Reply(); 268 | 269 | String pushTag = push.select(".push-tag").text().trim(); 270 | String pushUserID = push.select(".push-userid").text().trim(); 271 | String pushContent = push.select(".push-content").text().replaceFirst("^:\\s+", "").trim(); 272 | String pushDatetime = push.select(".push-ipdatetime").text().trim(); 273 | 274 | ReplyType rt = null; 275 | if (pushTag.equals("推")) { 276 | upVoteCount++; 277 | rt = ReplyType.Positive; 278 | } else if (pushTag.equals("噓")) { 279 | downVoteCount++; 280 | rt = ReplyType.Negative; 281 | } else if (pushTag.equals("→")) { 282 | neutralCount++; 283 | rt = ReplyType.Normal; 284 | } 285 | 286 | reply.setID(pushUserID); 287 | reply.setType(rt); 288 | reply.setContent(pushContent); 289 | try { 290 | pushDatetime = pushDatetime.replaceAll("\\d+(\\.\\d+)+", "").trim(); 291 | reply.setPostTime(replySDF.parse(year + "/" + pushDatetime)); 292 | } catch (Exception e) { 293 | log.warn(e.getMessage()); 294 | } 295 | 296 | post.addReply(reply); 297 | 298 | push.remove(); 299 | } 300 | 301 | // 其他(發信站, 文章網址, ...) 
302 | Elements f2s = mainContent.select("span.f2"); 303 | Pattern F2Pattern = Pattern.compile("※\\s+(?<type>發信站|文章網址|轉錄者|編輯):\\s+((?<id>\\w+)\\s+)*"); 304 | for (Element f2 : f2s) { 305 | String text = f2.text().trim(); 306 | Matcher m = F2Pattern.matcher(text); 307 | if (m.find()) { 308 | if (m.group("type").equals("編輯")) { 309 | if (post.getAuthor() == null) { 310 | post.setAuthor(m.group("id")); 311 | } 312 | } else if (m.group("type").equals("轉錄者")) { 313 | if (post.getAuthor() == null) { 314 | post.setAuthor(m.group("id")); 315 | } 316 | } 317 | f2.remove(); 318 | } 319 | } 320 | 321 | String content = mainContent.wrap("<pre>").text(); 322 | content = content.replaceAll(PostFooterPattern.pattern(), "") 323 | .replaceAll("※\\s+.*轉錄至看板.*(\\d+:\\d+)*", "") 324 | .trim(); 325 | 326 | post.setUpVoteCount(upVoteCount); 327 | post.setDownVoteCount(downVoteCount); 328 | post.setNeutralCount(neutralCount); 329 | post.setContent(content); 330 | 331 | return post; 332 | 333 | } 334 | 335 | /** 336 | * br2nl 337 | * @param html 338 | * @return 339 | */ 340 | public static String br2nl(String html) { 341 | if(html == null) { 342 | return html; 343 | } 344 | Document document = Jsoup.parse(html); 345 | document.outputSettings(new Document.OutputSettings().prettyPrint(false));//makes html() preserve linebreaks and spacing 346 | document.select("br").append("\\n"); 347 | document.select("p").prepend("\\n\\n"); 348 | String s = document.html().replaceAll("\\\\n", "\n"); 349 | return Jsoup.clean(s, "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false)); 350 | } 351 | 352 | /** 353 | * getTimeFromPttURL 354 | * @param url 355 | * @return 356 | */ 357 | public static Date getTimeFromPttURL(String url) throws Exception { 358 | Matcher m = URLTimePattern.matcher(url); 359 | if (m.find()) { 360 | long timestamp = Long.parseLong(m.group("timestamp")); 361 | return timestamp2Date(timestamp); 362 | } 363 | return null; 364 | } 365 | 366 | /** 367 | * Convert Unix 
timestamp to Date 368 | * @param unixSeconds 369 | * @return 370 | */ 371 | public static Date timestamp2Date(long unixSeconds) { 372 | return new Date(unixSeconds * 1000L); 373 | } 374 | 375 | } 376 | -------------------------------------------------------------------------------- /src/main/java/crawler/client/PTTClient.java: -------------------------------------------------------------------------------- 1 | package crawler.client; 2 | import java.io.BufferedReader; 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.io.InputStreamReader; 6 | import java.io.OutputStream; 7 | import java.io.PushbackReader; 8 | import java.net.SocketException; 9 | import java.security.KeyManagementException; 10 | import java.security.NoSuchAlgorithmException; 11 | import java.security.SecureRandom; 12 | import java.security.cert.CertificateException; 13 | import java.security.cert.X509Certificate; 14 | import java.util.ArrayList; 15 | import java.util.Arrays; 16 | import java.util.List; 17 | import java.util.Properties; 18 | import java.util.regex.Matcher; 19 | import java.util.regex.Pattern; 20 | 21 | import javax.net.ssl.HostnameVerifier; 22 | import javax.net.ssl.HttpsURLConnection; 23 | import javax.net.ssl.SSLContext; 24 | import javax.net.ssl.SSLSession; 25 | import javax.net.ssl.X509TrustManager; 26 | 27 | import org.apache.commons.net.telnet.TelnetClient; 28 | import org.apache.log4j.Level; 29 | import org.apache.log4j.Logger; 30 | import org.json.JSONObject; 31 | import org.jsoup.Jsoup; 32 | import org.jsoup.nodes.Document; 33 | import org.jsoup.nodes.Element; 34 | 35 | import com.jcraft.jsch.Channel; 36 | import com.jcraft.jsch.ChannelShell; 37 | import com.jcraft.jsch.JSch; 38 | import com.jcraft.jsch.JSchException; 39 | import com.jcraft.jsch.Session; 40 | 41 | import crawler.base.Entry; 42 | import crawler.base.Post; 43 | import crawler.base.PostAnalysiser; 44 | 45 | public class PTTClient { 46 | 47 | private static final Logger log = 
Logger.getLogger(PTTClient.class); 48 | private static final boolean isPrintScreen = false; 49 | private static final boolean isPrintSource = false; 50 | static { 51 | log.setLevel(Level.ALL); 52 | } 53 | 54 | public static enum Protocol { 55 | Telnet, SSH 56 | } 57 | 58 | public static enum Screen { 59 | MainMenu, // 主選單 (【主功能表】.*批踢踢實業坊.*呼叫器) 60 | Board, // 看板 (文章選讀.*回應.*推文.*轉錄.*相關主題.*找標題/作者.*進板畫面) 61 | Post, // 貼文 (瀏覽.*第.*頁.*目前顯示.*第.*行.*離開) 62 | Unknown 63 | } 64 | 65 | private static final String UserAgent = "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36"; 66 | private static final Pattern VT100ControlPattern = Pattern.compile("\u001B\\[(?<code>[0-9;]*)(?<type>[ABCDHJKmsu])"); 67 | private static final int DEFAULT_TIMEOUT = 10 * 1000; 68 | 69 | private static final String MenuHeader = "【主功能表】[\\s\\S]*呼叫器"; 70 | private static final String BoardFooter = "文章選讀[\\s\\S]*相關主題[\\s\\S]*找標題/作者[\\s\\S]*進板畫面"; 71 | private static final String PostFooter = "瀏覽[\\s\\S]*第[\\s\\S]*頁[\\s\\S]*目前顯示[\\s\\S]*第[\\s\\S]*行[\\s\\S]*離開"; 72 | 73 | private static final Pattern ENTRYPATTER_PATTERN = Pattern.compile("[●>][ ]*(?<id>[0-9]+|★[ ]+)[ ](?<status>.)(?<karma>[0-9 X]+|爆)(?<date>../..)[ ](?<author>.*?)([\\s□轉]|R:)+(?<title>.*)"); 74 | private static final Pattern PROGRESS_PATTERN = Pattern.compile("(?<percent>\\d+)%[^\\d]*(?<from>\\d+)~(?<to>\\d+)"); 75 | 76 | private static final Pattern AID_PATTERN = Pattern.compile("文章代碼\\(AID\\):\\s*#(?<aid>........)"); 77 | private static final Pattern URL_PATTERN = Pattern.compile("文章網址:\\s*(?<url>.*?)[\\s\\│]+"); 78 | 79 | private static final Pattern URL_VERIFY = Pattern.compile("^(https?)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]"); 80 | 81 | public static final int RETV_TIMEOUT = -1; 82 | public static final int RETV_EOF = -2; 83 | public static final int RETV_IOEXCEPTION = -9; 84 | 85 | // Screen 86 | private int posX = -1; 87 | private int posY = -1; 88 | 
private char[][] screen = null; 89 | 90 | private Protocol protocol = null; 91 | private TelnetClient tc = null; 92 | 93 | private Channel channel = null; 94 | private Session session = null; 95 | 96 | private InputStream is = null; 97 | private OutputStream os = null; 98 | private Thread renderScreenThread = null; 99 | 100 | // Matchers 101 | @SuppressWarnings("unused") 102 | private String beforeStr = null, matchStr = null, afterStr = null; 103 | 104 | public PTTClient() { 105 | initialize(); 106 | } 107 | 108 | /** 109 | * 初始化 110 | */ 111 | public void initialize() { 112 | screen = new char[72][80]; 113 | posY = posX = -1; 114 | clearScreen(); 115 | } 116 | 117 | /** 118 | * Connect PTT by using protocol 119 | * @param protocol 120 | * @throws SocketException 121 | * @throws IOException 122 | * @throws JSchException 123 | */ 124 | public void connect(Protocol protocol) throws SocketException, IOException, JSchException { 125 | 126 | this.protocol = protocol; 127 | 128 | switch (this.protocol) { 129 | case Telnet: 130 | log.info("Connect ptt.cc using telnet"); 131 | 132 | tc = new TelnetClient(); 133 | tc.connect("ptt.cc"); 134 | 135 | is = tc.getInputStream(); 136 | os = tc.getOutputStream(); 137 | 138 | break; 139 | case SSH: 140 | default: 141 | log.info("Connect ptt.cc using SSH (bbs@ptt.cc)"); 142 | 143 | Properties configuration = new Properties(); 144 | configuration.put("kex", "diffie-hellman-group1-sha1," 145 | + "diffie-hellman-group14-sha1," 146 | + "diffie-hellman-group-exchange-sha1," 147 | + "diffie-hellman-group-exchange-sha256"); 148 | configuration.put("StrictHostKeyChecking", "no"); 149 | 150 | session = new JSch().getSession("bbsu", "ptt.cc"); 151 | session.setConfig(configuration); 152 | session.connect(10 * 1000); // Timeout 10 seconds 153 | channel = (ChannelShell) session.openChannel("shell"); 154 | channel.connect(); 155 | 156 | is = channel.getInputStream(); 157 | os = channel.getOutputStream(); 158 | 159 | break; 160 | } 161 | 162 | 
renderScreenThread = new Thread(new Runnable() { 163 | @Override 164 | public void run() { 165 | try { 166 | renderScreen(); 167 | } catch (IOException e) { 168 | e.printStackTrace(); 169 | } 170 | } 171 | }); 172 | renderScreenThread.setName("Render the screen from InputStream"); 173 | renderScreenThread.setDaemon(true); 174 | renderScreenThread.start(); 175 | 176 | } 177 | 178 | /** 179 | * 關閉連線 180 | * @throws IOException 181 | */ 182 | public void close() throws IOException { 183 | 184 | if (this.protocol == null) { 185 | return; 186 | } 187 | 188 | switch (this.protocol) { 189 | case Telnet: 190 | default: 191 | if (tc == null) { return; } 192 | log.info("Close the telnet connection."); 193 | tc.disconnect(); 194 | break; 195 | case SSH: 196 | if (session == null) { return; } 197 | log.info("Close the SSH connection."); 198 | channel.disconnect(); 199 | session.disconnect(); 200 | break; 201 | } 202 | 203 | } 204 | 205 | /** 206 | * 登入PTT 207 | * @param username 208 | * @param password 209 | * @param isDup 210 | * @throws Exception 211 | */ 212 | public void login(String username, String password, boolean isDup) throws Exception { 213 | 214 | if (this.protocol == PTTClient.Protocol.Telnet) { 215 | send(username + ",\r\n" + password + "\r\n"); 216 | } else if (expect("請輸入代號,或以 guest 參觀,或以 new 註冊:") == 0) { 217 | send(username + "\r\n" + password + "\r\n"); 218 | } else { 219 | log.error("Login error."); 220 | } 221 | 222 | int midx = expect("密碼不對", "錯誤", "您想刪除其他重複登入的連線嗎?", "請按任意鍵繼續"); 223 | if (midx < 0) { 224 | throw new Exception("連線逾時"); 225 | } else if (midx < 2) { 226 | throw new Exception("密碼不對喔!請檢查帳號及密碼有無輸入錯誤。"); 227 | } else if (midx == 2) { 228 | send(isDup ? 
"n\r\n" : "y\r\n"); 229 | if (expect("請按任意鍵繼續") == 0) { 230 | send("\r\n"); 231 | } 232 | } else if (midx == 3) { 233 | send("\r\n"); 234 | } 235 | 236 | midx = expect("呼叫器", "您要刪除以上錯誤嘗試的記錄嗎?"); 237 | if (midx == 1) { 238 | send("Y\r\n"); 239 | midx = expect("呼叫器"); 240 | } 241 | 242 | if (midx == 0) { 243 | log.info("登入成功"); 244 | } else { 245 | throw new Exception("登入失敗"); 246 | } 247 | 248 | } 249 | 250 | /** 251 | * 登出 252 | * @throws IOException 253 | */ 254 | public void logout() throws IOException { 255 | send("qqqqqqeee\nY\n"); 256 | } 257 | 258 | /** 259 | * 回上一層 260 | * @throws IOException 261 | */ 262 | public void quit() throws IOException { 263 | send("q"); 264 | } 265 | 266 | /** 267 | * 回到主選單 268 | * @throws IOException 269 | */ 270 | public void toMainMenu() throws IOException { 271 | send("qqqqqq"); 272 | if (expect("【主功能表】[\\s\\S]*呼叫器") != 0) { 273 | log.warn("無法回到【主功能表】"); 274 | } else { 275 | log.info("已回到【主功能表】"); 276 | } 277 | } 278 | 279 | /** 280 | * Go to the board by board name (Current entry maybe 置底文) 281 | * @param boardName 282 | * @throws Exception 283 | */ 284 | public void toBoard(String boardName) throws Exception { 285 | toMainMenu(); 286 | send("s" + boardName + "\r\n$$"); 287 | int m1 = expect("看板《" + boardName + "》[\\s\\S]*" + BoardFooter); 288 | if (m1 != 0) { 289 | throw new Exception("Fail to go Board."); 290 | } 291 | } 292 | 293 | /** 294 | * Get the popularity of the board 295 | * @param boardName 296 | * @return 297 | * @throws Exception 298 | */ 299 | public int getBoardPopularity(String boardName) throws Exception { 300 | toBoard(boardName); 301 | this.refresh(); 302 | if (expect("編號.*日.*期.*作.*者.*文.*章.*標.*題.*人氣:\\d+") == 0) { 303 | Matcher m = Pattern.compile("\\d+").matcher(matchStr); 304 | if (m.find()) { 305 | return Integer.parseInt(m.group()); 306 | } 307 | } 308 | return -1; 309 | } 310 | 311 | /** 312 | * Get the current screen 313 | * @return 314 | * @throws IOException 315 | */ 316 | public Screen 
getCurrentScreen(String boardName) throws IOException { 317 | refresh(100); 318 | int matchIndex = expect( 319 | MenuHeader, 320 | "看板《" + boardName + "》[\\s\\S]*" + BoardFooter, 321 | PostFooter 322 | ); 323 | if (matchIndex == 0) { 324 | return Screen.MainMenu; 325 | } else if (matchIndex == 1) { 326 | return Screen.Board; 327 | } else if (matchIndex == 2) { 328 | return Screen.Post; 329 | } else { 330 | return Screen.Unknown; 331 | } 332 | } 333 | 334 | /** 335 | * setPlainTextMode 336 | * @throws IOException 337 | */ 338 | public void setPlainTextMode(String boardName) throws IOException { 339 | this.refresh(); 340 | send("l\\3q"); 341 | expect("看板《" + boardName + "》[\\s\\S]*" + BoardFooter); 342 | } 343 | 344 | /** 345 | * Move to up entry 346 | * @return The entry information after moving 347 | * @throws Exception 348 | */ 349 | public Entry moveUpEntry(String boardName) throws Exception { 350 | Entry newEntry, oldEntry = getBasicEntryInfo(boardName); 351 | if (oldEntry.number.equals("1")) { 352 | throw new Exception("Aready at the toppest entry."); 353 | } 354 | send("k"); 355 | int times = 0; 356 | do { 357 | Thread.sleep(100); 358 | newEntry = getBasicEntryInfo(boardName); 359 | if (++times > 100) { 360 | throw new Exception("Can not move to the up entry."); 361 | } 362 | } while (newEntry.number == oldEntry.number); 363 | return getFullEntryInfo(boardName); 364 | } 365 | 366 | /** 367 | * Move to down entry 368 | * @return The entry information after moving 369 | * @throws Exception 370 | */ 371 | public Entry moveDownEntry(String boardName) throws Exception { 372 | Entry newEntry, oldEntry = getBasicEntryInfo(boardName); 373 | if (oldEntry.number.equals("★")) { 374 | throw new Exception("Aready at the downest entry."); 375 | } 376 | send("n"); 377 | int times = 0; 378 | do { 379 | Thread.sleep(100); 380 | newEntry = getBasicEntryInfo(boardName); 381 | if (++times > 100) { 382 | throw new Exception("Can not move to the down entry."); 383 | } 384 | } while 
(newEntry.number == oldEntry.number); 385 | return getFullEntryInfo(boardName); 386 | } 387 | 388 | /** 389 | * Go to the latest post entry 390 | * @throws Exception 391 | */ 392 | public Entry toLatestPost(String boardName) throws Exception { 393 | send("$$"); // Skip the welcome of the board & to the latest article 394 | refresh(300); 395 | if (expect("看板《" + boardName + "》[\\s\\S]*" + BoardFooter) != 0) { 396 | throw new Exception("Current screen is not \"Board\""); 397 | } 398 | 399 | Entry oldEntry = getFullEntryInfo(boardName), newEntry = null; 400 | if (!oldEntry.number.equals("★") && !oldEntry.author.equals("-")) { 401 | return oldEntry; 402 | } 403 | for (int times=0; ; times++) { 404 | 405 | try { 406 | 407 | send("k"); 408 | 409 | int times2 = 0; 410 | do { 411 | Thread.sleep(100); 412 | newEntry = getFullEntryInfo(boardName); 413 | if (++times2 > 100) { 414 | return newEntry; 415 | } 416 | } while (oldEntry.id == newEntry.id); 417 | 418 | if (!newEntry.number.equals("★") && !newEntry.author.equals("-")) { 419 | break; 420 | } 421 | oldEntry = newEntry; 422 | 423 | } catch (Exception e) { 424 | 425 | } 426 | 427 | if (times >= 100) { 428 | throw new Exception("Can not go to latest post."); 429 | } 430 | 431 | } 432 | 433 | return newEntry; 434 | } 435 | 436 | public Entry getBasicEntryInfo(String boardName) throws Exception { 437 | if (expect("看板《" + boardName + "》[\\s\\S]*" + BoardFooter) == 0) { 438 | Matcher matcher = ENTRYPATTER_PATTERN.matcher(matchStr); 439 | if (matcher.find()) { 440 | String id = null; 441 | String url = null; 442 | String number = matcher.group("id").trim(); 443 | String status = matcher.group("status").trim(); 444 | String karma = matcher.group("karma").trim(); 445 | String date = matcher.group("date").trim(); 446 | String author = matcher.group("author").trim(); 447 | String title = matcher.group("title").trim(); 448 | return new Entry(id, number, status, karma, date, author, title, url); 449 | } else { 450 | throw new 
Exception("Can not match entry. " + matchStr); 451 | } 452 | } else { 453 | throw new Exception("Screen is not \"Board\""); 454 | } 455 | } 456 | 457 | /** 458 | * getFullEntryInfo 459 | * @return 460 | * @throws Exception 461 | */ 462 | public Entry getFullEntryInfo(String boardName) throws Exception { 463 | 464 | boolean isSuccess = false; 465 | int times = 0; 466 | Matcher matcher = null; 467 | 468 | do { 469 | 470 | expect("看板《" + boardName + "》[\\s\\S]*" + BoardFooter); 471 | 472 | matcher = ENTRYPATTER_PATTERN.matcher(matchStr); 473 | if (matcher.find()) { 474 | isSuccess = true; 475 | } else { 476 | log.warn("Faild to match entry. " + matchStr.replaceAll("\\s+", " ")); 477 | times++; 478 | } 479 | 480 | if (times >= 5) { 481 | log.error("Faild to match entry. " + matchStr.replaceAll("\\s+", " ")); 482 | throw new Exception("Faild to match entry."); 483 | } 484 | 485 | } while (!isSuccess); 486 | 487 | String id = null; 488 | String url = null; 489 | String number = matcher.group("id").trim(); 490 | String status = matcher.group("status").trim(); 491 | String karma = matcher.group("karma").trim(); 492 | String date = matcher.group("date").trim(); 493 | String author = matcher.group("author").trim(); 494 | String title = matcher.group("title").trim(); 495 | 496 | if (!author.equals("-")) { 497 | boolean success = false; 498 | int count = 0; 499 | do { 500 | String[] temp = this.getAID().split("\\t"); 501 | if (temp.length > 0 && !temp[0].equals("")) { 502 | id = temp[0]; 503 | } 504 | if (temp.length > 1 && !temp[1].equals("")) { 505 | url = temp[1]; 506 | } 507 | if (url != null && URL_VERIFY.matcher(url).find()) { 508 | success = true; 509 | } 510 | count++; 511 | } while (!success && count < 5); 512 | } 513 | 514 | Entry entry = new Entry(id, number, status, karma, date, author, title, url); 515 | return entry; 516 | } 517 | 518 | /** 519 | * Get post ID (e.g. 
1L4GI8SM) 520 | * @return 521 | * @throws IOException 522 | */ 523 | public String getAID() throws IOException { 524 | 525 | String aid = ""; 526 | String url = ""; 527 | 528 | send("Q"); 529 | if (expect("請按任意鍵繼續") != 0) { 530 | return ""; 531 | } 532 | 533 | // 文章代碼(AID): #1L4GI8SM 534 | if (expect(AID_PATTERN) == 0) { 535 | Matcher m = AID_PATTERN.matcher(matchStr); 536 | if (m.find()) { 537 | aid = m.group("aid"); 538 | } 539 | } 540 | if (expect(URL_PATTERN) == 0) { 541 | Matcher m = URL_PATTERN.matcher(matchStr); 542 | if (m.find()) { 543 | url = m.group("url"); 544 | } 545 | } 546 | 547 | send("\n"); 548 | 549 | return aid + "\t" + url; 550 | } 551 | 552 | /** 553 | * toPostByNum 554 | * @param postNum 555 | * @throws Exception 556 | */ 557 | public Entry toEntryByNum(String boardName, int postNum) throws Exception { 558 | send(Integer.toString(postNum) + "\r\nhq"); 559 | this.refresh(200); 560 | if (expect("看板《" + boardName + "》[\\s\\S]*" + BoardFooter) != 0) { 561 | throw new Exception("Current screen is not \"Board\""); 562 | } 563 | return getFullEntryInfo(boardName); 564 | } 565 | 566 | /** 567 | * Go to post by ID 568 | * @param postID 569 | * @return 570 | * @throws Exception 571 | */ 572 | public Entry toEntryByID(String boardName, String postID) throws Exception { 573 | log.info("Go to AID: #"+ postID); 574 | send("#" + postID + "\r\nhq"); 575 | this.refresh(200); 576 | if (expect("看板《" + boardName + "》[\\s\\S]*" + BoardFooter) != 0) { 577 | throw new Exception("Current screen is not \"Board\""); 578 | } 579 | return getFullEntryInfo(boardName); 580 | } 581 | 582 | /** 583 | * 下載目前游標位置的貼文 584 | * @return 585 | * @throws Exception 586 | */ 587 | public String downloadCurrentPost() throws Exception { 588 | 589 | StringBuilder content = new StringBuilder(); 590 | int percent = -1; 591 | int fromLine = -1, toLine = -1; 592 | int fromLine_bk = -1, toLine_bk = 0; 593 | 594 | try { 595 | 596 | send("l\f"); 597 | 598 | while (true) { 599 | 600 | int midx = 
expect(PostFooter, "此頁內容會依閱讀者不同", "此文章無內容[\\s\\S]*按任意鍵繼續"); 601 | if (midx < 0) { 602 | log.warn("[Skip] Unexpected PostFooter"); 603 | break; 604 | } else if (midx == 1) { 605 | log.info("[Skip] 此頁內容會依閱讀者不同"); 606 | break; 607 | } else if (midx == 2) { 608 | log.info("Screen: 此文章無內容 [按任意鍵繼續]"); 609 | break; 610 | } 611 | 612 | String[] lines = beforeStr.split("\\n"); 613 | String footer = matchStr; 614 | 615 | Matcher matcher = PROGRESS_PATTERN.matcher(footer); 616 | if (!matcher.find()) { 617 | throw new Exception("Faild to match footer \"" + footer + "\""); 618 | } 619 | fromLine_bk = fromLine; 620 | 621 | percent = Integer.parseInt(matcher.group("percent")); 622 | fromLine = Integer.parseInt(matcher.group("from")); 623 | toLine = Integer.parseInt(matcher.group("to")); 624 | if (percent != 100 && fromLine == fromLine_bk) { 625 | Thread.sleep(50); 626 | continue; 627 | } 628 | 629 | //log.trace(String.format("%4d ~ %4d\t%3d%%", fromLine, toLine, percent)); 630 | 631 | // Append content 632 | int overlapLines = 0; 633 | if (fromLine <= toLine_bk) { 634 | overlapLines = toLine_bk - fromLine + 1; 635 | } 636 | for (int i=overlapLines; i<(lines.length-1); i++) { 637 | content.append(lines[i].trim()).append("\n"); 638 | } 639 | toLine_bk = toLine; 640 | 641 | // Next page or 100% break loop 642 | if (percent == 100) { 643 | break; 644 | } else { 645 | send((char) 0x06 + "\f"); 646 | } 647 | 648 | } 649 | 650 | send("q\f"); 651 | 652 | } catch (IOException e) { 653 | e.printStackTrace(); 654 | throw new Exception("下載貼文發生錯誤"); 655 | } 656 | 657 | return content.toString(); 658 | } 659 | 660 | public String getScreen() { 661 | StringBuilder sb = new StringBuilder(24 * 80); 662 | for (int i=0; i<24; i++) { 663 | for (int j=0; j<80; j++) { 664 | if (screen[i][j] != 0x00) { 665 | sb.append(screen[i][j]); 666 | } 667 | } 668 | sb.append("\n"); 669 | } 670 | return sb.toString(); 671 | } 672 | 673 | public int expect(Object... 
patterns) { 674 | return expect(DEFAULT_TIMEOUT, patterns); 675 | } 676 | 677 | public int expect(int timeout, Object... patterns) { 678 | ArrayList<Pattern> list = new ArrayList<Pattern>(); 679 | for (Object obj : patterns) { 680 | if (obj instanceof String) 681 | list.add(Pattern.compile((String) obj)); 682 | else if (obj instanceof Pattern) 683 | list.add((Pattern) obj); 684 | else { 685 | list.add(Pattern.compile(Pattern.quote(obj.toString()))); 686 | } 687 | } 688 | return expect(timeout, list); 689 | } 690 | 691 | public int expect(int timeout, List<Pattern> list) { 692 | 693 | long endTime = System.currentTimeMillis() + (long) timeout; 694 | 695 | while (true) { 696 | 697 | String currentScreen = getScreen(); 698 | 699 | for (int i = 0; i < list.size(); i++) { 700 | Matcher m = list.get(i).matcher(currentScreen); 701 | if (m.find()) { 702 | int matchStart = m.start(), matchEnd = m.end(); 703 | beforeStr = currentScreen.substring(0, matchStart); 704 | matchStr = m.group(); 705 | afterStr = currentScreen.substring(matchEnd); 706 | return i; 707 | } 708 | } 709 | 710 | long waitTime = endTime - System.currentTimeMillis(); 711 | if (waitTime <= 0) { 712 | return RETV_TIMEOUT; 713 | } 714 | 715 | try { 716 | Thread.sleep(100); 717 | } catch (InterruptedException e) { 718 | e.printStackTrace(); 719 | } 720 | 721 | } 722 | 723 | } 724 | 725 | public void refresh() throws IOException { 726 | refresh(300); 727 | } 728 | 729 | public void refresh(int waitTime) throws IOException { 730 | send("\f"); 731 | try { 732 | Thread.sleep(waitTime); 733 | } catch (InterruptedException e) { 734 | e.printStackTrace(); 735 | } 736 | } 737 | 738 | /** 739 | * 740 | * @throws KeyManagementException 741 | * @throws NoSuchAlgorithmException 742 | */ 743 | public static void enableSSLSocket() throws KeyManagementException, NoSuchAlgorithmException { 744 | HttpsURLConnection.setDefaultHostnameVerifier(new HostnameVerifier() { 745 | @Override 746 | public boolean verify(String hostname, 
SSLSession session) { 747 | return true; 748 | } 749 | }); 750 | 751 | SSLContext context = SSLContext.getInstance("TLS"); 752 | context.init(null, new X509TrustManager[]{new X509TrustManager() { 753 | public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException { 754 | } 755 | 756 | public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException { 757 | } 758 | 759 | public X509Certificate[] getAcceptedIssuers() { 760 | return new X509Certificate[0]; 761 | } 762 | }}, new SecureRandom()); 763 | HttpsURLConnection.setDefaultSSLSocketFactory(context.getSocketFactory()); 764 | } 765 | 766 | /** 767 | * Download post by URL (PTT Web Version) 768 | * @param url 769 | * @return 770 | * @throws IOException 771 | */ 772 | public static Post downloadPostByURL(String url) throws Exception { 773 | return downloadPostByURL(url, DEFAULT_TIMEOUT); 774 | } 775 | 776 | /** 777 | * Download post by URL (PTT Web Version) 778 | * @param url 779 | * @param timeout 780 | * @return 781 | * @throws Exception 782 | */ 783 | public static Post downloadPostByURL(String url, int timeout) throws Exception { 784 | 785 | Document doc = Jsoup.connect(url) 786 | .userAgent(UserAgent) 787 | .timeout(timeout) 788 | .cookie("over18", "1") 789 | .get(); 790 | 791 | return PostAnalysiser.parsePost(doc, url); 792 | 793 | } 794 | 795 | /** 796 | * Download post by URL with real time update (PTT Web Version) 797 | * @param url 798 | * @param timeout 799 | * @return 800 | * @throws Exception 801 | */ 802 | public static Post downloadPostByURLwithRU(String url, int timeout) throws Exception { 803 | 804 | Document doc = Jsoup.connect(url) 805 | .userAgent(UserAgent) 806 | .timeout(timeout) 807 | .cookie("over18", "1") 808 | .get(); 809 | 810 | try { 811 | 812 | Element pe = doc.getElementById("article-polling"); 813 | String pollUrl = "http://www.ptt.cc" + pe.attr("data-pollurl"); 814 | String longpollurl = "http://www.ptt.cc" + 
pe.attr("data-longpollurl"); 815 | 816 | String longpollJSON = Jsoup.connect(longpollurl) 817 | .userAgent(UserAgent) 818 | .timeout(5000) 819 | .ignoreContentType(true) 820 | .cookie("over18", "1") 821 | .execute() 822 | .body(); 823 | 824 | JSONObject obj = new JSONObject(longpollJSON); 825 | String size = obj.get("size").toString(); 826 | String sizeSig = obj.get("sig").toString(); 827 | 828 | String pollJSON = Jsoup.connect(pollUrl + "&size=" + size + "&size-sig=" + sizeSig) 829 | .userAgent(UserAgent) 830 | .timeout(3000) 831 | .ignoreContentType(true) 832 | .cookie("over18", "1") 833 | .execute() 834 | .body(); 835 | 836 | obj = new JSONObject(pollJSON); 837 | String contentHtml = obj.get("contentHtml").toString(); 838 | 839 | doc.getElementById("main-content").append(contentHtml); 840 | 841 | } catch (Exception e) { 842 | } 843 | 844 | return PostAnalysiser.parsePost(doc, url); 845 | 846 | } 847 | 848 | private void renderScreen() throws IOException { 849 | 850 | BufferedReader br = new BufferedReader( 851 | new InputStreamReader(is, "UTF-8")); 852 | PushbackReader pr = new PushbackReader(br, 128); 853 | 854 | int nc = 0; 855 | char[] cb = new char[4096]; 856 | 857 | while ((nc = pr.read(cb)) != -1) { 858 | 859 | if (isPrintSource) { 860 | System.out.print(new String(cb, 0, nc)); 861 | } 862 | 863 | for (int pos = 0; pos < nc; pos++) { 864 | 865 | char c = cb[pos]; 866 | 867 | switch (c) { 868 | case 0x08: // BS 869 | if (--posX < 0) { 870 | --posY; 871 | posX = 79; 872 | } 873 | continue; 874 | case 0x0A: // LF 875 | posY++; 876 | break; 877 | case 0x0D: // CR 878 | posX = 0; 879 | break; 880 | case 0x1B: // ESC 881 | int endPos = findEndPosOfVT100Conctrl(cb, nc, pos); 882 | if (endPos == -1) { 883 | pr.unread(cb, pos, nc - pos); 884 | pos = nc; 885 | } else { 886 | String ctrlStr = new String(cb, pos, endPos - pos + 1); 887 | Matcher matcher = VT100ControlPattern.matcher(ctrlStr); 888 | if (matcher.find()) { 889 | String code = matcher.group("code"); 890 | 
String type = matcher.group("type"); 891 | processVT100Conctrl(code, type); 892 | } else { 893 | log.error("Unknown VT100 Conctrl"); 894 | } 895 | pos = endPos; 896 | } 897 | break; 898 | default: 899 | 900 | if (c < 0x20 || c == 0x7F) { 901 | //System.out.printf("ASCII Conctrl: %c(%x)\n----------\n", ch, (int)(ch)); 902 | continue; 903 | } 904 | 905 | if (posX >= 0 && posY >= 0) { 906 | 907 | if (isHalfWidth(c)) { 908 | screen[posY][posX] = c; 909 | } else { 910 | screen[posY][posX] = c; 911 | if (posX < 79) { 912 | screen[posY][++posX] = 0x00; 913 | } 914 | } 915 | posX++; 916 | if (posX >= 80) { 917 | //posY++; 918 | posX = 79; 919 | } 920 | 921 | } 922 | } 923 | 924 | } 925 | 926 | if (isPrintScreen) { 927 | printScreen(); 928 | } 929 | 930 | } 931 | 932 | pr.close(); 933 | br.close(); 934 | 935 | } 936 | 937 | private int findEndPosOfVT100Conctrl(char[] cb, int nc, int pos) { 938 | int endPos = -1; 939 | for (int i = pos + 1; i < nc; i++) { 940 | char ec = cb[i]; 941 | if (ec == 'A' || ec == 'B' || ec == 'C' || ec == 'D' || ec == 'H' || 942 | ec == 'J' || ec == 'K' || ec == 'm' || ec == 's' || ec == 'u') { 943 | endPos = i; 944 | break; 945 | } 946 | } 947 | return endPos; 948 | } 949 | 950 | private void processVT100Conctrl(String code, String type) { 951 | 952 | switch (type) { 953 | case "m": 954 | break; 955 | case "H": 956 | if (code.equals("")) { 957 | // Cursor Home 958 | posY = posX = 0; 959 | } else { 960 | // Cursor to position 961 | final Pattern p = Pattern.compile("(?<Y>\\d+);(?<X>\\d+)"); 962 | Matcher m = p.matcher(code); 963 | if (m.find()) { 964 | posY = Integer.parseInt(m.group("Y")) - 1; 965 | posX = Integer.parseInt(m.group("X")) - 1; 966 | } 967 | } 968 | break; 969 | case "J": 970 | if (code.equals("2")) { 971 | // Erases the screen with the background colour and moves the cursor to home. 
972 | clearScreen(); 973 | posY = posX = 0; 974 | } else if (code.equals("1")) { 975 | // Erases the screen from the current line up to the top of the screen. 976 | for (int i=0; i<=posY; i++) { 977 | Arrays.fill(screen[i], ' '); 978 | } 979 | } else if (code.equals("")) { 980 | // Erases the screen from the current line down to the bottom of the screen. 981 | for (int i=posY; i<24; i++) { 982 | Arrays.fill(screen[i], ' '); 983 | } 984 | } 985 | break; 986 | case "K": 987 | if (code.equals("")) { 988 | // Erases from the current cursor position to the end of the current line. 989 | for (int i=posX; i<80; i++) { 990 | screen[posY][i] = ' '; 991 | } 992 | } else if (code.equals("1")) { 993 | // Erases from the current cursor position to the start of the current line. 994 | for (int i=0; i<posX; i++) { 995 | screen[posY][i] = ' '; 996 | } 997 | } else if (code.equals("2")) { 998 | // Erases the entire current line. 999 | for (int i=0; i<80; i++) { 1000 | screen[posY][i] = ' '; 1001 | } 1002 | } 1003 | break; 1004 | default: 1005 | // TODO 1006 | log.error("Un implement type: " + type); 1007 | } 1008 | 1009 | } 1010 | 1011 | private void send(String message) throws IOException { 1012 | os.write(message.getBytes()); 1013 | os.flush(); 1014 | } 1015 | 1016 | private boolean isHalfWidth(char c) { 1017 | return '\u0000' <= c && c <= '\u00FF' 1018 | || '\uFF61' <= c && c <= '\uFFDC' 1019 | || '\uFFE8' <= c && c <= '\uFFEE'; 1020 | } 1021 | 1022 | private void printScreen() { 1023 | 1024 | for (int i=0; i<24; i++) { 1025 | for (int j=0; j<80; j++) { 1026 | if (screen[i][j] != 0x00) { 1027 | System.out.print(screen[i][j]); 1028 | } 1029 | } 1030 | System.out.println(); 1031 | } 1032 | 1033 | } 1034 | 1035 | private void clearScreen() { 1036 | for (int i=0; i<72; i++) { 1037 | Arrays.fill(screen[i], (char) (' ')); 1038 | } 1039 | } 1040 | 1041 | } 1042 | --------------------------------------------------------------------------------