├── .gitignore ├── Makefile ├── README.md ├── clearenv.sh ├── conf ├── parsers │ ├── article.json │ ├── board.json │ └── section.json ├── seeds.json └── start.sh ├── main.go ├── newsconf └── seeds.json ├── parsers.go ├── rss └── rss.go ├── start.sh ├── start_news.sh ├── usage_cn.md ├── util.go ├── version └── version.go └── web.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | 8 | # Test binary, build with `go test -c` 9 | *.test 10 | 11 | # Output of the go coverage tool, specifically when used with LiteIDE 12 | *.out 13 | 14 | build 15 | data 16 | first.lock 17 | tld.cache 18 | zerolog 19 | .DS_Store 20 | 21 | .etlinks/ 22 | 23 | crawler 24 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | GO_CMD=go 2 | REPO_PATH=crawler.club/crawler 3 | GIT_SHA=`git rev-parse --short HEAD || echo "GitNotFound"` 4 | GO_LDFLAGS=-ldflags "-X ${REPO_PATH}/version.GitSHA=${GIT_SHA}" 5 | GO_BUILD=$(GO_CMD) build 6 | GO_CLEAN=$(GO_CMD) clean 7 | GO_TEST=$(GO_CMD) test 8 | GO_GET=$(GO_CMD) get 9 | 10 | BUILD_DIR=build 11 | 12 | all: linux darwin windows 13 | 14 | linux: 15 | GOOS=linux $(GO_BUILD) $(GO_LDFLAGS) -o "$(BUILD_DIR)/linux/crawler" "${REPO_PATH}" 16 | darwin: 17 | GOOS=darwin $(GO_BUILD) $(GO_LDFLAGS) -o "$(BUILD_DIR)/darwin/crawler" "${REPO_PATH}" 18 | windows: 19 | GOOS=windows $(GO_BUILD) $(GO_LDFLAGS) -o "$(BUILD_DIR)/windows/crawler.exe" "${REPO_PATH}" 20 | 21 | clean: 22 | $(GO_CLEAN) 23 | rm -fr $(BUILD_DIR) 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Crawler4U: A general-purpose focused crawler 2 | 3 | ## Overview 4 | Crawler4U is a general-purpose focused crawling and scraping tool driven by JSON-format configuration files. 5 | 6 | ## Install 7 | The Golang way: 8 | ```sh 9 | go get crawler.club/crawler 10 | ``` 11 | Or download the pre-built binaries from [releases](https://github.com/crawlerclub/crawler/releases) for your system. 12 | 13 | ## Build from source 14 | Building `crawler` from source requires a working Go development environment. Download Go [here](https://golang.org/dl/), then execute the following commands.
15 | 16 | ```sh 17 | go get -d crawler.club/crawler 18 | cd $GOPATH/src/crawler.club/crawler 19 | make 20 | ``` 21 | 22 | ## Usage 23 | See [usage_cn.md](usage_cn.md). 24 | 25 | ## Companies using crawler.club/crawler 26 | * [elensdata](https://www.elensdata.com/) 27 | * [huawei](https://www.huawei.com/) 28 | * [baidu](https://www.baidu.com) 29 | * [bytedance](https://www.bytedance.com/) 30 | * [zenia](https://www.zenia.ai/) 31 | -------------------------------------------------------------------------------- /clearenv.sh: -------------------------------------------------------------------------------- 1 | rm first.lock 2 | rm -fr ./data 3 | rm -fr .rsslinks 4 | rm -fr .etlinks 5 | -------------------------------------------------------------------------------- /conf/parsers/article.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "article", 3 | "example_url": "http://www.newsmth.net/nForum/article/AI/65703", 4 | "default_fields": true, 5 | "rules": { 6 | "root": [ 7 | { 8 | "type": "url", 9 | "key": "article", 10 | "xpath": "//div[@class='t-pre']//li/a/@href" 11 | }, 12 | { 13 | "type": "dom", 14 | "key": "posts", 15 | "xpath": "//table[contains(concat(' ', @class, ' '), ' article ')]" 16 | } 17 | ], 18 | "posts": [ 19 | { 20 | "type": "text", 21 | "key": "text", 22 | "xpath": ".//td[contains(concat(' ', @class, ' '), ' a-content ')]" 23 | }, 24 | { 25 | "type": "html", 26 | "key": "meta", 27 | "xpath": ".//td[contains(concat(' ', @class, ' '), ' a-content ')]", 28 | "re": [ 29 | "发信人:(?P<author>.+?)\\((?P<nick>.*?)\\).*?信区:(?P<board>.+?)
", 30 | "标  题:(?P.+?)<br/>", 31 | "发信站:(?P<site>.+?)\\((?P<time>.+?)\\)", 32 | "\\[FROM: (?P<ip>[\\d\\.\\*]+?)\\]" 33 | ] 34 | }, 35 | { 36 | "type": "text", 37 | "key": "floor", 38 | "xpath": ".//span[contains(@class, 'a-pos')]", 39 | "re": ["(\\d+|楼主)"], 40 | "js": "function process(s){if(s=='楼主') return '0'; return s;}" 41 | } 42 | ] 43 | }, 44 | "js": "" 45 | } 46 | -------------------------------------------------------------------------------- /conf/parsers/board.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "board", 3 | "example_url": "http://www.newsmth.net/nForum/board/Universal", 4 | "default_fields": true, 5 | "rules": { 6 | "root": [ 7 | { 8 | "type": "url", 9 | "key": "article", 10 | "xpath": "//tr[not(contains(@class, 'top ad'))]/td[2]/a" 11 | }, 12 | { 13 | "type": "url", 14 | "key": "board", 15 | "xpath": "//div[@class='t-pre']//li[@class='page-select']/following-sibling::li[1]/a" 16 | }, 17 | { 18 | "type": "text", 19 | "key": "time_", 20 | "xpath": "//tr[not(contains(@class, 'top'))][1]/td[8]" 21 | } 22 | ] 23 | }, 24 | "js": "" 25 | } 26 | -------------------------------------------------------------------------------- /conf/parsers/section.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "section", 3 | "example_url": "http://www.newsmth.net/nForum/section/1", 4 | "default_fields": true, 5 | "rules": { 6 | "root": [ 7 | { 8 | "type": "url", 9 | "key": "section", 10 | "xpath": "//tr[contains(td[2]/text(),'[二级目录]')]/td[1]/a" 11 | }, 12 | { 13 | "type": "url", 14 | "key": "board", 15 | "xpath": "//tr[not(contains(td[2]/text(),'[二级目录]'))]/td[1]/a" 16 | } 17 | ] 18 | }, 19 | "js": "" 20 | } 21 | -------------------------------------------------------------------------------- /conf/seeds.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "parser_name": "section", 4 | "url": "http://www.newsmth.net/nForum/section/1" 5 | }, 6 | { 7 | "parser_name": "section", 8 | "url": "http://www.newsmth.net/nForum/section/2" 9 | }, 10 | { 11 | "parser_name": "section", 12 | "url": "http://www.newsmth.net/nForum/section/3" 13 | }, 14 | { 15 | "parser_name": "section", 16 | "url": "http://www.newsmth.net/nForum/section/4" 17 | }, 18 | { 19 | "parser_name": "section", 20 | "url": "http://www.newsmth.net/nForum/section/5" 21 | }, 22 | { 23 | "parser_name": "section", 24 | "url": "http://www.newsmth.net/nForum/section/6" 25 | }, 26 | { 27 | "parser_name": "section", 28 | "url": "http://www.newsmth.net/nForum/section/7" 29 | }, 30 | { 31 | "parser_name": "section", 32 | "url": "http://www.newsmth.net/nForum/section/8" 33 | }, 34 | { 35 | "parser_name": "section", 36 | "url": "http://www.newsmth.net/nForum/section/9" 37 | }, 38 | { 39 | "parser_name": "section", 40 | "url": "http://www.newsmth.net/nForum/section/A" 41 | } 42 | ] 43 | -------------------------------------------------------------------------------- /conf/start.sh: -------------------------------------------------------------------------------- 1 | go get github.com/liuzl/httpserver 2 | httpserver -port :2002 3 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "flag" 6 | "fmt" 7 | "io/ioutil" 8 | "math/rand" 9 | "os" 10 | "os/signal" 11 | "path/filepath" 12 | "runtime" 13 | "strings" 14 | 
"sync" 15 | "syscall" 16 | "time" 17 | 18 | "crawler.club/crawler/version" 19 | "crawler.club/dl" 20 | "crawler.club/et" 21 | "github.com/golang/glog" 22 | "github.com/liuzl/store" 23 | "zliu.org/filestore" 24 | "zliu.org/goutil" 25 | "zliu.org/q" 26 | ) 27 | 28 | var ( 29 | dir = flag.String("dir", "data", "working dir") 30 | timeout = flag.Int64("timeout", 300, "in seconds") 31 | c = flag.Int("c", 1, "worker count") 32 | retry = flag.Int("retry", 5, "retry cnt") 33 | period = flag.Int("period", -1, "period in seconds") 34 | fs = flag.Bool("fs", true, "filestore flag") 35 | api = flag.Bool("api", false, "http api flag") 36 | proxy = flag.Bool("proxy", false, "use proxy or not") 37 | ua = flag.String("ua", "", "pc, mobile, google. Golang UA for empty") 38 | ) 39 | 40 | var crawlQueue, storeQueue *q.Queue 41 | var urlStore, dedupStore *store.LevelStore 42 | var fileStore *filestore.FileStore 43 | var once sync.Once 44 | 45 | func finish() { 46 | if crawlQueue != nil { 47 | crawlQueue.Close() 48 | } 49 | if storeQueue != nil { 50 | storeQueue.Close() 51 | } 52 | if urlStore != nil { 53 | urlStore.Close() 54 | } 55 | if dedupStore != nil { 56 | dedupStore.Close() 57 | } 58 | if fileStore != nil { 59 | fileStore.Close() 60 | } 61 | } 62 | 63 | func initTopics() (err error) { 64 | once.Do(func() { 65 | crawlDir := filepath.Join(*dir, "crawl") 66 | if crawlQueue, err = q.NewQueueWithRetryLimit(crawlDir, *retry); err != nil { 67 | glog.Error(err) 68 | return 69 | } 70 | storeDir := filepath.Join(*dir, "store") 71 | if storeQueue, err = q.NewQueue(storeDir); err != nil { 72 | glog.Error(err) 73 | return 74 | } 75 | dbDir := filepath.Join(*dir, "url") 76 | if urlStore, err = store.NewLevelStore(dbDir); err != nil { 77 | glog.Error(err) 78 | return 79 | } 80 | dedupDir := filepath.Join(*dir, "dedup") 81 | if dedupStore, err = store.NewLevelStore(dedupDir); err != nil { 82 | glog.Error(err) 83 | return 84 | } 85 | if *fs { 86 | fsDir := filepath.Join(*dir, "fs") 87 | if fileStore, err = filestore.NewFileStore(fsDir); err != nil { 88 | glog.Error(err) 89 | return 90 | } 91 | } 92 | if goutil.FileGuard("first.lock") { 93 | if err = initSeeds(); err != nil { 94 | return 95 | } 96 | } 97 | }) 98 | return 99 | } 100 | 101 | func initSeeds() error { 102 | seedsFile := filepath.Join(*conf, "seeds.json") 103 | content, err := ioutil.ReadFile(seedsFile) 104 | if err != nil { 105 | glog.Error(err) 106 | return err 107 | } 108 | var seeds []*et.UrlTask 109 | if err = json.Unmarshal(content, &seeds); err != nil { 110 | glog.Error(err) 111 | return err 112 | } 113 | glog.Infof("initSeeds %d seeds", len(seeds)) 114 | tz := time.Now().Format("200601020304") 115 | for _, seed := range seeds { 116 | seed.TaskName = tz 117 | b, _ := json.Marshal(seed) 118 | if err = crawlQueue.Enqueue(string(b)); err != nil { 119 | glog.Error(err) 120 | return err 121 | } 122 | } 123 | return nil 124 | } 125 | 126 | func stop(sigs chan os.Signal, exit chan bool) { 127 | <-sigs 128 | glog.Info("receive stop signal!") 129 | close(exit) 130 | } 131 | 132 | func work(i int, exit chan bool) { 133 | glog.Infof("start worker %d", i) 134 | for { 135 | select { 136 | case <-exit: 137 | glog.Infof("worker %d exit", i) 138 | return 139 | default: 140 | key, item, err := crawlQueue.Dequeue(*timeout) 141 | if err != nil { 142 | if err.Error() == "Queue is empty" { 143 | s := rand.Int()%20 + 5 144 | glog.Infof("queue is empty, worker %d sleep %d seconds", i, s) 145 | goutil.Sleep(time.Duration(s)*time.Second, exit) 146 | } else { 147 | 
glog.Error(err) 148 | } 149 | continue 150 | } 151 | task := new(et.UrlTask) 152 | if err = json.Unmarshal([]byte(item), task); err != nil { 153 | glog.Error(err) 154 | continue 155 | } 156 | 157 | var t time.Time 158 | bt, err := urlStore.Get(task.Url) 159 | if err == nil || err.Error() == "leveldb: not found" { 160 | t, _ = time.Parse(time.RFC3339, string(bt)) 161 | } 162 | 163 | var req = &dl.HttpRequest{Url: task.Url} 164 | if *ua == "pc" || *ua == "mobile" || *ua == "google" { 165 | req.Platform = *ua 166 | } 167 | if *proxy { 168 | req.UseProxy = true 169 | } 170 | resp := dl.Download(req) 171 | if resp.Error != nil { 172 | glog.Error(resp.Error) 173 | continue 174 | } 175 | items := strings.Split(resp.RemoteAddr, ":") 176 | ip := "" 177 | if len(items) > 0 { 178 | ip = items[0] 179 | } 180 | tasks, records, err := Parse(task, resp.Text, ip) 181 | if err != nil { 182 | glog.Error(err) 183 | continue 184 | } 185 | 186 | t2 := time.Now() 187 | for _, rec := range records { 188 | b, _ := json.Marshal(rec) 189 | if *fs { 190 | fileStore.WriteLine(b) 191 | } 192 | if err = storeQueue.Enqueue(string(b)); err != nil { 193 | glog.Error(err) 194 | } 195 | 196 | if rec["time_"] != nil { 197 | switch rec["time_"].(type) { 198 | case string: 199 | t2, _ = time.Parse(time.RFC3339, rec["time_"].(string)) 200 | } 201 | } 202 | } 203 | 204 | if t2.After(t) { 205 | for _, t := range tasks { 206 | if task.TaskName != "" { 207 | t.TaskName = task.TaskName 208 | } 209 | k := taskKey(t) 210 | if has, err := dedupStore.Has(k); has { 211 | continue 212 | } else if err != nil { 213 | glog.Error(err) 214 | } 215 | dedupStore.Put(k, nil) 216 | b, _ := json.Marshal(t) 217 | if err = crawlQueue.Enqueue(string(b)); err != nil { 218 | glog.Error(err) 219 | } 220 | } 221 | } 222 | 223 | if len(tasks) > 0 || len(records) > 0 { 224 | if err = crawlQueue.Confirm(key); err != nil { 225 | glog.Error(err) 226 | } 227 | urlStore.Put(task.Url, []byte(t2.UTC().Format(time.RFC3339))) 228 | } 229 | } 230 | } 231 | } 232 | 233 | func checkSeeds(exit chan bool) { 234 | defer glog.Info("checkSeeds exit") 235 | for { 236 | select { 237 | case <-exit: 238 | return 239 | default: 240 | goutil.Sleep(time.Duration(*period)*time.Second, exit) 241 | glog.Info("check seeds period") 242 | initSeeds() 243 | } 244 | } 245 | } 246 | 247 | func main() { 248 | flag.Parse() 249 | defer glog.Flush() 250 | 251 | fmt.Printf("Git SHA: %s\n", version.GitSHA) 252 | fmt.Printf("Go Version: %s\n", runtime.Version()) 253 | fmt.Printf("Go OS/Arch: %s/%s\n", runtime.GOOS, runtime.GOARCH) 254 | 255 | defer glog.Info("exit!") 256 | 257 | if err := initTopics(); err != nil { 258 | return 259 | } 260 | defer finish() 261 | 262 | exit := make(chan bool) 263 | sigs := make(chan os.Signal) 264 | signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) 265 | go stop(sigs, exit) 266 | 267 | if *period > 0 && *c > 0 { 268 | go checkSeeds(exit) 269 | } 270 | for i := 0; i < *c; i++ { 271 | go work(i, exit) 272 | } 273 | 274 | if *api { 275 | go web() 276 | } 277 | 278 | // wait exit signal 279 | select { 280 | case <-exit: 281 | return 282 | } 283 | } 284 | -------------------------------------------------------------------------------- /newsconf/seeds.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "parser_name": "link_", 4 | "url": "https://tech.sina.com.cn/", 5 | "ext": { 6 | "category": "Tech" 7 | } 8 | }, 9 | { 10 | "parser_name": "link_", 11 | "url": "https://new.qq.com/ch/tech/", 12 | "ext": { 13 | 
"category": "Tech" 14 | } 15 | }, 16 | { 17 | "parser_name": "link_", 18 | "url": "https://it.sohu.com/", 19 | "ext": { 20 | "category": "Tech" 21 | } 22 | } 23 | ] 24 | -------------------------------------------------------------------------------- /parsers.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "flag" 6 | "io/ioutil" 7 | "path/filepath" 8 | "strings" 9 | "sync" 10 | "time" 11 | 12 | "crawler.club/ce" 13 | "crawler.club/crawler/rss" 14 | "crawler.club/et" 15 | ) 16 | 17 | var ( 18 | conf = flag.String("conf", "./conf", "dir for parsers conf") 19 | ) 20 | 21 | type Parsers struct { 22 | sync.Mutex 23 | items map[string]*et.Parser 24 | } 25 | 26 | func (p *Parsers) GetParser(name string, refresh bool) (*et.Parser, error) { 27 | p.Lock() 28 | defer p.Unlock() 29 | if !refresh && p.items[name] != nil { 30 | return p.items[name], nil 31 | } 32 | file := filepath.Join(*conf, "parsers", name+".json") 33 | content, err := ioutil.ReadFile(file) 34 | if err != nil { 35 | return nil, err 36 | } 37 | parser := new(et.Parser) 38 | if err := json.Unmarshal(content, parser); err != nil { 39 | return nil, err 40 | } 41 | p.items[name] = parser 42 | return parser, nil 43 | } 44 | 45 | var pool = &Parsers{items: make(map[string]*et.Parser)} 46 | 47 | func Parse(task *et.UrlTask, page, ip string) ( 48 | []*et.UrlTask, []map[string]interface{}, error) { 49 | name := task.ParserName 50 | url := task.Url 51 | switch strings.ToLower(name) { 52 | case "rss_": 53 | feeds, err := rss.Parse(url, page, task.Ext) 54 | return nil, feeds, err 55 | case "content_": 56 | doc := ce.ParsePro(url, page, ip, false) 57 | return nil, []map[string]interface{}{map[string]interface{}{"doc": doc, "t": time.Now(), "ext": task.Ext}}, nil 58 | case "link_": 59 | links, err := et.ParseNewLinks(page, url) 60 | if err != nil { 61 | return nil, nil, err 62 | } 63 | var tasks []*et.UrlTask 64 | for _, link := range links { 65 | tasks = append(tasks, &et.UrlTask{ParserName: "content_", Url: link, Ext: task.Ext}) 66 | } 67 | return tasks, nil, nil 68 | default: 69 | p, err := pool.GetParser(name, false) 70 | if err != nil { 71 | return nil, nil, err 72 | } 73 | return p.Parse(page, url) 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /rss/rss.go: -------------------------------------------------------------------------------- 1 | package rss 2 | 3 | import ( 4 | "os" 5 | "path/filepath" 6 | "sync" 7 | "time" 8 | 9 | "github.com/liuzl/store" 10 | "github.com/xgolib/gofeed" 11 | ) 12 | 13 | var linkStore *store.LevelStore 14 | var once sync.Once 15 | 16 | func getLinkStore() *store.LevelStore { 17 | once.Do(func() { 18 | dir, err := filepath.Abs(filepath.Dir(os.Args[0])) 19 | if err != nil { 20 | panic(err) 21 | } 22 | linkStore, err = store.NewLevelStore(filepath.Join(dir, ".rsslinks")) 23 | if err != nil { 24 | panic(err) 25 | } 26 | }) 27 | return linkStore 28 | } 29 | 30 | func Parse(url, page string, ext interface{}) ([]map[string]interface{}, error) { 31 | fp := gofeed.NewParser() 32 | feed, err := fp.ParseString(page) 33 | if err != nil { 34 | return nil, err 35 | } 36 | var ret []map[string]interface{} 37 | for _, item := range feed.Items { 38 | has, err := getLinkStore().Has(item.Link) 39 | if err != nil { 40 | return nil, err 41 | } 42 | if has { 43 | continue 44 | } 45 | //glog.Info("add ", item.Link) 46 | getLinkStore().Put(item.Link, 
[]byte(time.Now().UTC().Format(time.RFC3339))) 47 | ret = append(ret, map[string]interface{}{"feed": item, "ext": ext}) 48 | } 49 | return ret, nil 50 | } 51 | -------------------------------------------------------------------------------- /start.sh: -------------------------------------------------------------------------------- 1 | go build 2 | ./crawler -logtostderr -api -period 30 3 | -------------------------------------------------------------------------------- /start_news.sh: -------------------------------------------------------------------------------- 1 | go build 2 | ./crawler -logtostderr -api -period 30 -conf ./newsconf 3 | -------------------------------------------------------------------------------- /usage_cn.md: -------------------------------------------------------------------------------- 1 | # Crawler Usage 2 | 3 | ## Instructions 4 | 5 | ### Install a Go development environment (or skip this and download a release package directly). 6 | 7 | ### Install `crawler.club/crawler` with go. 8 | 9 | ```sh 10 | go install crawler.club/crawler 11 | crawler --help 12 | ``` 13 | 14 | ### Main flags 15 | * `-api` enable the HTTP API for retrieving data 16 | * `-addr` HTTP service address for the status, data and other endpoints, defaults to `:2001` 17 | * `-fs` enable local file storage, on by default 18 | * `-dir` working directory, defaults to `./data`; crawled files are stored under `./data/fs/` 19 | * `-c` number of workers, defaults to `1` 20 | * `-period` crawl period in seconds; the default `-1` means crawl only once 21 | 22 | ### Main HTTP endpoints 23 | * Peek at the head of the data queue (without removing it from the queue) 24 | ``` 25 | GET http://localhost:2001/api/data?peek=true 26 | ``` 27 | * Take data from the queue (it is removed from the queue once taken) 28 | ``` 29 | GET http://localhost:2001/api/data 30 | ``` 31 | * Check crawler status 32 | ``` 33 | GET http://localhost:2001/api/status 34 | ``` 35 | ### Other notes 36 | After the program has run, two hidden directories, `.rsslinks` and `.etlinks`, are created under the startup directory. They deduplicate links for `rss`-type and `web`-type crawls respectively, so the same link is not fetched twice. 37 | 38 | `web`-type crawls extract new article links by filtering against `.etlinks`. The underlying assumption is the pattern of discovering new article pages from list pages: list pages are fetched periodically, all links on them are extracted, and any link never seen before is treated as a new article link. Navigation links, ad links and the like are fetched during the first round, so from the second round onward the remaining new links should all be new articles. 39 | 40 | -------------------------------------------------------------------------------- /util.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | 6 | "crawler.club/et" 7 | "zliu.org/goutil" 8 | ) 9 | 10 | func taskKey(t *et.UrlTask) string { 11 | if t == nil { 12 | return "" 13 | } 14 | return fmt.Sprintf("%s\t%s\t%s", 15 | goutil.ReverseOrderEncode(t.TaskName), t.Url, t.ParserName) 16 | } 17 | -------------------------------------------------------------------------------- /version/version.go: -------------------------------------------------------------------------------- 1 | package version 2 | 3 | var ( 4 | GitSHA = "Not provided (use make instead of go build)" 5 | ) 6 | -------------------------------------------------------------------------------- /web.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "flag" 6 | "io/ioutil" 7 | "net/http" 8 | "strings" 9 | "time" 10 | 11 | "crawler.club/et" 12 | "github.com/golang/glog" 13 | "zliu.org/goutil/rest" 14 | ) 15 | 16 | var ( 17 | addr = flag.String("addr", ":2001", "rest address") 18 | ) 19 | 20 | func StatusHandler(w http.ResponseWriter, r *http.Request) { 21 | glog.Infof("addr=%s method=%s host=%s uri=%s", 22 | r.RemoteAddr, r.Method, r.Host, r.RequestURI) 23 | ret := map[string]interface{}{ 24 | "crawl": crawlQueue.Status(), 25 | "store": storeQueue.Status(), 26 | } 27 | rest.MustEncode(w, rest.RestMessage{"OK", ret}) 28 | } 29 | 30 | func AddTaskHandler(w http.ResponseWriter, r *http.Request) { 31 | glog.Infof("addr=%s method=%s host=%s uri=%s", 32 | r.RemoteAddr, r.Method, r.Host,
r.RequestURI) 33 | r.ParseForm() 34 | b, err := ioutil.ReadAll(r.Body) 35 | if err != nil { 36 | rest.MustEncode(w, rest.RestMessage{"ERROR", err.Error()}) 37 | return 38 | } 39 | var task = new(et.UrlTask) 40 | if err = json.Unmarshal(b, task); err != nil { 41 | rest.MustEncode(w, rest.RestMessage{"ERROR", err.Error()}) 42 | return 43 | } 44 | task.TaskName = time.Now().Format("200601020304") 45 | k := taskKey(task) 46 | if has, err := dedupStore.Has(k); has { 47 | rest.MustEncode(w, rest.RestMessage{"DUP", k}) 48 | return 49 | } else if err != nil { 50 | rest.MustEncode(w, rest.RestMessage{"ERROR", err.Error()}) 51 | return 52 | } 53 | dedupStore.Put(k, nil) 54 | b, _ = json.Marshal(task) 55 | if err = crawlQueue.Enqueue(string(b)); err != nil { 56 | rest.MustEncode(w, rest.RestMessage{"ERROR", err.Error()}) 57 | return 58 | } 59 | rest.MustEncode(w, rest.RestMessage{"OK", k}) 60 | } 61 | 62 | func DataHandler(w http.ResponseWriter, r *http.Request) { 63 | glog.Infof("addr=%s method=%s host=%s uri=%s", 64 | r.RemoteAddr, r.Method, r.Host, r.RequestURI) 65 | r.ParseForm() 66 | peek := strings.ToLower(strings.TrimSpace(r.FormValue("peek"))) 67 | var ret string 68 | var err error 69 | if peek == "true" { 70 | ret, err = storeQueue.Peek() 71 | } else { 72 | _, ret, err = storeQueue.Dequeue(-1) 73 | } 74 | if err != nil { 75 | rest.MustEncode(w, rest.RestMessage{"ERROR", err.Error()}) 76 | return 77 | } 78 | w.Write([]byte(ret)) 79 | } 80 | 81 | func web() { 82 | if crawlQueue == nil || dedupStore == nil { 83 | glog.Error("topics did not init, can't start web server") 84 | return 85 | } 86 | http.Handle("/api/addtask", rest.WithLog(AddTaskHandler)) 87 | http.Handle("/api/status", rest.WithLog(StatusHandler)) 88 | http.Handle("/api/data", rest.WithLog(DataHandler)) 89 | glog.Info("rest server listen on", *addr) 90 | glog.Error(http.ListenAndServe(*addr, nil)) 91 | } 92 | --------------------------------------------------------------------------------
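
As a quick reference, here is a minimal sketch of a session against the HTTP API defined in `web.go`, assuming the crawler is running locally with `-api` (as in `start.sh`) on the default `-addr` of `:2001`. The task body reuses the `parser_name`/`url`/`ext` fields from `newsconf/seeds.json`; the exact response payloads depend on what has been crawled.

```sh
# Submit a new crawl task; AddTaskHandler unmarshals the POST body into an et.UrlTask
# and enqueues it after deduplication.
curl -X POST http://localhost:2001/api/addtask \
  -d '{"parser_name": "link_", "url": "https://tech.sina.com.cn/", "ext": {"category": "Tech"}}'

# Peek at the next stored record without removing it from the store queue.
curl 'http://localhost:2001/api/data?peek=true'

# Take the next record (it is dequeued) and check queue status.
curl http://localhost:2001/api/data
curl http://localhost:2001/api/status
```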