├── .gitignore
├── LICENSE
├── README.en.md
├── README.md
├── crawler
│   ├── charset.go
│   ├── config.go
│   ├── main.go
│   ├── page_parser.go
│   ├── request.go
│   ├── task_queue.go
│   ├── transform.go
│   └── util.go
├── doc
│   ├── golang写入文件失败, invalid arguement.md
│   ├── golang协程池.md
│   ├── goquery页面编码处理(二)-HTML字符实体.md
│   ├── goquery页面编码处理.md
│   ├── sqlite3并发写入.md
│   └── src
│       └── screenshot.jpg
├── docker-compose.yml
├── docker
│   ├── nginx.conf
│   └── readme.md
├── main.go
├── model
│   ├── model.go
│   └── url_record.go
└── util
    ├── go_pool.go
    └── log.go
/.gitignore:
--------------------------------------------------------------------------------
1 | sites
2 | site.db
3 | site.db-journal
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 general
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.en.md:
--------------------------------------------------------------------------------
1 | # site-mirror-go
2 |
3 | #### Description
4 | A website mirroring tool that can crawl all of a site's resources for local browsing.
5 |
6 | #### Software Architecture
7 | Software architecture description
8 |
9 | #### Installation
10 |
11 | 1. xxxx
12 | 2. xxxx
13 | 3. xxxx
14 |
15 | #### Instructions
16 |
17 | 1. xxxx
18 | 2. xxxx
19 | 3. xxxx
20 |
21 | #### Contribution
22 |
23 | 1. Fork the repository
24 | 2. Create Feat_xxx branch
25 | 3. Commit your code
26 | 4. Create Pull Request
27 |
28 |
29 | #### Gitee Feature
30 |
31 | 1. You can use Readme\_XXX.md to support different languages, such as Readme\_en.md, Readme\_zh.md
32 | 2. Gitee blog [blog.gitee.com](https://blog.gitee.com)
33 | 3. Explore open source project [https://gitee.com/explore](https://gitee.com/explore)
34 | 4. The most valuable open source project [GVP](https://gitee.com/gvp)
35 | 5. The manual of Gitee [https://gitee.com/help](https://gitee.com/help)
36 | 6. The most popular members [https://gitee.com/gitee-stars/](https://gitee.com/gitee-stars/)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # site-mirror-go
2 |
3 | This is a general-purpose crawler and whole-site download tool. It downloads all of a site's resources, including pages, images, CSS stylesheets and JS files, and stores them in a specified local directory.
4 | 
5 | Features:
6 | 
7 | 1. Configurable crawl depth (0 means unlimited, 1 means only the single start page)
8 | 2. Config options to skip downloading images, CSS, JS or font resources
9 | 3. A blacklist to block resources at specified links
10 | 
11 | Once a crawl is finished, you can start an nginx container with the `docker-compose.yml` in this repository and browse the mirror locally (see the usage example at the end of this document).
12 | 
13 | Note: this tool can only download static pages. It cannot handle content loaded dynamically via JS (bilibili, for example), so it is generally limited to article, image and news sites.
14 | 
15 | ------
16 | 
17 | For the equivalent Python version, see
18 | 
19 | - [site-mirror-py github](https://github.com/generals-space/site-mirror-py)
20 | - [site-mirror-py Gitee](https://gitee.com/generals-space/site-mirror-py)
21 | 
22 | The implementation logic is the same.
23 | 
24 | Go version: 1.11.1+
25 | 
26 | ------
27 | 
28 | Screenshot
29 |
30 | 
--------------------------------------------------------------------------------
/crawler/charset.go:
--------------------------------------------------------------------------------
1 | package crawler
2 |
3 | import (
4 | "bytes"
5 | "html"
6 | "io/ioutil"
7 | "strings"
8 |
9 | "golang.org/x/text/encoding"
10 | "golang.org/x/text/encoding/simplifiedchinese"
11 | "golang.org/x/text/encoding/traditionalchinese"
12 | "golang.org/x/text/encoding/unicode"
13 | "golang.org/x/text/transform"
14 | )
15 |
16 | // CharsetMap maps charset names to encodings
17 | var CharsetMap = map[string]encoding.Encoding{
18 | "utf-8": unicode.UTF8,
19 | "gbk": simplifiedchinese.GBK,
20 | "gb2312": simplifiedchinese.GB18030,
21 | "gb18030": simplifiedchinese.GB18030,
22 | "big5": traditionalchinese.Big5,
23 | }
24 |
25 | // HTMLCharacterEntitiesMap maps characters back to their HTML entities
26 | var HTMLCharacterEntitiesMap = map[string]string{
27 | "\u00a0": "&nbsp;",
28 | "©": "&copy;",
29 | "®": "&reg;",
30 | "™": "&trade;",
31 | "¢": "&cent;",
32 | "£": "&pound;",
33 | "¥": "&yen;",
34 | "€": "&euro;",
35 | "§": "&sect;",
36 | }
37 |
38 | // ReplaceHTMLCharacterEntities restores the page's HTML character entities, so that writing the file does not hit characters the target encoding cannot represent
39 | func ReplaceHTMLCharacterEntities(input string, charset encoding.Encoding) (output string) {
40 | if charset == unicode.UTF8 {
41 | output = input
42 | return
43 | }
44 | output = html.UnescapeString(input)
45 | for char, entity := range HTMLCharacterEntitiesMap {
46 | output = strings.Replace(output, char, entity, -1)
47 | }
48 | return
49 | }
50 |
51 | // DecodeToUTF8 decodes the input byte slice from the given charset and returns the corresponding UTF-8 content.
52 | func DecodeToUTF8(input []byte, charset encoding.Encoding) (output []byte, err error) {
53 | if charset == unicode.UTF8 {
54 | output = input
55 | return
56 | }
57 | reader := transform.NewReader(bytes.NewReader(input), charset.NewDecoder())
58 | output, err = ioutil.ReadAll(reader)
59 | if err != nil {
60 | return
61 | }
62 | return
63 | }
64 |
65 | // EncodeFromUTF8 encodes the input UTF-8 byte slice into the given charset and returns the result
66 | func EncodeFromUTF8(input []byte, charset encoding.Encoding) (output []byte, err error) {
67 | if charset == unicode.UTF8 {
68 | output = input
69 | return
70 | }
71 | reader := transform.NewReader(bytes.NewReader(input), encoding.ReplaceUnsupported(charset.NewEncoder()))
72 | output, err = ioutil.ReadAll(reader)
73 | if err != nil {
74 | return
75 | }
76 | return
77 | }
78 |
--------------------------------------------------------------------------------
/crawler/config.go:
--------------------------------------------------------------------------------
1 | package crawler
2 |
3 | // Config ...
4 | type Config struct {
5 | // Maximum number of links a single page is expected to contain.
6 | // PageQueueSize is computed as LinkRatioInSinglePage*PageWorkerCount,
7 | // to avoid deadlock caused by workers blocking on a full queue,
8 | // though recursive crawling may still fill the queue up
9 | LinkRatioInSinglePage int
10 | PageWorkerCount int
11 | AssetWorkerCount int
12 |
13 | SiteDBPath string
14 | SitePath string
15 |
16 | StartPage string
17 | MainSite string
18 | UserAgent string
19 | // Crawl depth, counted from 1; crawling stops at level N.
20 | // 1 means only the start page, 0 means unlimited
21 | MaxDepth int
22 | // Maximum number of retries when a request fails (timeouts count as failures)
23 | MaxRetryTimes int
24 |
25 | OutsiteAsset bool
26 | NoJs bool
27 | NoCSS bool
28 | NoImages bool
29 | NoFonts bool
30 | BlackList []string
31 | }
32 |
33 | // NewConfig returns the default configuration
34 | func NewConfig() (config *Config) {
35 | config = &Config{
36 | LinkRatioInSinglePage: 5000,
37 | PageWorkerCount: 10,
38 | AssetWorkerCount: 10,
39 |
40 | SiteDBPath: "site.db",
41 | SitePath: "sites",
42 |
43 | OutsiteAsset: true,
44 | NoJs: true,
45 | NoCSS: false,
46 | NoImages: false,
47 | NoFonts: false,
48 | BlackList: []string{},
49 | }
50 |
51 | return
52 | }
53 |
--------------------------------------------------------------------------------
/crawler/main.go:
--------------------------------------------------------------------------------
1 | package crawler
2 |
3 | import (
4 | "bytes"
5 | "io/ioutil"
6 | "net/http"
7 | "net/url"
8 | "strings"
9 | "sync"
10 |
11 | "github.com/PuerkitoBio/goquery"
12 | "github.com/jinzhu/gorm"
13 |
14 | "gitee.com/generals-space/site-mirror-go.git/model"
15 | "gitee.com/generals-space/site-mirror-go.git/util"
16 | )
17 |
18 | var logger *util.Logger
19 |
20 | // Crawler ...
21 | type Crawler struct {
22 | PageQueue chan *model.URLRecord // page task queue
23 | AssetQueue chan *model.URLRecord // static asset task queue
24 |
25 | Config *Config
26 | DBClient *gorm.DB
27 | DBClientMutex *sync.Mutex
28 | }
29 |
30 | // NewCrawler creates a Crawler instance
31 | func NewCrawler(config *Config, _logger *util.Logger) (crawler *Crawler, err error) {
32 | logger = _logger
33 | pageQueue := make(chan *model.URLRecord, config.PageWorkerCount*config.LinkRatioInSinglePage)
34 | assetQueue := make(chan *model.URLRecord, config.AssetWorkerCount*config.LinkRatioInSinglePage)
35 | urlObj, err := url.Parse(config.StartPage)
36 | if err != nil {
37 | logger.Errorf("解析起始地址失败: url: %s, %s", config.StartPage, err.Error())
38 | return
39 | }
40 | mainSite := urlObj.Host // Host includes the port if present.
41 | config.MainSite = mainSite
42 |
43 | dbClient, err := model.GetDB(config.SiteDBPath)
44 | if err != nil {
45 | logger.Errorf("初始化数据库失败: site db: %s, %s", config.SiteDBPath, err.Error())
46 | return
47 | }
48 | crawler = &Crawler{
49 | PageQueue: pageQueue,
50 | AssetQueue: assetQueue,
51 |
52 | Config: config,
53 | DBClient: dbClient,
54 | DBClientMutex: &sync.Mutex{},
55 | }
56 |
57 | err = crawler.LoadTaskQueue()
58 | if err != nil {
59 | logger.Errorf("加载任务队列失败: %s", err.Error())
60 | return
61 | }
62 | return
63 | }
64 |
65 | // Start enqueues the start page and launches the page and asset worker goroutines
66 | func (crawler *Crawler) Start() {
67 | req := &model.URLRecord{
68 | URL: crawler.Config.StartPage,
69 | URLType: model.URLTypePage,
70 | Refer: "",
71 | Depth: 1,
72 | FailedTimes: 0,
73 | }
74 | crawler.EnqueuePage(req)
75 |
76 | for i := 0; i < crawler.Config.PageWorkerCount; i++ {
77 | go crawler.GetHTMLPage(i)
78 | }
79 | for i := 0; i < crawler.Config.AssetWorkerCount; i++ {
80 | go crawler.GetStaticAsset(i)
81 | }
82 | }
83 |
84 | // getAndRead performs the request for a page or static asset and returns the response body and header.
85 | func (crawler *Crawler) getAndRead(req *model.URLRecord) (body []byte, header http.Header, err error) {
86 | crawler.DBClientMutex.Lock()
87 | err = model.UpdateURLRecordStatus(crawler.DBClient, req.URL, model.URLTaskStatusPending)
88 | crawler.DBClientMutex.Unlock()
89 | if err != nil {
90 | logger.Infof("更新任务队列记录失败: req: %+v, error: %s", req, err.Error())
91 | return
92 | }
93 |
94 | if req.FailedTimes > crawler.Config.MaxRetryTimes {
95 | logger.Infof("失败次数过多, 不再尝试: req: %+v", req)
96 | return
97 | }
98 |
99 | if req.URLType == model.URLTypePage && 0 < crawler.Config.MaxDepth && crawler.Config.MaxDepth < req.Depth {
100 | logger.Infof("当前页面已达到最大深度, 不再抓取: req: %+v", req)
101 | return
102 | }
103 |
104 | resp, err := getURL(req.URL, req.Refer, crawler.Config.UserAgent)
105 | if err != nil {
106 | logger.Errorf("请求失败, 重新入队列: req: %+v, error: %s", req, err.Error())
107 | req.FailedTimes++
108 | if req.URLType == model.URLTypePage {
109 | crawler.EnqueuePage(req)
110 | } else {
111 | crawler.EnqueueAsset(req)
112 | }
113 | return
114 | } else if resp.StatusCode == 404 {
115 | // Failures are usually 5xx or 403/405; a 404 is not worth retrying, so give up immediately
116 | crawler.DBClientMutex.Lock()
117 | err = model.UpdateURLRecordStatus(crawler.DBClient, req.URL, model.URLTaskStatusFailed)
118 | crawler.DBClientMutex.Unlock()
119 | if err != nil {
120 | logger.Errorf("更新任务记录状态失败: req: %+v, error: %s", req, err.Error())
121 | }
122 | return
123 | }
124 | defer resp.Body.Close()
125 |
126 | header = resp.Header
127 | body, err = ioutil.ReadAll(resp.Body)
128 |
129 | return
130 | }
131 |
132 | // GetHTMLPage is a worker goroutine: it takes tasks from the page queue, fetches and parses html pages
133 | func (crawler *Crawler) GetHTMLPage(num int) {
134 | for req := range crawler.PageQueue {
135 | logger.Infof("取得页面任务: %+v", req)
136 |
137 | respBody, _, err := crawler.getAndRead(req)
138 | if err != nil || respBody == nil { continue } // fetch failed or was skipped (max retries/depth)
139 | // charset handling
140 | charsetName, err := getPageCharset(respBody)
141 | if err != nil {
142 | logger.Errorf("failed to detect page charset: req: %+v, error: %s", req, err.Error())
143 | continue
144 | }
145 | charsetName = strings.ToLower(charsetName)
146 | logger.Debugf("当前页面编码: %s, req: %+v", charsetName, req)
147 | charset, exist := CharsetMap[charsetName]
148 | if !exist {
149 | logger.Debugf("未找到匹配的编码: req: %+v, error: %s", req, err.Error())
150 | continue
151 | }
152 | utf8Content, err := DecodeToUTF8(respBody, charset)
153 | if err != nil {
154 | logger.Errorf("页面解码失败: req: %+v, error: %s", req, err.Error())
155 | continue
156 | }
157 | utf8Reader := bytes.NewReader(utf8Content)
158 | htmlDom, err := goquery.NewDocumentFromReader(utf8Reader)
159 | if err != nil {
160 | logger.Errorf("生成dom树失败: req: %+v, error: %s", req, err.Error())
161 | continue
162 | }
163 |
164 | logger.Debugf("准备进行页面解析: req: %+v", req)
165 |
166 | if 0 < crawler.Config.MaxDepth && crawler.Config.MaxDepth < req.Depth+1 {
167 | logger.Infof("当前页面已达到最大深度, 不再解析新页面: %+v", req)
168 | } else {
169 | crawler.ParseLinkingPages(htmlDom, req)
170 | }
171 | crawler.ParseLinkingAssets(htmlDom, req)
172 |
173 | logger.Debugf("页面解析完成, 准备写入本地文件: req: %+v", req)
174 |
175 | htmlString, err := htmlDom.Html()
176 | if err != nil {
177 | logger.Errorf("获取页面Html()值失败: req: %+v, error: %s", req, err.Error())
178 | continue
179 | }
180 | htmlString = ReplaceHTMLCharacterEntities(htmlString, charset)
181 | fileContent, err := EncodeFromUTF8([]byte(htmlString), charset)
182 | if err != nil {
183 | logger.Errorf("页面编码失败: req: %+v, error: %s", req, err.Error())
184 | continue
185 | }
186 | fileDir, fileName, err := TransToLocalPath(crawler.Config.MainSite, req.URL, model.URLTypePage)
187 | if err != nil {
188 | logger.Errorf("转换为本地链接失败: req: %+v, error: %s", req, err.Error())
189 | continue
190 | }
191 | err = WriteToLocalFile(crawler.Config.SitePath, fileDir, fileName, fileContent)
192 | if err != nil {
193 | logger.Errorf("写入文件失败: req: %+v, error: %s", req, err.Error())
194 | continue
195 | }
196 |
197 | logger.Debugf("页面任务写入本地文件成功: req: %+v", req)
198 |
199 | crawler.DBClientMutex.Lock()
200 | err = model.UpdateURLRecordStatus(crawler.DBClient, req.URL, model.URLTaskStatusSuccess)
201 | crawler.DBClientMutex.Unlock()
202 | if err != nil {
203 | logger.Errorf("更新任务记录状态失败: req: %+v, error: %s", req, err.Error())
204 | continue
205 | }
206 | logger.Debugf("页面任务完成: req: %+v", req)
207 | }
208 | }
209 |
210 | // GetStaticAsset is a worker goroutine: it takes tasks from the asset queue, fetches static assets and stores them
211 | func (crawler *Crawler) GetStaticAsset(num int) {
212 | for req := range crawler.AssetQueue {
213 | logger.Infof("取得静态资源任务: %+v", req)
214 |
215 | respBody, respHeader, err := crawler.getAndRead(req)
216 | if err != nil || respBody == nil { continue } // fetch failed or was skipped (max retries)
217 | // If it is a css file, parse the links inside it; otherwise store it as-is.
218 | field, exist := respHeader["Content-Type"]
219 | if exist && strings.HasPrefix(field[0], "text/css") { // Content-Type may carry a charset suffix
220 | respBody, err = crawler.parseCSSFile(respBody, req)
221 | if err != nil {
222 | logger.Errorf("解析css文件失败: req: %+v, error: %s", req, err.Error())
223 | continue
224 | }
225 | }
226 | fileDir, fileName, err := TransToLocalPath(crawler.Config.MainSite, req.URL, model.URLTypeAsset)
227 | if err != nil {
228 | logger.Errorf("转换为本地链接失败: req: %+v, error: %s", req, err.Error())
229 | continue
230 | }
231 |
232 | err = WriteToLocalFile(crawler.Config.SitePath, fileDir, fileName, respBody)
233 | if err != nil {
234 | logger.Errorf("写入文件失败: req: %+v, error: %s", req, err.Error())
235 | continue
236 | }
237 | logger.Debugf("静态资源任务写入本地文件成功: req: %+v", req)
238 |
239 | crawler.DBClientMutex.Lock()
240 | err = model.UpdateURLRecordStatus(crawler.DBClient, req.URL, model.URLTaskStatusSuccess)
241 | crawler.DBClientMutex.Unlock()
242 | if err != nil {
243 | logger.Errorf("更新任务记录状态失败: req: %+v, error: %s", req, err.Error())
244 | continue
245 | }
246 | logger.Debugf("静态资源任务完成: req: %+v", req)
247 | }
248 | }
249 |
--------------------------------------------------------------------------------
/crawler/page_parser.go:
--------------------------------------------------------------------------------
1 | package crawler
2 |
3 | import (
4 | "strings"
5 |
6 | "gitee.com/generals-space/site-mirror-go.git/model"
7 | "github.com/PuerkitoBio/goquery"
8 | )
9 |
10 | // ParseLinkingPages parses and rewrites the page links in the document, such as a and iframe elements
11 | func (crawler *Crawler) ParseLinkingPages(htmlDom *goquery.Document, req *model.URLRecord) {
12 | aList := htmlDom.Find("a")
13 | crawler.parseLinkingPages(aList, req, "href")
14 | }
15 |
16 | // parseLinkingPages iterates over the selected nodes, enqueues the parsed links, and rewrites each node's link attribute.
17 | func (crawler *Crawler) parseLinkingPages(nodeList *goquery.Selection, req *model.URLRecord, attrName string) {
18 | // nodeList.Nodes holds the elements matched by the current selector
19 | nodeList.Each(func(i int, nodeItem *goquery.Selection) {
20 | subURL, exist := nodeItem.Attr(attrName)
21 | if !exist || emptyLinkPattern.MatchString(subURL) {
22 | return
23 | }
24 |
25 | fullURL, fullURLWithoutFrag := joinURL(req.URL, subURL)
26 | if !URLFilter(fullURL, model.URLTypePage, crawler.Config) {
27 | return
28 | }
29 | localLink, err := TransToLocalLink(crawler.Config.MainSite, fullURL, model.URLTypePage)
30 | if err != nil {
31 | return
32 | }
33 | nodeItem.SetAttr(attrName, localLink)
34 |
35 | // enqueue the new task
36 | req := &model.URLRecord{
37 | URL: fullURLWithoutFrag,
38 | URLType: model.URLTypePage,
39 | Refer: req.URL,
40 | Depth: req.Depth + 1,
41 | }
42 | crawler.EnqueuePage(req)
43 | })
44 | }
45 |
46 | // ParseLinkingAssets parses and rewrites static asset links in the page, such as js, css and img elements
47 | func (crawler *Crawler) ParseLinkingAssets(htmlDom *goquery.Document, req *model.URLRecord) {
48 | linkList := htmlDom.Find("link")
49 | crawler.parseLinkingAssets(linkList, req, "href")
50 |
51 | scriptList := htmlDom.Find("script")
52 | crawler.parseLinkingAssets(scriptList, req, "src")
53 |
54 | imgList := htmlDom.Find("img")
55 | crawler.parseLinkingAssets(imgList, req, "src")
56 |
57 | videoList := htmlDom.Find("video")
58 | crawler.parseLinkingAssets(videoList, req, "src")
59 |
60 | audioList := htmlDom.Find("audio")
61 | crawler.parseLinkingAssets(audioList, req, "src")
62 | }
63 |
64 | func (crawler *Crawler) parseLinkingAssets(nodeList *goquery.Selection, req *model.URLRecord, attrName string) {
65 | // nodeList.Nodes holds the elements matched by the current selector
66 | nodeList.Each(func(i int, nodeItem *goquery.Selection) {
67 | subURL, exist := nodeItem.Attr(attrName)
68 | if !exist || emptyLinkPattern.MatchString(subURL) {
69 | return
70 | }
71 |
72 | fullURL, fullURLWithoutFrag := joinURL(req.URL, subURL)
73 | if !URLFilter(fullURL, model.URLTypeAsset, crawler.Config) {
74 | return
75 | }
76 | localLink, err := TransToLocalLink(crawler.Config.MainSite, fullURL, model.URLTypeAsset)
77 | if err != nil {
78 | return
79 | }
80 | nodeItem.SetAttr(attrName, localLink)
81 |
82 | // enqueue the new task
83 | req := &model.URLRecord{
84 | URL: fullURLWithoutFrag,
85 | URLType: model.URLTypeAsset,
86 | Refer: req.URL,
87 | Depth: req.Depth + 1,
88 | }
89 | crawler.EnqueueAsset(req)
90 | })
91 | }
92 |
93 | // parseCSSFile parses the links inside a css file, fetches the assets, and rewrites their reference paths.
94 | // css may reference paths via the url property or the background-image property,
95 | // in forms like url('./bg.jpg'), url("./bg.jpg"), url(bg.jpg)
96 | func (crawler *Crawler) parseCSSFile(content []byte, req *model.URLRecord) (newContent []byte, err error) {
97 | fileStr := string(content)
98 | // FindAllStringSubmatch returns a slice of all matched strings;
99 | // each member is itself a slice like the result of FindStringSubmatch(), giving each group's submatch.
100 | matchedArray := cssAssetURLPattern.FindAllStringSubmatch(fileStr, -1)
101 | for _, matchedItem := range matchedArray {
102 | for _, matchedURL := range matchedItem[1:] {
103 | if matchedURL == "" || emptyLinkPattern.MatchString(matchedURL) {
104 | continue
105 | }
106 | fullURL, fullURLWithoutFrag := joinURL(req.URL, matchedURL)
107 | if !URLFilter(fullURL, model.URLTypeAsset, crawler.Config) {
108 | continue // skip this url but keep rewriting the rest of the css
109 | }
110 | localLink, err := TransToLocalLink(crawler.Config.MainSite, fullURL, model.URLTypeAsset)
111 | if err != nil {
112 | continue
113 | }
114 | fileStr = strings.Replace(fileStr, matchedURL, localLink, -1)
115 | // enqueue the new task
116 | req := &model.URLRecord{
117 | URL: fullURLWithoutFrag,
118 | URLType: model.URLTypeAsset,
119 | Refer: req.URL,
120 | Depth: req.Depth + 1,
121 | }
122 | crawler.EnqueueAsset(req)
123 | }
124 | }
125 | newContent = []byte(fileStr)
126 | return
127 | }
128 |
--------------------------------------------------------------------------------
/crawler/request.go:
--------------------------------------------------------------------------------
1 | package crawler
2 |
3 | import (
4 | "bytes"
5 | "net/http"
6 | "net/url"
7 | "os"
8 | "path"
9 | "regexp"
10 | "strings"
11 |
12 | "gitee.com/generals-space/site-mirror-go.git/model"
13 | "github.com/PuerkitoBio/goquery"
14 | )
15 |
16 | func getURL(url, refer, ua string) (resp *http.Response, err error) {
17 | client := &http.Client{}
18 | req, err := http.NewRequest("GET", url, nil)
19 | if err != nil { return } // malformed url; nothing to send
20 | req.Header.Set("User-Agent", ua); req.Header.Set("Referer", refer)
21 |
22 | resp, err = client.Do(req)
23 | if err != nil {
24 | logger.Errorf("请求失败: url: %s, refer: %s, error: %s", url, refer, err.Error())
25 | return
26 | }
27 | return
28 | }
29 |
30 | func joinURL(baseURL, subURL string) (fullURL, fullURLWithoutFrag string) {
31 | baseURLObj, _ := url.Parse(baseURL)
32 | subURLObj, _ := url.Parse(subURL)
33 | fullURLObj := baseURLObj.ResolveReference(subURLObj)
34 | fullURL = fullURLObj.String()
35 | fullURLObj.Fragment = ""
36 | fullURLWithoutFrag = fullURLObj.String()
37 | return
38 | }
39 |
40 | // getPageCharset parses the page and extracts its charset information
41 | func getPageCharset(body []byte) (charset string, err error) {
42 | bodyReader := bytes.NewReader(body)
43 | dom, err := goquery.NewDocumentFromReader(bodyReader)
44 | if err != nil {
45 | logger.Errorf("生成dom树失败: %s", err.Error())
46 | return
47 | }
48 | var metaInfo string
49 | var exist bool
50 | metaInfo, exist = dom.Find("meta[charset]").Attr("charset")
51 | if exist {
52 | charset = metaInfo
53 | return
54 | }
55 | metaInfo, exist = dom.Find("meta[http-equiv]").Attr("content")
56 | if exist {
57 | // FindStringSubmatch returns a slice whose first member is the full match, followed by each group's submatch.
58 | // It is basically equivalent to FindAllStringSubmatch(metaInfo, 1), fetching only the first match.
59 | matchedArray := charsetPattern.FindStringSubmatch(metaInfo)
60 | for _, matchedItem := range matchedArray[1:] {
61 | if matchedItem != "" {
62 | charset = matchedItem
63 | return
64 | }
65 | }
66 | }
67 | charset = "utf-8"
68 | return
69 | }
70 |
71 | // URLFilter reports whether the given url should be crawled under the current config
72 | func URLFilter(fullURL string, urlType int, config *Config) (boolean bool) {
73 | urlObj, err := url.Parse(fullURL)
74 | if err != nil {
75 | logger.Errorf("解析地址失败: url: %s, %s", fullURL, err.Error())
76 | return
77 | }
78 | if urlType == model.URLTypePage && urlObj.Host != config.MainSite {
79 | logger.Infof("skipping off-site page: %s", fullURL)
80 | return
81 | }
82 | if urlType == model.URLTypeAsset && urlObj.Host != config.MainSite && !config.OutsiteAsset {
83 | logger.Infof("skipping off-site asset: %s", fullURL)
84 | return
85 | }
86 | if urlType == model.URLTypeAsset && strings.HasSuffix(fullURL, ".js") && config.NoJs {
87 | logger.Infof("skipping js asset: %s", fullURL)
88 | return
89 | }
90 | if urlType == model.URLTypeAsset && strings.HasSuffix(fullURL, ".css") && config.NoCSS {
91 | logger.Infof("skipping css asset: %s", fullURL)
92 | return
93 | }
94 | if urlType == model.URLTypeAsset && imagePattern.MatchString(fullURL) && config.NoImages {
95 | logger.Infof("skipping image asset: %s", fullURL)
96 | return
97 | }
98 | if urlType == model.URLTypeAsset && fontPattern.MatchString(fullURL) && config.NoFonts {
99 | logger.Infof("skipping font asset: %s", fullURL)
100 | return
101 | }
102 | for _, rule := range config.BlackList {
103 | pattern := regexp.MustCompile(rule)
104 | if pattern.MatchString(fullURL) {
105 | logger.Infof("不抓取黑名单中的url: %s", fullURL)
106 | return
107 | }
108 | }
109 | return true
110 | }
111 |
112 | // WriteToLocalFile ...
113 | func WriteToLocalFile(baseDir string, fileDir string, fileName string, fileContent []byte) (err error) {
114 | fileDir = path.Join(baseDir, fileDir)
115 | if err = os.MkdirAll(fileDir, os.ModePerm); err != nil { return }
116 | filePath := path.Join(fileDir, fileName)
117 | file, err := os.Create(filePath)
118 | if err != nil { return } // do not defer Close on a failed Create
119 | defer file.Close()
120 | _, err = file.Write(fileContent)
121 | if err != nil {
122 | logger.Errorf("failed to write file: %s", err.Error())
123 | }
124 | return
125 | }
126 |
--------------------------------------------------------------------------------
/crawler/task_queue.go:
--------------------------------------------------------------------------------
1 | package crawler
2 |
3 | import "gitee.com/generals-space/site-mirror-go.git/model"
4 |
5 | // LoadTaskQueue initializes the task queues: it reads the unfinished page and asset task records
6 | // cached in the database and loads them into the in-memory queues
7 | func (crawler *Crawler) LoadTaskQueue() (err error) {
8 | logger.Info("初始化任务队列")
9 | crawler.DBClientMutex.Lock()
10 | pageTasks, err := model.QueryUnfinishedPageTasks(crawler.DBClient)
11 | crawler.DBClientMutex.Unlock()
12 | if err != nil {
13 | logger.Errorf("获取页面任务失败: %s", err.Error())
14 | return
15 | }
16 |
17 | logger.Debugf("获取页面队列任务数量: %d", len(pageTasks))
18 | for _, task := range pageTasks {
19 | crawler.PageQueue <- task
20 | // crawler.EnqueuePage(task)
21 | }
22 |
23 | crawler.DBClientMutex.Lock()
24 | assetTasks, err := model.QueryUnfinishedAssetTasks(crawler.DBClient)
25 | crawler.DBClientMutex.Unlock()
26 | if err != nil {
27 | logger.Errorf("获取页面任务失败: %s", err.Error())
28 | return
29 | }
30 |
31 | logger.Debugf("获取静态资源队列任务数量: %d", len(pageTasks))
32 | for _, task := range assetTasks {
33 | crawler.AssetQueue <- task
34 | // crawler.EnqueueAsset(task)
35 | }
36 | logger.Infof("初始化任务队列完成, 页面任务数量: %d, 静态资源任务数量: %d", len(crawler.PageQueue), len(crawler.AssetQueue))
37 | return
38 | }
39 |
40 | // EnqueuePage enqueues a page task.
41 | // Before enqueueing, the database record is checked; a task that already has a record is not accepted again.
42 | // Any task that entered the queue necessarily has a record, but may not have been downloaded successfully.
43 | // Since the queue length is limited, this may block, and deadlock is the most likely failure:
44 | // each page worker enqueues every link parsed from a page,
45 | // so if the queue is full at that moment the worker blocks, and once all workers block here the program cannot proceed.
46 | func (crawler *Crawler) EnqueuePage(req *model.URLRecord) {
47 | var err error
48 |
49 | crawler.PageQueue <- req
50 |
51 | crawler.DBClientMutex.Lock()
52 | defer crawler.DBClientMutex.Unlock()
53 |
54 | err = model.AddOrUpdateURLRecord(crawler.DBClient, req)
55 | if err != nil {
56 | logger.Errorf("添加(更新)页面任务url记录失败, req: %+v, err: %s", req, err.Error())
57 | return
58 | }
59 | return
60 | }
61 |
62 | // EnqueueAsset enqueues a static asset task.
63 | // Before enqueueing, the database record is checked; a task that already has a record is not accepted again.
64 | func (crawler *Crawler) EnqueueAsset(req *model.URLRecord) {
65 | var err error
66 | // the queue length is limited, so this may block
67 | crawler.AssetQueue <- req
68 |
69 | crawler.DBClientMutex.Lock()
70 | defer crawler.DBClientMutex.Unlock()
71 |
72 | err = model.AddOrUpdateURLRecord(crawler.DBClient, req)
73 | if err != nil {
74 | logger.Errorf("添加(更新)静态资源任务url记录失败, req: %+v, err: %s", req, err.Error())
75 | return
76 | }
77 | return
78 | }
79 |
--------------------------------------------------------------------------------
/crawler/transform.go:
--------------------------------------------------------------------------------
1 | package crawler
2 |
3 | import (
4 | "net/url"
5 | "path"
6 | "strings"
7 |
8 | "gitee.com/generals-space/site-mirror-go.git/model"
9 | )
10 |
11 | // TransToLocalLink ...
12 | // @return: localLink, the local link written into the link/script/img/a attributes of the local html document, as a root path starting with a slash /.
13 | func TransToLocalLink(mainSite string, fullURL string, urlType int) (localLink string, err error) {
14 | // For urls under the main host, resources are stored in the output root directory rather than a per-domain folder. No main host is set by default.
15 | urlObj, err := url.Parse(fullURL)
16 | if err != nil {
17 | logger.Errorf("解析URL出错: %s", err.Error())
18 | return
19 | }
20 | originHost := urlObj.Host
21 | originPath := urlObj.Path
22 |
23 | localLink = originPath
24 | if urlType == model.URLTypePage {
25 | localLink = transToLocalLinkForPage(urlObj)
26 | } else {
27 | localLink = transToLocalLinkForAsset(urlObj)
28 | }
29 |
30 | // If the url is under the current site's domain, no extra domain directory is needed.
31 | // If it belongs to another site (allowing off-site assets must be enabled in the config beforehand),
32 | // the resource is stored under a directory named after that domain, keeping the domain part in the path.
33 | if originHost != mainSite {
34 | host := originHost
35 | // originHost may contain a port, whose colon has to be escaped.
36 | host = strings.Replace(host, ":", SpecialCharsMap[":"], -1)
37 | localLink = "/" + host + localLink
38 | }
39 | /*
40 | // The url may contain Chinese characters (not only in the query) and would need decoding.
41 | localLink, err = url.QueryUnescape(localLink)
42 | if err != nil {
43 | logger.Errorf("failed to decode URL: localLink: %s, %s", localLink, err.Error())
44 | return
45 | }
46 | */
47 | return
48 | }
49 |
50 | func transToLocalLinkForPage(urlObj *url.URL) (localLink string) {
51 | originPath := urlObj.Path
52 | originQuery := urlObj.RawQuery
53 |
54 | localLink = originPath
55 |
56 | // if path is empty
57 | if localLink == "" {
58 | localLink = "index.html"
59 | }
60 | // if path ends with a slash /
61 | hasSlash := strings.HasSuffix(localLink, "/")
62 | if hasSlash {
63 | localLink += "index.html"
64 | }
65 |
66 | // replace special characters in the query string
67 | if originQuery != "" {
68 | queryStr := originQuery
69 | for key, val := range SpecialCharsMap {
70 | queryStr = strings.Replace(queryStr, key, val, -1)
71 | }
72 | localLink = localLink + SpecialCharsMap["?"] + queryStr
73 | }
74 |
75 | // For page suffixes that cannot be served statically, such as .php, .jsp, .asp, etc.
76 | // Note that localLink may already have the query string appended at this point.
77 | if !htmlURLPattern.MatchString(localLink) {
78 | localLink += ".html"
79 | }
80 |
81 | return
82 | }
83 |
84 | func transToLocalLinkForAsset(urlObj *url.URL) (localLink string) {
85 | originPath := urlObj.Path
86 | originQuery := urlObj.RawQuery
87 |
88 | localLink = originPath
89 |
90 | // if path is empty
91 | if localLink == "" {
92 | localLink = "index"
93 | }
94 | // if path ends with a slash /
95 | hasSlash := strings.HasSuffix(localLink, "/")
96 | if hasSlash {
97 | localLink += "index"
98 | }
99 |
100 | // replace special characters in the query string
101 | if originQuery != "" {
102 | queryStr := originQuery
103 | for key, val := range SpecialCharsMap {
104 | queryStr = strings.Replace(queryStr, key, val, -1)
105 | }
106 | localLink = localLink + SpecialCharsMap["?"] + queryStr
107 | }
108 |
109 | return
110 | }
111 |
112 | // TransToLocalPath ...
113 | // @return: the local directory and file name, used for writing the local file
114 | func TransToLocalPath(mainSite string, fullURL string, urlType int) (fileDir string, fileName string, err error) {
115 | localLink, err := TransToLocalLink(mainSite, fullURL, urlType)
116 | if err != nil { return } // conversion failed; nothing to write
117 | // For off-site resources localLink may look like /www.xxx.com/static/x.jpg,
118 | // but the storage directory must be relative, so strip the leading slash / first.
119 | if strings.HasPrefix(localLink, "/") {
120 | localLink = localLink[1:]
121 | }
122 |
123 | fileDir = path.Dir(localLink)
124 | fileName = path.Base(localLink)
125 | return
126 | }
127 |
--------------------------------------------------------------------------------
/crawler/util.go:
--------------------------------------------------------------------------------
1 | package crawler
2 |
3 | import "regexp"
4 |
5 | // SpecialCharsMap maps special characters in query strings to safe filename replacements (pinyin initials)
6 | var SpecialCharsMap = map[string]string{
7 | "\\": "xg",
8 | ":": "mh",
9 | "*": "xh",
10 | "?": "wh",
11 | "<": "xy",
12 | ">": "dy",
13 | "|": "sx",
14 | " ": "kg",
15 | }
16 |
17 | // Only paths ending with the following suffixes can be served directly as static files.
18 | // Others such as .php, .jsp, .asp cannot be rendered directly unless nginx has a matching handler.
19 | var htmlURLPatternStr = `\.((html)|(htm)|(xhtml)|(xml))$`
20 | var htmlURLPattern = regexp.MustCompile(htmlURLPatternStr)
21 |
22 | var imagePatternStr = `\.((jpg)|(png)|(bmp)|(jpeg)|(gif)|(webp))$`
23 | var imagePattern = regexp.MustCompile(imagePatternStr)
24 |
25 | var fontPatternStr = `\.((ttf)|(woff)|(woff2)|(otf)|(eot))$`
26 | var fontPattern = regexp.MustCompile(fontPatternStr)
27 |
28 | // charsetPatternInDOMStr is the pattern for extracting the charset from the content attribute of a meta[http-equiv] element,
29 | // e.g. content="text/html; charset=gb2312"
30 | var charsetPatternInDOMStr = `charset\s*=\s*(\S*)\s*;?`
31 |
32 | // charsetPattern: the plain MatchString can take a pattern string directly, without Compile,
33 | // but it can only test for a match and cannot extract anything else.
34 | var charsetPattern = regexp.MustCompile(charsetPatternInDOMStr)
35 |
36 | var cssAssetURLPatternStr = `url\(\'(.*?)\'\)|url\(\"(.*?)\"\)|url\((.*?)\)`
37 | var cssAssetURLPattern = regexp.MustCompile(cssAssetURLPatternStr)
38 |
39 | var emptyLinkPatternStr = `(^data:)|(^mailto:)|(about:blank)|(javascript:)`
40 | var emptyLinkPattern = regexp.MustCompile(emptyLinkPatternStr)
41 |
--------------------------------------------------------------------------------
/doc/golang写入文件失败, invalid arguement.md:
--------------------------------------------------------------------------------
1 | # Go file write fails with "invalid argument"
2 |
3 | ```
4 | file, _ := os.OpenFile(filePath, os.O_RDWR, os.ModePerm)
5 | _, err = file.Write(fileContent)
6 | ```
7 |
8 | err is non-nil, with the value `invalid argument`
9 |
10 | It turned out the open flags were wrong: creating the file also requires `O_CREATE`; simplest is to just use the `os.Create()` function. A corrected sketch follows.
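11 | 
12 | A minimal corrected sketch (same `filePath`/`fileContent` variables as above):
13 | 
14 | ```go
15 | // os.Create is equivalent to os.OpenFile(name, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0666)
16 | file, err := os.Create(filePath)
17 | if err != nil {
18 | return err
19 | }
20 | defer file.Close()
21 | _, err = file.Write(fileContent)
22 | ```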
--------------------------------------------------------------------------------
/doc/golang协程池.md:
--------------------------------------------------------------------------------
1 | # A goroutine pool in golang
2 | 
3 | golang has no built-in goroutine pool. A basic pool is admittedly simple: a `for` loop creates a fixed number of goroutines, and each goroutine runs a `for` loop reading from the same channel, so together they work through that channel queue cooperatively. But a real pool is not that simple.
4 | 
5 | For instance, if a goroutine exits because of a panic, how do you replenish the pool? Otherwise they die off one by one until none are left...
6 | 
7 | Or: how do you resize the pool dynamically, or query how many goroutines are busy or idle?
8 | 
9 | These questions all have to be considered. A minimal sketch of the replenish-on-panic idea follows.
10 |
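11 | A minimal sketch of the replenish-on-panic idea (illustrative names, not this project's code):
12 | 
13 | ```go
14 | package main
15 | 
16 | import (
17 | "log"
18 | "time"
19 | )
20 | 
21 | // worker drains the task channel; if a task panics, the deferred
22 | // recover starts a replacement worker so the pool size stays constant.
23 | func worker(id int, tasks chan func()) {
24 | defer func() {
25 | if r := recover(); r != nil {
26 | log.Printf("worker %d crashed: %v, respawning", id, r)
27 | go worker(id, tasks)
28 | }
29 | }()
30 | for task := range tasks {
31 | task()
32 | }
33 | }
34 | 
35 | func main() {
36 | tasks := make(chan func(), 100)
37 | for i := 0; i < 10; i++ {
38 | go worker(i, tasks)
39 | }
40 | tasks <- func() { log.Println("hello from the pool") }
41 | tasks <- func() { panic("boom") } // this worker dies and is replaced
42 | time.Sleep(time.Second) // crude wait; a real pool would track busy/idle counts
43 | }
44 | ```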
--------------------------------------------------------------------------------
/doc/goquery页面编码处理(二)-HTML字符实体.md:
--------------------------------------------------------------------------------
1 | # goquery page encoding handling (2): HTML character entities
2 |
3 | 1. [ReplaceWithHtml sees html entity where there is none](https://github.com/PuerkitoBio/goquery/issues/113)
4 |
5 | 2. [Should not escape &](https://github.com/PuerkitoBio/goquery/issues/109)
6 |
7 | 3. [Don't change to space in Html()](https://github.com/PuerkitoBio/goquery/issues/28)
8 |
9 | 4. [w3school: HTML character entities](http://www.w3school.com.cn/html/html_entities.asp)
10 |
11 | Continuing the summary of problems from the previous article.
12 | 
13 | The previous article tried the third-party encoding tool [mahonia](https://github.com/axgle/mahonia) for encode/decode operations, but `&nbsp;` was parsed as □, which was a pity.
14 | 
15 | A closer look showed quite a few remaining problems:
16 | 
17 | 1. Some characters in the string returned by `Html()` come out as character entity codes, e.g. `'` -> `&#39;` and `&` -> `&amp;`; a whole `a` tag (an 加入收藏 "add to favorites" link) got mangled this way.
18 | 
19 | 2. The `&nbsp;` entity does not appear in the `Html()` result at all; it is converted into an invisible character.
20 | 
21 | ------
22 | 
23 | First, the character entities in the `Html()` result have to be unescaped; after all, this is not an encoding problem. That led to references 1 and 2, both official goquery issues.
24 | 
25 | But the goquery author says this is not caused by goquery itself but by the `/x/net/html` package, which escapes certain characters on its own (the exact escaping rules are still unclear to me), so I chose the `html.UnescapeString()` method to reverse these entities and get back what I wanted.
26 | 
27 | ------
28 | 
29 | Then there is the `&nbsp;` problem: `&nbsp;` does not appear in the `Html()` result, unlike the `'` -> `&#39;` behavior above. Yet `&nbsp;` is not unescaped into a conventional whitespace character (`\u0020`) either; it becomes a non-breaking space (`\u00a0`). See reference 3 (the definition of a non-breaking space is in reference 4).
30 | 
31 | In reference 3 the author mentions that the `TestNbsp()` function in `property_test.go` can detect this `\u00a0` character in a page. I used the following statement to replace it:
32 | 
33 | ```go
34 | strings.Replace(output, "\u00a0", "&nbsp;", -1)
35 | ```
36 | 
37 | Besides `\u00a0`, characters like the copyright sign (`&copy;`) also need converting back.
38 | 
39 | Presumably goquery renders the page once when `Html()` is called, which is what turns the page's entities into characters like `©` and `\u00a0` that GBK cannot encode; they must all be converted back before writing the file. A condensed version of the final code follows.
40 | 
41 | ...this took three whole days.
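42 | 
43 | The final approach, condensed (this matches `ReplaceHTMLCharacterEntities` in `crawler/charset.go`; the map here is trimmed):
44 | 
45 | ```go
46 | import (
47 | "html"
48 | "strings"
49 | )
50 | 
51 | // characters that goquery's Html() emits in place of entities,
52 | // mapped back to entities so the page can be encoded to gbk safely
53 | var htmlCharacterEntities = map[string]string{
54 | "\u00a0": "&nbsp;",
55 | "©": "&copy;",
56 | "®": "&reg;",
57 | }
58 | 
59 | func replaceHTMLCharacterEntities(input string) (output string) {
60 | // first undo /x/net/html's escaping (&#39;, &amp; and friends)...
61 | output = html.UnescapeString(input)
62 | // ...then restore the characters gbk cannot encode to their entities
63 | for char, entity := range htmlCharacterEntities {
64 | output = strings.Replace(output, char, entity, -1)
65 | }
66 | return
67 | }
68 | ```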
--------------------------------------------------------------------------------
/doc/goquery页面编码处理.md:
--------------------------------------------------------------------------------
1 | # goquery page encoding handling
2 |
3 | 1. [goquery - Handle Non-UTF8 html Pages](https://github.com/PuerkitoBio/goquery/wiki/Tips-and-tricks)
4 |
5 | 2. [goquery: adding GBK support](https://blog.csdn.net/jrainbow/article/details/52712685)
6 |
7 | 3. [An introduction to character encodings in Golang](https://www.cnblogs.com/yinzhengjie/p/7956689.html)
8 |
9 | 4. [colly: garbled text when crawling pages](https://studygolang.com/topics/6745)
10 |
11 | 5. [Go: "symbol not supported by encoding" error when converting utf8 to sjis](https://teratail.com/questions/106106)
12 |
13 | 6. [Best way to translate UTF-8 to ISO8859-1 in Go](https://stackoverflow.com/questions/47660160/best-way-to-translate-utf-8-to-iso8859-1-in-go)
14 |
15 | goquery's `NewDocumentXXX()` functions only accept UTF-8 page content by default; encoding conversion is left to the user. (I checked: colly, another dom parsing tool, behaves the same.)
16 | 
17 | Someone raised `GBK` and `CJK (Chinese, Japanese, Korean)` support in an issue; the goquery author's reply is in the wiki, i.e. reference 1. It suggests the [iconv-go](https://github.com/djimenez/iconv-go) package, which actually wraps the C `iconv` function and needs cgo support. I could not install it on Windows and have not tried Linux, so I gave up on it.
18 | 
19 | The encodings that can appear in the pages I crawl are limited, no more than the following few:
20 |
21 | ```go
22 | var CharsetMap = map[string]encoding.Encoding{
23 | "utf-8": unicode.UTF8,
24 | "gbk": simplifiedchinese.GBK,
25 | "gb2312": simplifiedchinese.GB18030,
26 | "gb18030": simplifiedchinese.GB18030,
27 | "big5": traditionalchinese.Big5,
28 | }
29 | ```
30 |
31 | So I found references 2 and 3 online and tried some simple encode/decode operations with `golang.org/x/text`:
32 |
33 | ```go
34 | // DecodeToUTF8 decodes the input byte slice from the given charset and returns the corresponding UTF-8 content.
35 | func DecodeToUTF8(input []byte, charset encoding.Encoding) (output []byte, err error) {
36 | reader := transform.NewReader(bytes.NewReader(input), charset.NewDecoder())
37 | output, err = ioutil.ReadAll(reader)
38 | if err != nil {
39 | return
40 | }
41 | return
42 | }
43 |
44 | // EncodeFromUTF8 encodes the input UTF-8 byte slice into the given charset and returns the result
45 | func EncodeFromUTF8(input []byte, charset encoding.Encoding) (output []byte, err error) {
46 | reader := transform.NewReader(bytes.NewReader(input), charset.NewEncoder())
47 | output, err = ioutil.ReadAll(reader)
48 | if err != nil {
49 | return
50 | }
51 | return
52 | }
53 | ```
54 |
55 | In isolated experiments this worked fine: encoding, decoding, and file reads/writes were all normal.
56 | 
57 | But after building the dom object with goquery's `NewDocument()` from the http response (already decoded to UTF-8) and getting the page content via `Html()`, I wanted to write that content back out in the page's original encoding.
58 | 
59 | Calling `EncodeFromUTF8()` then failed with `encoding: rune not supported by encoding.`
60 | 
61 | The cause is a character that exists in UTF-8 but not in GBK; see reference 5. After debugging, the error turned out to be raised exactly where the character goquery produced from `&nbsp;` is converted back to the original encoding.
62 |
63 | ------
64 |
65 | I considered switching to colly, but colly has the same problem. Reference 4, however, pointed to [mahonia](https://github.com/axgle/mahonia); I tried it, and it was acceptable.
66 | 
67 | Except that `&nbsp;`, written to the file in the original encoding, became a square box...
68 |
69 | ```
70 |
71 | ```
72 |
73 | Following the hint in reference 6, the `/x/text` package has a `ReplaceUnsupported()` method, so `EncodeFromUTF8()` was rewritten as follows:
74 |
75 | ```go
76 | // EncodeFromUTF8 encodes the input UTF-8 byte slice into the given charset and returns the result
77 | func EncodeFromUTF8(input []byte, charset encoding.Encoding) (output []byte, err error) {
78 | if charset == unicode.UTF8 {
79 | output = input
80 | return
81 | }
82 | reader := transform.NewReader(bytes.NewReader(input), encoding.ReplaceUnsupported(charset.NewEncoder()))
83 | output, err = ioutil.ReadAll(reader)
84 | if err != nil {
85 | return
86 | }
87 | return
88 | }
89 | ```
90 |
91 | This ignores unsupported characters (they still come out as □), but at least it never errors, and `mahonia` is no longer needed. An end-to-end sketch follows.
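92 | 
93 | For reference, an end-to-end sketch of how the two helpers combine with goquery (`fetchAsGBK` is an illustrative function; it assumes the helpers above plus imports of bytes, io/ioutil, net/http, goquery and simplifiedchinese):
94 | 
95 | ```go
96 | // fetchAsGBK reads a gbk-encoded response, parses it as utf-8,
97 | // and re-encodes the serialized dom back to gbk.
98 | func fetchAsGBK(resp *http.Response) (gbkBody []byte, err error) {
99 | raw, err := ioutil.ReadAll(resp.Body)
100 | if err != nil {
101 | return
102 | }
103 | utf8Body, err := DecodeToUTF8(raw, simplifiedchinese.GBK)
104 | if err != nil {
105 | return
106 | }
107 | doc, err := goquery.NewDocumentFromReader(bytes.NewReader(utf8Body))
108 | if err != nil {
109 | return
110 | }
111 | htmlStr, err := doc.Html()
112 | if err != nil {
113 | return
114 | }
115 | return EncodeFromUTF8([]byte(htmlStr), simplifiedchinese.GBK)
116 | }
117 | ```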
--------------------------------------------------------------------------------
/doc/sqlite3并发写入.md:
--------------------------------------------------------------------------------
1 | # Concurrent writes with sqlite3
2 | 
3 | Before the database operations were locked, multiple goroutines took tasks from the channel queue and then deleted the tasks' persisted records, yet some records were never deleted. The reason was baffling for a long time: querying right after the delete showed the record really was gone, and the set of undeleted records changed on every run.
4 | 
5 | I then tried putting a lock around each table's write operations, even splitting the code into `page_task.go` and `asset_task.go` for it, but that produced `database is locked` errors at runtime...
6 | 
7 | Further digging showed that sqlite3 does not support table-level locks, only database-level locks.
8 | 
9 | What does that mean?
10 | 
11 | A database connection created with `Open` (actually a connection pool) can only be used by a single goroutine at a time to operate on the database, even when different goroutines touch different tables.
12 | 
13 | This differs from conventional multi-threaded database usage: there a connection pool holds multiple real connections, each goroutine gets an available connection from the pool (gorm itself supports multiple goroutines and is concurrency-safe), and no extra locking is needed.
14 | 
15 | Still, spinning up postgres or mysql for such a small program would be too heavy, so for now this will do; at least it is safe. See the sketch below.
16 | 
17 | ...There is [rqlite](https://github.com/rqlite/rqlite), a distributed database built on sqlite in golang; perhaps it could serve as lightweight local storage. Something to try later.
18 |
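19 | The workaround this project uses (see `crawler/main.go`) is a single `sync.Mutex` guarding every access to the shared `*gorm.DB` handle. A condensed sketch, with `withDBLock` as an illustrative helper rather than actual project code:
20 | 
21 | ```go
22 | import "sync"
23 | 
24 | var dbMutex sync.Mutex
25 | 
26 | // withDBLock serializes all access to the shared db handle,
27 | // since the sqlite3 driver tolerates only one writer per database
28 | func withDBLock(fn func() error) error {
29 | dbMutex.Lock()
30 | defer dbMutex.Unlock()
31 | return fn()
32 | }
33 | 
34 | // usage:
35 | // err := withDBLock(func() error {
36 | // return model.UpdateURLRecordStatus(db, url, model.URLTaskStatusSuccess)
37 | // })
38 | ```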
--------------------------------------------------------------------------------
/doc/src/screenshot.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/generals-space/site-mirror-go/fce466e9cf568e7ab3b1e60030b300b04f8c88ec/doc/src/screenshot.jpg
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3'
2 |
3 | services:
4 | nginx:
5 | image: nginx
6 | ports:
7 | - 8080:80
8 | volumes:
9 | - ./docker/nginx.conf:/etc/nginx/conf.d/main.conf
10 | - ./sites:/usr/share/nginx/html
11 |
--------------------------------------------------------------------------------
/docker/nginx.conf:
--------------------------------------------------------------------------------
1 | server{
2 | listen 80;
3 | server_name _;
4 |
5 | root /usr/share/nginx/html;
6 | location / {
7 | try_files $uri $uri/ /index.html;
8 | }
9 | }
10 |
--------------------------------------------------------------------------------
/docker/readme.md:
--------------------------------------------------------------------------------
1 | Windows:
2 |
3 | ```
4 | docker run -it --name site-mirror-go -v %gopath%/src/gitee.com/generals-space/site-mirror-go.git:/project generals/golang_node8 /bin/bash
5 | ```
6 |
7 | Linux:
8 |
9 | ```
10 | docker run -it --name site-mirror-go -v $GOPATH/src/gitee.com/generals-space/site-mirror-go.git:/project generals/golang_node8 /bin/bash
11 | ```
12 |
--------------------------------------------------------------------------------
/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "os"
5 | "os/signal"
6 | "syscall"
7 |
8 | "gitee.com/generals-space/site-mirror-go.git/crawler"
9 | "gitee.com/generals-space/site-mirror-go.git/util"
10 | )
11 |
12 | func main() {
13 | logger := util.NewLogger(os.Stdout)
14 | // logger.SetLevel("debug")
15 |
16 | config := crawler.NewConfig()
17 | config.StartPage = "https://www.lewenxiaoshuo.com/"
18 | config.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
19 | config.MaxDepth = 1
20 |
21 | c, err := crawler.NewCrawler(config, logger)
22 | if err != nil {
23 | panic(err)
24 | }
25 | c.Start()
26 | defer func() {
27 | logger.Info("用户取消")
28 | }()
29 | // Wait for the user to cancel; the crawler cannot terminate on its own yet.
30 | channel := make(chan os.Signal, 1) // buffered, as signal.Notify expects
31 | signal.Notify(channel, syscall.SIGINT, syscall.SIGTERM)
32 | logger.Info(<-channel)
33 | }
34 |
--------------------------------------------------------------------------------
/model/model.go:
--------------------------------------------------------------------------------
1 | package model
2 |
3 | import (
4 | "github.com/jinzhu/gorm"
5 | _ "github.com/jinzhu/gorm/dialects/sqlite" // 注释防止绿色下划线语法提示
6 | )
7 |
8 | const (
9 | // URLTaskStatusInit is the initial task status, 0
10 | URLTaskStatusInit = iota
11 | // URLTaskStatusPending marks a task taken from the queue but not yet resolved, 1
12 | URLTaskStatusPending
13 | // URLTaskStatusSuccess marks a successful task, 2
14 | URLTaskStatusSuccess
15 | // URLTaskStatusFailed marks a failed task (404), 3
16 | URLTaskStatusFailed
17 | )
18 |
19 | const (
20 | URLTypePage int = iota // page task
21 | URLTypeAsset // static asset task
22 | )
23 |
24 | // URLRecord is the url task record table
25 | type URLRecord struct {
26 | gorm.Model
27 | URL string `gorm:"unique;not null"`
28 | Refer string
29 | Depth int
30 | URLType int
31 | FailedTimes int
32 | Status int `gorm:"default:0"`
33 | }
34 |
35 | // GetDB opens the database connection and migrates the schema
36 | func GetDB(dbPath string) (db *gorm.DB, err error) {
37 | db, err = gorm.Open("sqlite3", dbPath)
38 | if err != nil { return } // failed to open/create the db file
39 | 
40 | tables := []interface{}{&URLRecord{}}
41 | db.AutoMigrate(tables...)
42 | return
43 | }
44 |
--------------------------------------------------------------------------------
/model/url_record.go:
--------------------------------------------------------------------------------
1 | package model
2 |
3 | import "github.com/jinzhu/gorm"
4 |
5 | // isExistInURLRecord checks whether a task record for the given url already exists in the database
6 | func isExistInURLRecord(db *gorm.DB, url string) bool {
7 | var err error
8 | var count int
9 | err = db.Table("url_records").Where("url = ?", url).Count(&count).Error
10 | if err != nil || count == 0 {
11 | return false
12 | }
13 | return true
14 | }
15 |
16 | // queryUnfinishedTasks ...
17 | func queryUnfinishedTasks(db *gorm.DB, urlType int) (tasks []*URLRecord, err error) {
18 | tasks = []*URLRecord{}
19 | err = db.Where("url_type = ? and status in (?)", urlType, []int{URLTaskStatusInit, URLTaskStatusPending}).Find(&tasks).Error
20 | return
21 | }
22 |
23 | // QueryUnfinishedPageTasks ...
24 | func QueryUnfinishedPageTasks(db *gorm.DB) (tasks []*URLRecord, err error) {
25 | return queryUnfinishedTasks(db, URLTypePage)
26 | }
27 |
28 | // QueryUnfinishedAssetTasks ...
29 | func QueryUnfinishedAssetTasks(db *gorm.DB) (tasks []*URLRecord, err error) {
30 | return queryUnfinishedTasks(db, URLTypeAsset)
31 | }
32 |
33 | // AddOrUpdateURLRecord adds a new URLRecord when a task is enqueued (if one already exists, its failed_times and status fields are updated)
34 | func AddOrUpdateURLRecord(db *gorm.DB, task *URLRecord) (err error) {
35 | exist := isExistInURLRecord(db, task.URL)
36 | if exist {
37 | whereArgs := map[string]interface{}{
38 | "url": task.URL,
39 | }
40 | dataToBeUpdated := map[string]interface{}{
41 | "failed_times": task.FailedTimes,
42 | "status": URLTaskStatusInit, // 任务重新入队列要将状态修改为init状态
43 | }
44 | err = db.Model(&URLRecord{}).Where(whereArgs).Updates(dataToBeUpdated).Error
45 | } else {
46 | err = db.Create(task).Error
47 | }
48 | return
49 | }
50 |
51 | // UpdateURLRecordStatus updates the status of a url task record
52 | func UpdateURLRecordStatus(db *gorm.DB, url string, status int) (err error) {
53 | urlRecord := &URLRecord{}
54 | err = db.Where("url = ?", url).First(urlRecord).Error
55 | if err != nil {
56 | return
57 | }
58 |
59 | err = db.Model(urlRecord).UpdateColumn("status", status).Error
60 | return
61 | }
62 |
--------------------------------------------------------------------------------
/util/go_pool.go:
--------------------------------------------------------------------------------
1 | package util
2 |
--------------------------------------------------------------------------------
/util/log.go:
--------------------------------------------------------------------------------
1 | package util
2 |
3 | // Package util includes logging related manipulations.
4 | //
5 | // util.SetLevel("debug")
6 | // logger := util.NewLogger(os.Stdout)
7 | //
8 | // logger.Trace("trace message")
9 | // logger.Debug("debug message")
10 | // logger.Info("info message")
11 | // logger.Warn("warning message")
12 | // logger.Error("error message")
13 | // logger.Fatal("fatal message")
14 | //
15 | // logger.Errorf("formatted %s message", "error")
16 |
17 | import (
18 | "fmt"
19 | "io"
20 | stdlog "log"
21 | "os"
22 | "strings"
23 | )
24 |
25 | // Logging level.
26 | const (
27 | Off = iota
28 | Trace
29 | Debug
30 | Info
31 | Warn
32 | Error
33 | Fatal
34 | )
35 |
36 | // all loggers.
37 | var loggers []*Logger
38 |
39 | // the global default logging level, it will be used for creating logger.
40 | var logLevel = Debug
41 |
42 | // Logger represents a simple logger with level.
43 | // The underlying logger is the standard Go logging "log".
44 | type Logger struct {
45 | level int
46 | logger *stdlog.Logger
47 | }
48 |
49 | // NewLogger creates a logger.
50 | func NewLogger(out io.Writer) *Logger {
51 | ret := &Logger{level: logLevel, logger: stdlog.New(out, "", stdlog.Ldate|stdlog.Ltime|stdlog.Lshortfile)}
52 |
53 | loggers = append(loggers, ret)
54 |
55 | return ret
56 | }
57 |
58 | // SetLevel sets the logging level of all loggers.
59 | func SetLevel(level string) {
60 | logLevel = getLevel(level)
61 |
62 | for _, l := range loggers {
63 | l.SetLevel(level)
64 | }
65 | }
66 |
67 | // getLevel gets logging level int value corresponding to the specified level.
68 | func getLevel(level string) int {
69 | level = strings.ToLower(level)
70 |
71 | switch level {
72 | case "off":
73 | return Off
74 | case "trace":
75 | return Trace
76 | case "debug":
77 | return Debug
78 | case "info":
79 | return Info
80 | case "warn":
81 | return Warn
82 | case "error":
83 | return Error
84 | case "fatal":
85 | return Fatal
86 | default:
87 | return Info
88 | }
89 | }
90 |
91 | // SetLevel sets the logging level of a logger.
92 | func (l *Logger) SetLevel(level string) {
93 | l.level = getLevel(level)
94 | }
95 |
96 | // IsTraceEnabled determines whether the trace level is enabled.
97 | func (l *Logger) IsTraceEnabled() bool {
98 | return l.level <= Trace
99 | }
100 |
101 | // IsDebugEnabled determines whether the debug level is enabled.
102 | func (l *Logger) IsDebugEnabled() bool {
103 | return l.level <= Debug
104 | }
105 |
106 | // IsWarnEnabled determines whether the warn level is enabled.
107 | func (l *Logger) IsWarnEnabled() bool {
108 | return l.level <= Warn
109 | }
110 |
111 | // Trace prints trace level message.
112 | func (l *Logger) Trace(v ...interface{}) {
113 | if Trace < l.level {
114 | return
115 | }
116 |
117 | l.logger.SetPrefix("T ")
118 | l.logger.Output(2, fmt.Sprint(v...))
119 | }
120 |
121 | // Tracef prints trace level message with format.
122 | func (l *Logger) Tracef(format string, v ...interface{}) {
123 | if Trace < l.level {
124 | return
125 | }
126 |
127 | l.logger.SetPrefix("T ")
128 | l.logger.Output(2, fmt.Sprintf(format, v...))
129 | }
130 |
131 | // Debug prints debug level message.
132 | func (l *Logger) Debug(v ...interface{}) {
133 | if Debug < l.level {
134 | return
135 | }
136 |
137 | l.logger.SetPrefix("D ")
138 | l.logger.Output(2, fmt.Sprint(v...))
139 | }
140 |
141 | // Debugf prints debug level message with format.
142 | func (l *Logger) Debugf(format string, v ...interface{}) {
143 | if Debug < l.level {
144 | return
145 | }
146 |
147 | l.logger.SetPrefix("D ")
148 | l.logger.Output(2, fmt.Sprintf(format, v...))
149 | }
150 |
151 | // Info prints info level message.
152 | func (l *Logger) Info(v ...interface{}) {
153 | if Info < l.level {
154 | return
155 | }
156 |
157 | l.logger.SetPrefix("I ")
158 | l.logger.Output(2, fmt.Sprint(v...))
159 | }
160 |
161 | // Infof prints info level message with format.
162 | func (l *Logger) Infof(format string, v ...interface{}) {
163 | if Info < l.level {
164 | return
165 | }
166 |
167 | l.logger.SetPrefix("I ")
168 | l.logger.Output(2, fmt.Sprintf(format, v...))
169 | }
170 |
171 | // Warn prints warning level message.
172 | func (l *Logger) Warn(v ...interface{}) {
173 | if Warn < l.level {
174 | return
175 | }
176 |
177 | l.logger.SetPrefix("W ")
178 | l.logger.Output(2, fmt.Sprint(v...))
179 | }
180 |
181 | // Warnf prints warning level message with format.
182 | func (l *Logger) Warnf(format string, v ...interface{}) {
183 | if Warn < l.level {
184 | return
185 | }
186 |
187 | l.logger.SetPrefix("W ")
188 | l.logger.Output(2, fmt.Sprintf(format, v...))
189 | }
190 |
191 | // Error prints error level message.
192 | func (l *Logger) Error(v ...interface{}) {
193 | if Error < l.level {
194 | return
195 | }
196 |
197 | l.logger.SetPrefix("E ")
198 | l.logger.Output(2, fmt.Sprint(v...))
199 | }
200 |
201 | // Errorf prints error level message with format.
202 | func (l *Logger) Errorf(format string, v ...interface{}) {
203 | if Error < l.level {
204 | return
205 | }
206 |
207 | l.logger.SetPrefix("E ")
208 | l.logger.Output(2, fmt.Sprintf(format, v...))
209 | }
210 |
211 | // Fatal prints fatal level message.
212 | func (l *Logger) Fatal(v ...interface{}) {
213 | if Fatal < l.level {
214 | return
215 | }
216 |
217 | l.logger.SetPrefix("F ")
218 | l.logger.Output(2, fmt.Sprint(v...))
219 | os.Exit(1)
220 | }
221 |
222 | // Fatalf prints fatal level message with format.
223 | func (l *Logger) Fatalf(format string, v ...interface{}) {
224 | if Fatal < l.level {
225 | return
226 | }
227 |
228 | l.logger.SetPrefix("F ")
229 | l.logger.Output(2, fmt.Sprintf(format, v...))
230 | os.Exit(1)
231 | }
232 |
--------------------------------------------------------------------------------