├── .gitignore
├── LICENSE
├── README.en.md
├── README.md
├── crawler
│   ├── charset.go
│   ├── config.go
│   ├── main.go
│   ├── page_parser.go
│   ├── request.go
│   ├── task_queue.go
│   ├── transform.go
│   └── util.go
├── doc
│   ├── golang写入文件失败, invalid arguement.md
│   ├── golang协程池.md
│   ├── goquery页面编码处理(二)-HTML字符实体.md
│   ├── goquery页面编码处理.md
│   ├── sqlite3并发写入.md
│   └── src
│       └── screenshot.jpg
├── docker-compose.yml
├── docker
│   ├── nginx.conf
│   └── readme.md
├── main.go
├── model
│   ├── model.go
│   └── url_record.go
└── util
    ├── go_pool.go
    └── log.go

/.gitignore:
--------------------------------------------------------------------------------
sites
site.db
site.db-journal
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 general

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.en.md:
--------------------------------------------------------------------------------
# site-mirror-go

#### Description
A website mirroring tool that crawls all of a site's resources for local browsing.

#### Software Architecture
Software architecture description

#### Installation

1. xxxx
2. xxxx
3. xxxx

#### Instructions

1. xxxx
2. xxxx
3. xxxx

#### Contribution

1. Fork the repository
2. Create Feat_xxx branch
3. Commit your code
4. Create Pull Request


#### Gitee Feature

1. You can use Readme\_XXX.md to support different languages, such as Readme\_en.md, Readme\_zh.md
2. Gitee blog [blog.gitee.com](https://blog.gitee.com)
3. Explore open source project [https://gitee.com/explore](https://gitee.com/explore)
4. The most valuable open source project [GVP](https://gitee.com/gvp)
5. The manual of Gitee [https://gitee.com/help](https://gitee.com/help)
6. The most popular members [https://gitee.com/gitee-stars/](https://gitee.com/gitee-stars/)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# site-mirror-go

A general-purpose crawler and whole-site download tool. It fetches all of a site's resources -- pages, images, css styles and js files -- and stores them under a local target directory.

Features:

1. Configurable crawl depth (0 for unlimited, 1 for a single page only)
2. Options to skip downloading images, css, js or font resources
3. A blacklist to block resources from specified links

Once a crawl has finished, the `docker-compose.yml` in this repository can start an nginx container for browsing the mirror locally.

Note: this tool only handles static pages; it cannot capture content loaded dynamically via js (bilibili, for example), so it is generally limited to article, image and news sites.

------

For the Python version, which follows the same implementation logic, see

- [site-mirror-py github](https://github.com/generals-space/site-mirror-py)
- [site-mirror-py gitee](https://gitee.com/generals-space/site-mirror-py)

golang version: 1.11.1+

------

Screenshot

![](./doc/src/screenshot.jpg)
--------------------------------------------------------------------------------
/crawler/charset.go:
--------------------------------------------------------------------------------
package crawler

import (
    "bytes"
    "html"
    "io/ioutil"
    "strings"

    "golang.org/x/text/encoding"
    "golang.org/x/text/encoding/simplifiedchinese"
    "golang.org/x/text/encoding/traditionalchinese"
    "golang.org/x/text/encoding/unicode"
    "golang.org/x/text/transform"
)

// CharsetMap maps charset names to their encodings
var CharsetMap = map[string]encoding.Encoding{
    "utf-8":   unicode.UTF8,
    "gbk":     simplifiedchinese.GBK,
    "gb2312":  simplifiedchinese.GB18030,
    "gb18030": simplifiedchinese.GB18030,
    "big5":    traditionalchinese.Big5,
}

// HTMLCharacterEntitiesMap maps characters to their HTML entities
var HTMLCharacterEntitiesMap = map[string]string{
    "\u00a0": "&nbsp;",
    "©":      "&copy;",
    "®":      "&reg;",
    "™":      "&trade;",
    "¢":      "&cent;",
    "£":      "&pound;",
    "¥":      "&yen;",
    "€":      "&euro;",
    "§":      "&sect;",
}

// ReplaceHTMLCharacterEntities rewrites characters in the page back to their HTML entities,
// so that writing the file does not hit characters the target encoding cannot represent
func ReplaceHTMLCharacterEntities(input string, charset encoding.Encoding) (output string) {
    if charset == unicode.UTF8 {
        output = input
        return
    }
    output = html.UnescapeString(input)
    for char, entity := range HTMLCharacterEntitiesMap {
        output = strings.Replace(output, char, entity, -1)
    }
    return
}

// DecodeToUTF8 decodes the input bytes from the given charset and returns the UTF-8 content.
func DecodeToUTF8(input []byte, charset encoding.Encoding) (output []byte, err error) {
    if charset == unicode.UTF8 {
        output = input
        return
    }
    reader := transform.NewReader(bytes.NewReader(input), charset.NewDecoder())
    output, err = ioutil.ReadAll(reader)
    if err != nil {
        return
    }
    return
}

// EncodeFromUTF8 encodes the input UTF-8 bytes into the given charset and returns the result
func EncodeFromUTF8(input []byte, charset encoding.Encoding) (output []byte, err error) {
    if charset == unicode.UTF8 {
        output = input
        return
    }
    reader := transform.NewReader(bytes.NewReader(input), encoding.ReplaceUnsupported(charset.NewEncoder()))
    output, err = ioutil.ReadAll(reader)
    if err != nil {
        return
    }
    return
}
--------------------------------------------------------------------------------
/crawler/config.go:
--------------------------------------------------------------------------------
package crawler

// Config ...
type Config struct {
    // LinkRatioInSinglePage is the maximum number of links a single page is expected to contain.
    // PageQueueSize is computed as LinkRatioInSinglePage*PageWorkerCount;
    // this guards against the deadlock caused by workers blocking on a full queue,
    // though recursive crawling may still fill the queue up
    LinkRatioInSinglePage int
    PageWorkerCount       int
    AssetWorkerCount      int

    SiteDBPath string
    SitePath   string

    StartPage string
    MainSite  string
    UserAgent string
    // MaxDepth is the crawl depth, counted from level 1: crawling stops past level N.
    // 1 fetches only the start page, 0 means no limit
    MaxDepth int
    // MaxRetryTimes is the maximum number of retries for failed requests (timeouts count as failures)
    MaxRetryTimes int

    OutsiteAsset bool
    NoJs         bool
    NoCSS        bool
    NoImages     bool
    NoFonts      bool
    BlackList    []string
}

// NewConfig returns the default configuration
func NewConfig() (config *Config) {
    config = &Config{
        LinkRatioInSinglePage: 5000,
        PageWorkerCount:       10,
        AssetWorkerCount:      10,

        SiteDBPath: "site.db",
        SitePath:   "sites",

        OutsiteAsset: true,
        NoJs:         true,
        NoCSS:        false,
        NoImages:     false,
        NoFonts:      false,
        BlackList:    []string{},
    }

    return
}
--------------------------------------------------------------------------------
/crawler/main.go:
--------------------------------------------------------------------------------
package crawler

import (
    "bytes"
    "io/ioutil"
    "net/http"
    "net/url"
    "strings"
    "sync"

    "github.com/PuerkitoBio/goquery"
    "github.com/jinzhu/gorm"

    "gitee.com/generals-space/site-mirror-go.git/model"
    "gitee.com/generals-space/site-mirror-go.git/util"
)

var logger *util.Logger

// Crawler ...
type Crawler struct {
    PageQueue  chan *model.URLRecord // page task queue
    AssetQueue chan *model.URLRecord // static asset task queue

    Config        *Config
    DBClient      *gorm.DB
    DBClientMutex *sync.Mutex
}

// NewCrawler creates a Crawler instance
func NewCrawler(config *Config, _logger *util.Logger) (crawler *Crawler, err error) {
    logger = _logger
    pageQueue := make(chan *model.URLRecord, config.PageWorkerCount*config.LinkRatioInSinglePage)
    assetQueue := make(chan *model.URLRecord, config.AssetWorkerCount*config.LinkRatioInSinglePage)
    urlObj, err := url.Parse(config.StartPage)
    if err != nil {
        logger.Errorf("failed to parse start page: url: %s, %s", config.StartPage, err.Error())
        return
    }
    mainSite := urlObj.Host // Host keeps the port if present.
    config.MainSite = mainSite

    dbClient, err := model.GetDB(config.SiteDBPath)
    if err != nil {
        logger.Errorf("failed to initialize database: site db: %s, %s", config.SiteDBPath, err.Error())
        return
    }
    crawler = &Crawler{
        PageQueue:  pageQueue,
        AssetQueue: assetQueue,

        Config:        config,
        DBClient:      dbClient,
        DBClientMutex: &sync.Mutex{},
    }

    err = crawler.LoadTaskQueue()
    if err != nil {
        logger.Errorf("failed to load task queue: %s", err.Error())
        return
    }
    return
}

// Start enqueues the start page and launches the worker goroutines
func (crawler *Crawler) Start() {
    req := &model.URLRecord{
        URL:         crawler.Config.StartPage,
        URLType:     model.URLTypePage,
        Refer:       "",
        Depth:       1,
        FailedTimes: 0,
    }
    crawler.EnqueuePage(req)

    for i := 0; i < crawler.Config.PageWorkerCount; i++ {
        go crawler.GetHTMLPage(i)
    }
    for i := 0; i < crawler.Config.AssetWorkerCount; i++ {
        go crawler.GetStaticAsset(i)
    }
}

// getAndRead requests a page or a static asset and returns the response body and header.
func (crawler *Crawler) getAndRead(req *model.URLRecord) (body []byte, header http.Header, err error) {
    crawler.DBClientMutex.Lock()
    err = model.UpdateURLRecordStatus(crawler.DBClient, req.URL, model.URLTaskStatusPending)
    crawler.DBClientMutex.Unlock()
    if err != nil {
        logger.Infof("failed to update task record: req: %+v, error: %s", req, err.Error())
        return
    }

    if req.FailedTimes > crawler.Config.MaxRetryTimes {
        logger.Infof("too many failures, giving up: req: %+v", req)
        return
    }

    if req.URLType == model.URLTypePage && 0 < crawler.Config.MaxDepth && crawler.Config.MaxDepth < req.Depth {
        logger.Infof("page reached the maximum depth, not fetching: req: %+v", req)
        return
    }

    resp, err := getURL(req.URL, req.Refer, crawler.Config.UserAgent)
    if err != nil {
        logger.Errorf("request failed, re-enqueueing: req: %+v, error: %s", req, err.Error())
        req.FailedTimes++
        if req.URLType == model.URLTypePage {
            crawler.EnqueuePage(req)
        } else {
            crawler.EnqueueAsset(req)
        }
        return
    } else if resp.StatusCode == 404 {
        // Failures are usually 5xx or 403/405; a 404 is hardly worth retrying, so give up directly
        crawler.DBClientMutex.Lock()
        err = model.UpdateURLRecordStatus(crawler.DBClient, req.URL, model.URLTaskStatusFailed)
        crawler.DBClientMutex.Unlock()
        if err != nil {
            logger.Errorf("failed to update task status: req: %+v, error: %s", req, err.Error())
        }
        return
    }
    defer resp.Body.Close()

    header = resp.Header
    body, err = ioutil.ReadAll(resp.Body)

    return
}

// GetHTMLPage is a worker goroutine: it takes tasks off the queue, fetches html pages and parses them
func (crawler *Crawler) GetHTMLPage(num int) {
    for req := range crawler.PageQueue {
        logger.Infof("got page task: %+v", req)

        respBody, _, err := crawler.getAndRead(req)
        // getAndRead also returns an empty body when the retry/depth limit was hit or the target was a 404
        if err != nil || len(respBody) == 0 {
            continue
        }

        // charset handling
        charsetName, err := getPageCharset(respBody)
        if err != nil {
            logger.Errorf("failed to detect page charset: req: %+v, error: %s", req, err.Error())
            continue
        }
        charsetName = strings.ToLower(charsetName)
        logger.Debugf("page charset: %s, req: %+v", charsetName, req)
        charset, exist := CharsetMap[charsetName]
        if !exist {
            logger.Debugf("no matching encoding: charset: %s, req: %+v", charsetName, req)
            continue
        }
        utf8Content, err := DecodeToUTF8(respBody, charset)
        if err != nil {
            logger.Errorf("failed to decode page: req: %+v, error: %s", req, err.Error())
            continue
        }
        utf8Reader := bytes.NewReader(utf8Content)
        htmlDom, err := goquery.NewDocumentFromReader(utf8Reader)
        if err != nil {
            logger.Errorf("failed to build dom tree: req: %+v, error: %s", req, err.Error())
            continue
        }

        logger.Debugf("about to parse page: req: %+v", req)

        if 0 < crawler.Config.MaxDepth && crawler.Config.MaxDepth < req.Depth+1 {
            logger.Infof("page reached the maximum depth, not following new pages: %+v", req)
        } else {
            crawler.ParseLinkingPages(htmlDom, req)
        }
        crawler.ParseLinkingAssets(htmlDom, req)

        logger.Debugf("page parsed, writing to local file: req: %+v", req)

        htmlString, err := htmlDom.Html()
        if err != nil {
            logger.Errorf("failed to get page Html(): req: %+v, error: %s", req, err.Error())
            continue
        }
        htmlString = ReplaceHTMLCharacterEntities(htmlString, charset)
        fileContent, err := EncodeFromUTF8([]byte(htmlString), charset)
        if err != nil {
            logger.Errorf("failed to encode page: req: %+v, error: %s", req, err.Error())
            continue
        }
        fileDir, fileName, err := TransToLocalPath(crawler.Config.MainSite, req.URL, model.URLTypePage)
        if err != nil {
            logger.Errorf("failed to transform to local path: req: %+v, error: %s", req, err.Error())
            continue
        }
        err = WriteToLocalFile(crawler.Config.SitePath, fileDir, fileName, fileContent)
        if err != nil {
            logger.Errorf("failed to write file: req: %+v, error: %s", req, err.Error())
            continue
        }

        logger.Debugf("page task written to local file: req: %+v", req)

        crawler.DBClientMutex.Lock()
        err = model.UpdateURLRecordStatus(crawler.DBClient, req.URL, model.URLTaskStatusSuccess)
        crawler.DBClientMutex.Unlock()
        if err != nil {
            logger.Errorf("failed to update task status: req: %+v, error: %s", req, err.Error())
            continue
        }
        logger.Debugf("page task done: req: %+v", req)
    }
}

// GetStaticAsset is a worker goroutine: it takes tasks off the queue, fetches static assets and stores them
func (crawler *Crawler) GetStaticAsset(num int) {
    for req := range crawler.AssetQueue {
        logger.Infof("got asset task: %+v", req)

        respBody, respHeader, err := crawler.getAndRead(req)
        if err != nil || len(respBody) == 0 {
            continue
        }

        // css files need their links parsed; everything else is stored as-is.
        field, exist := respHeader["Content-Type"]
        // the header value may carry a charset suffix, e.g. "text/css; charset=utf-8"
        if exist && strings.HasPrefix(field[0], "text/css") {
            respBody, err = crawler.parseCSSFile(respBody, req)
            if err != nil {
                logger.Errorf("failed to parse css file: req: %+v, error: %s", req, err.Error())
                continue
            }
        }
        fileDir, fileName, err := TransToLocalPath(crawler.Config.MainSite, req.URL, model.URLTypeAsset)
        if err != nil {
            logger.Errorf("failed to transform to local path: req: %+v, error: %s", req, err.Error())
            continue
        }

        err = WriteToLocalFile(crawler.Config.SitePath, fileDir, fileName, respBody)
        if err != nil {
            logger.Errorf("failed to write file: req: %+v, error: %s", req, err.Error())
            continue
        }
        logger.Debugf("asset task written to local file: req: %+v", req)

        crawler.DBClientMutex.Lock()
        err = model.UpdateURLRecordStatus(crawler.DBClient, req.URL, model.URLTaskStatusSuccess)
        crawler.DBClientMutex.Unlock()
        if err != nil {
            logger.Errorf("failed to update task status: req: %+v, error: %s", req, err.Error())
            continue
        }
        logger.Debugf("asset task done: req: %+v", req)
    }
}
--------------------------------------------------------------------------------
/crawler/page_parser.go:
--------------------------------------------------------------------------------
package crawler

import (
    "strings"

    "gitee.com/generals-space/site-mirror-go.git/model"
    "github.com/PuerkitoBio/goquery"
)

// ParseLinkingPages parses and rewrites the page links in a document, covering elements such as a and iframe
func (crawler *Crawler) ParseLinkingPages(htmlDom *goquery.Document, req *model.URLRecord) {
    aList := htmlDom.Find("a")
    crawler.parseLinkingPages(aList, req, "href")
}

// parseLinkingPages walks the selected nodes, records each link as a new task and rewrites the node's link attribute.
func (crawler *Crawler) parseLinkingPages(nodeList *goquery.Selection, req *model.URLRecord, attrName string) {
    // nodeList.Nodes holds the elements matched by the current selector
    nodeList.Each(func(i int, nodeItem *goquery.Selection) {
        subURL, exist := nodeItem.Attr(attrName)
        if !exist || emptyLinkPattern.MatchString(subURL) {
            return
        }

        fullURL, fullURLWithoutFrag := joinURL(req.URL, subURL)
        if !URLFilter(fullURL, model.URLTypePage, crawler.Config) {
            return
        }
        localLink, err := TransToLocalLink(crawler.Config.MainSite, fullURL, model.URLTypePage)
        if err != nil {
            return
        }
        nodeItem.SetAttr(attrName, localLink)

        // enqueue the new task
        req := &model.URLRecord{
            URL:     fullURLWithoutFrag,
            URLType: model.URLTypePage,
            Refer:   req.URL,
            Depth:   req.Depth + 1,
        }
        crawler.EnqueuePage(req)
    })
}

// ParseLinkingAssets parses and rewrites the static asset links in a page, covering elements such as js, css and img
func (crawler *Crawler) ParseLinkingAssets(htmlDom *goquery.Document, req *model.URLRecord) {
    linkList := htmlDom.Find("link")
    crawler.parseLinkingAssets(linkList, req, "href")

    scriptList := htmlDom.Find("script")
    crawler.parseLinkingAssets(scriptList, req, "src")

    imgList := htmlDom.Find("img")
    crawler.parseLinkingAssets(imgList, req, "src")

    videoList := htmlDom.Find("video")
    crawler.parseLinkingAssets(videoList, req, "src")

    audioList := htmlDom.Find("audio")
    crawler.parseLinkingAssets(audioList, req, "src")
}

func (crawler *Crawler) parseLinkingAssets(nodeList *goquery.Selection, req *model.URLRecord, attrName string) {
    // nodeList.Nodes holds the elements matched by the current selector
    nodeList.Each(func(i int, nodeItem *goquery.Selection) {
        subURL, exist := nodeItem.Attr(attrName)
        if !exist || emptyLinkPattern.MatchString(subURL) {
            return
        }

        fullURL, fullURLWithoutFrag := joinURL(req.URL, subURL)
        if !URLFilter(fullURL, model.URLTypeAsset, crawler.Config) {
            return
        }
        localLink, err := TransToLocalLink(crawler.Config.MainSite, fullURL, model.URLTypeAsset)
        if err != nil {
            return
        }
        nodeItem.SetAttr(attrName, localLink)

        // enqueue the new task
        req := &model.URLRecord{
            URL:     fullURLWithoutFrag,
            URLType: model.URLTypeAsset,
            Refer:   req.URL,
            Depth:   req.Depth + 1,
        }
        crawler.EnqueueAsset(req)
    })
}

// parseCSSFile parses the links inside a css file, fetching the assets and rewriting their reference paths.
// A css file may reference paths through the url property or background-image,
// in forms like url('./bg.jpg'), url("./bg.jpg") or url(bg.jpg)
func (crawler *Crawler) parseCSSFile(content []byte, req *model.URLRecord) (newContent []byte, err error) {
    fileStr := string(content)
    // FindAllStringSubmatch returns a slice holding every matched string;
    // each member is itself a slice like the result of FindStringSubmatch(), describing the group matches.
    matchedArray := cssAssetURLPattern.FindAllStringSubmatch(fileStr, -1)
    for _, matchedItem := range matchedArray {
        for _, matchedURL := range matchedItem[1:] {
            if matchedURL == "" || emptyLinkPattern.MatchString(matchedURL) {
                continue
            }
            fullURL, fullURLWithoutFrag := joinURL(req.URL, matchedURL)
            if !URLFilter(fullURL, model.URLTypeAsset, crawler.Config) {
                // skip filtered urls instead of aborting the whole file
                continue
            }
            localLink, err := TransToLocalLink(crawler.Config.MainSite, fullURL, model.URLTypeAsset)
            if err != nil {
                continue
            }
            fileStr = strings.Replace(fileStr, matchedURL, localLink, -1)
            // enqueue the new task
            req := &model.URLRecord{
                URL:     fullURLWithoutFrag,
                URLType: model.URLTypeAsset,
                Refer:   req.URL,
                Depth:   req.Depth + 1,
            }
            crawler.EnqueueAsset(req)
        }
    }
    newContent = []byte(fileStr)
    return
}
--------------------------------------------------------------------------------
/crawler/request.go:
--------------------------------------------------------------------------------
package crawler

import (
    "bytes"
    "net/http"
    "net/url"
    "os"
    "path"
    "regexp"
    "strings"

    "gitee.com/generals-space/site-mirror-go.git/model"
    "github.com/PuerkitoBio/goquery"
)

func getURL(url, refer, ua string) (resp *http.Response, err error) {
    client := &http.Client{}
    req, err := http.NewRequest("GET", url, nil)
    if err != nil {
        logger.Errorf("failed to build request: url: %s, %s", url, err.Error())
        return
    }
    req.Header.Set("User-Agent", ua)
    req.Header.Set("Referer", refer)

    resp, err = client.Do(req)
    if err != nil {
        logger.Errorf("request failed: url: %s, refer: %s, error: %s", url, refer, err.Error())
        return
    }
    return
}

func joinURL(baseURL, subURL string) (fullURL, fullURLWithoutFrag string) {
    baseURLObj, _ := url.Parse(baseURL)
    subURLObj, _ := url.Parse(subURL)
    fullURLObj := baseURLObj.ResolveReference(subURLObj)
    fullURL = fullURLObj.String()
    fullURLObj.Fragment = ""
    fullURLWithoutFrag = fullURLObj.String()
    return
}

// getPageCharset parses the page and extracts its charset information
func getPageCharset(body []byte) (charset string, err error) {
    bodyReader := bytes.NewReader(body)
    dom, err := goquery.NewDocumentFromReader(bodyReader)
    if err != nil {
        logger.Errorf("failed to build dom tree: %s", err.Error())
        return
    }
    var metaInfo string
    var exist bool
    metaInfo, exist = dom.Find("meta[charset]").Attr("charset")
    if exist {
        charset = metaInfo
        return
    }
    metaInfo, exist = dom.Find("meta[http-equiv]").Attr("content")
    if exist {
        // FindStringSubmatch returns a slice whose first member is the full match
        // and whose remaining members are the submatches of each group.
        // It is roughly equivalent to FindAllStringSubmatch(metaInfo, 1), looking for a single match.
        matchedArray := charsetPattern.FindStringSubmatch(metaInfo)
        for _, matchedItem := range matchedArray[1:] {
            if matchedItem != "" {
                charset = matchedItem
                return
            }
        }
    }
    charset = "utf-8"
    return
}

// URLFilter reports whether fullURL should be fetched under the current config
func URLFilter(fullURL string, urlType int, config *Config) (ok bool) {
    urlObj, err := url.Parse(fullURL)
    if err != nil {
        logger.Errorf("failed to parse url: %s, %s", fullURL, err.Error())
        return
    }
    if urlType == model.URLTypePage && urlObj.Host != config.MainSite {
        logger.Infof("not fetching off-site page: %s", fullURL)
        return
    }
    if urlType == model.URLTypeAsset && urlObj.Host != config.MainSite && !config.OutsiteAsset {
        logger.Infof("not fetching off-site asset: %s", fullURL)
        return
    }
    if urlType == model.URLTypeAsset && strings.HasSuffix(fullURL, ".js") && config.NoJs {
        logger.Infof("not fetching js asset: %s", fullURL)
        return
    }
    if urlType == model.URLTypeAsset && strings.HasSuffix(fullURL, ".css") && config.NoCSS {
        logger.Infof("not fetching css asset: %s", fullURL)
        return
    }
    if urlType == model.URLTypeAsset && imagePattern.MatchString(fullURL) && config.NoImages {
        logger.Infof("not fetching image asset: %s", fullURL)
        return
    }
    if urlType == model.URLTypeAsset && fontPattern.MatchString(fullURL) && config.NoFonts {
        logger.Infof("not fetching font asset: %s", fullURL)
        return
    }
    for _, rule := range config.BlackList {
        pattern := regexp.MustCompile(rule)
        if pattern.MatchString(fullURL) {
            logger.Infof("not fetching blacklisted url: %s", fullURL)
            return
        }
    }
    return true
}

// WriteToLocalFile ...
func WriteToLocalFile(baseDir string, fileDir string, fileName string, fileContent []byte) (err error) {
    fileDir = path.Join(baseDir, fileDir)
    err = os.MkdirAll(fileDir, os.ModePerm)
    if err != nil {
        logger.Errorf("failed to create directory: %s, %s", fileDir, err.Error())
        return
    }
    filePath := path.Join(fileDir, fileName)
    file, err := os.Create(filePath)
    if err != nil {
        logger.Errorf("failed to create file: %s, %s", filePath, err.Error())
        return
    }
    defer file.Close()

    _, err = file.Write(fileContent)
    if err != nil {
        logger.Errorf("failed to write file: %s", err.Error())
    }
    return
}
--------------------------------------------------------------------------------
/crawler/task_queue.go:
--------------------------------------------------------------------------------
package crawler

import "gitee.com/generals-space/site-mirror-go.git/model"

// LoadTaskQueue initializes the task queues, loading the unfinished page and
// asset tasks cached in the database back into the in-memory queues
func (crawler *Crawler) LoadTaskQueue() (err error) {
    logger.Info("initializing task queues")
    crawler.DBClientMutex.Lock()
    pageTasks, err := model.QueryUnfinishedPageTasks(crawler.DBClient)
    crawler.DBClientMutex.Unlock()
    if err != nil {
        logger.Errorf("failed to query page tasks: %s", err.Error())
        return
    }

    logger.Debugf("unfinished page tasks: %d", len(pageTasks))
    for _, task := range pageTasks {
        crawler.PageQueue <- task
        // crawler.EnqueuePage(task)
    }

    crawler.DBClientMutex.Lock()
    assetTasks, err := model.QueryUnfinishedAssetTasks(crawler.DBClient)
    crawler.DBClientMutex.Unlock()
    if err != nil {
        logger.Errorf("failed to query asset tasks: %s", err.Error())
        return
    }

    logger.Debugf("unfinished asset tasks: %d", len(assetTasks))
    for _, task := range assetTasks {
        crawler.AssetQueue <- task
        // crawler.EnqueueAsset(task)
    }
    logger.Infof("task queues initialized, page tasks: %d, asset tasks: %d", len(crawler.PageQueue), len(crawler.AssetQueue))
    return
}

// EnqueuePage puts a page task onto the queue and persists it; an existing record is reset rather than duplicated.
// A task that entered the queue always has a record, but may not download successfully.
// Because the queue is bounded, the send may block, and a deadlock is possible:
// each page worker enqueues every link it parses out of a page, so when the queue
// is full the worker blocks -- once every worker blocks here, the program can make no further progress.
func (crawler *Crawler) EnqueuePage(req *model.URLRecord) {
    var err error

    crawler.PageQueue <- req

    crawler.DBClientMutex.Lock()
    defer crawler.DBClientMutex.Unlock()

    err = model.AddOrUpdateURLRecord(crawler.DBClient, req)
    if err != nil {
        logger.Errorf("failed to add (or update) page task record, req: %+v, err: %s", req, err.Error())
        return
    }
    return
}

// EnqueueAsset puts a static asset task onto the queue and persists it;
// an existing record is reset rather than duplicated.
func (crawler *Crawler) EnqueueAsset(req *model.URLRecord) {
    var err error
    // the queue is bounded, so the send may block
    crawler.AssetQueue <- req

    crawler.DBClientMutex.Lock()
    defer crawler.DBClientMutex.Unlock()

    err = model.AddOrUpdateURLRecord(crawler.DBClient, req)
    if err != nil {
        logger.Errorf("failed to add (or update) asset task record, req: %+v, err: %s", req, err.Error())
        return
    }
    return
}
--------------------------------------------------------------------------------
/crawler/transform.go:
--------------------------------------------------------------------------------
package crawler

import (
    "net/url"
    "path"
    "strings"

    "gitee.com/generals-space/site-mirror-go.git/model"
)

// TransToLocalLink ...
// @return: localLink, the local link written into the link/script/img/a attributes
// of the saved html, formatted as a root path starting with a slash /.
func TransToLocalLink(mainSite string, fullURL string, urlType int) (localLink string, err error) {
    // Resources under the main host are stored in the output root rather than
    // in a per-domain folder. No main host is set by default
    urlObj, err := url.Parse(fullURL)
    if err != nil {
        logger.Errorf("failed to parse URL: %s", err.Error())
        return
    }
    originHost := urlObj.Host

    if urlType == model.URLTypePage {
        localLink = transToLocalLinkForPage(urlObj)
    } else {
        localLink = transToLocalLinkForAsset(urlObj)
    }

    // A url under the current site's domain needs no extra per-domain directory.
    // Urls of other sites (downloading their static assets must be enabled in the config)
    // are stored under a directory named after the site domain, keeping the domain in the path.
    if originHost != mainSite {
        host := originHost
        // originHost may carry a port, whose colon needs escaping.
        host = strings.Replace(host, ":", SpecialCharsMap[":"], -1)
        localLink = "/" + host + localLink
    }
    /*
        // The url may contain Chinese characters (not only in the query), which would need decoding.
        localLink, err = url.QueryUnescape(localLink)
        if err != nil {
            logger.Errorf("failed to unescape URL: localLink: %s, %s", localLink, err.Error())
            return
        }
    */
    return
}

func transToLocalLinkForPage(urlObj *url.URL) (localLink string) {
    originPath := urlObj.Path
    originQuery := urlObj.RawQuery

    localLink = originPath

    // path is empty
    if localLink == "" {
        localLink = "index.html"
    }
    // path ends with a slash
    if strings.HasSuffix(localLink, "/") {
        localLink += "index.html"
    }

    // replace special characters in the query string
    if originQuery != "" {
        queryStr := originQuery
        for key, val := range SpecialCharsMap {
            queryStr = strings.Replace(queryStr, key, val, -1)
        }
        localLink = localLink + SpecialCharsMap["?"] + queryStr
    }

    // Unsupported page suffixes such as .php, .jsp, .asp get an extra .html suffix.
    // Note that localLink may already have the query string appended at this point.
    if !htmlURLPattern.MatchString(localLink) {
        localLink += ".html"
    }

    return
}

func transToLocalLinkForAsset(urlObj *url.URL) (localLink string) {
    originPath := urlObj.Path
    originQuery := urlObj.RawQuery

    localLink = originPath

    // path is empty
    if localLink == "" {
        localLink = "index"
    }
    // path ends with a slash
    if strings.HasSuffix(localLink, "/") {
        localLink += "index"
    }

    // replace special characters in the query string
    if originQuery != "" {
        queryStr := originQuery
        for key, val := range SpecialCharsMap {
            queryStr = strings.Replace(queryStr, key, val, -1)
        }
        localLink = localLink + SpecialCharsMap["?"] + queryStr
    }

    return
}

// TransToLocalPath ...
// @return: the local directory and file name used for writing the local file
func TransToLocalPath(mainSite string, fullURL string, urlType int) (fileDir string, fileName string, err error) {
    localLink, err := TransToLocalLink(mainSite, fullURL, urlType)
    if err != nil {
        return
    }

    // For off-site resources localLink may look like /www.xxx.com/static/x.jpg,
    // but the storage directory must be a relative path, so strip the leading slash first.
    if strings.HasPrefix(localLink, "/") {
        localLink = localLink[1:]
    }

    fileDir = path.Dir(localLink)
    fileName = path.Base(localLink)
    return
}
--------------------------------------------------------------------------------
/crawler/util.go:
--------------------------------------------------------------------------------
package crawler

import "regexp"

// SpecialCharsMap maps special characters in query strings to safe replacements
var SpecialCharsMap = map[string]string{
    "\\": "xg",
    ":":  "mh",
    "*":  "xh",
    "?":  "wh",
    "<":  "xy",
    ">":  "dy",
    "|":  "sx",
    " ":  "kg",
}

// Only paths ending with the following suffixes can be served directly as static files.
// Others such as .php, .jsp, .asp cannot be displayed unless nginx knows how to handle them.
var htmlURLPatternStr = `\.((html)|(htm)|(xhtml)|(xml))$`
var htmlURLPattern = regexp.MustCompile(htmlURLPatternStr)

var imagePatternStr = `\.((jpg)|(png)|(bmp)|(jpeg)|(gif)|(webp))$`
var imagePattern = regexp.MustCompile(imagePatternStr)

var fontPatternStr = `\.((ttf)|(woff)|(woff2)|(otf)|(eot))$`
var fontPattern = regexp.MustCompile(fontPatternStr)

// charsetPatternInDOMStr extracts the charset from the content attribute of a meta[http-equiv] element,
// e.g. <meta http-equiv="Content-Type" content="text/html; charset=gbk"/>
var charsetPatternInDOMStr = `charset\s*=\s*(\S*)\s*;?`

// charsetPattern: the plain MatchString accepts a pattern string directly, without Compile,
// but it can only report whether there is a match, not extract anything from it.
var charsetPattern = regexp.MustCompile(charsetPatternInDOMStr)

var cssAssetURLPatternStr = `url\(\'(.*?)\'\)|url\(\"(.*?)\"\)|url\((.*?)\)`
var cssAssetURLPattern = regexp.MustCompile(cssAssetURLPatternStr)

var emptyLinkPatternStr = `(^data:)|(^mailto:)|(about:blank)|(javascript:)`
var emptyLinkPattern = regexp.MustCompile(emptyLinkPatternStr)
--------------------------------------------------------------------------------
/doc/golang写入文件失败, invalid arguement.md:
--------------------------------------------------------------------------------
# golang file write fails with "invalid argument"

```
file, _ := os.OpenFile(filePath, os.O_RDWR, os.ModePerm)
_, err = file.Write(fileContent)
```

Here err is non-nil, with the value `invalid argument`.

It turned out the open flags were wrong: creating a file also requires `O_CREATE` -- or simply use the `os.Create()` function.
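
A minimal sketch of the corrected call, assuming the same `filePath` and `fileContent` as above:

```go
// add O_CREATE (and usually O_TRUNC) to the flags so a missing file is created
file, err := os.OpenFile(filePath, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0644)
if err != nil {
	return err
}
defer file.Close()
_, err = file.Write(fileContent)
```

`os.Create()` is shorthand for the same thing: it is documented as equivalent to `os.OpenFile(name, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0666)`.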
--------------------------------------------------------------------------------
/doc/golang协程池.md:
--------------------------------------------------------------------------------
# golang goroutine pools

golang has no built-in goroutine pool. A basic pool is admittedly simple: a `for{}` loop creates the desired number of `goroutine`s, each of which runs a `for{}` loop listening on the same channel, and together they work through that channel's queue. A real pool, however, is not that simple.

For example, when a goroutine exits because of a failure, how do you top the pool back up? Otherwise they exit one after another until none are left...

Or: how do you resize the pool dynamically, or count the busy/idle goroutines?

These questions have to be considered.
--------------------------------------------------------------------------------
/doc/goquery页面编码处理(二)-HTML字符实体.md:
--------------------------------------------------------------------------------
# goquery page encoding handling (part 2) - HTML character entities

1. [ReplaceWithHtml sees html entity where there is none](https://github.com/PuerkitoBio/goquery/issues/113)

2. [Should not escape &](https://github.com/PuerkitoBio/goquery/issues/109)

3. [Don't change &nbsp; to space in Html()](https://github.com/PuerkitoBio/goquery/issues/28)

4. [w3school HTML character entities](http://www.w3school.com.cn/html/html_entities.asp)

Continuing from the problems in the previous article.

The previous article settled on the third-party encoding tool [mahonia](https://github.com/axgle/mahonia) for encoding and decoding, but `&nbsp;` got rendered as □, which was a pity.

On closer inspection there were still quite a few problems:

1. Some symbols in the string returned by `Html()` are output as character entity codes, e.g. `'` -> `&#39;`, `&` -> `&amp;`. An `a` tag for `加入收藏` came out mangled this way.

2. The `&nbsp;` entity never appears in the `Html()` result at all; it is converted into an invisible character.

------

First, the character entities in the `Html()` result have to be resolved -- after all, this part is not an encoding problem. That led to references 1 and 2, both official goquery issues.

The author explains this is not caused by goquery itself but by the `/x/net/html` package it uses, which escapes certain characters on its own (I am still unclear on the exact escaping rules), so I chose the `html.UnescapeString()` function to turn those entities back into what I want.

------

Then the `&nbsp;` problem. `&nbsp;` never shows up in the `Html()` result, unlike the `'` -> `&#39;` behavior above. But it is not converted into a traditional whitespace (`\u0020`) either -- it becomes a non-breaking space (`\u00a0`). See reference 3 (the definition of non-breaking space is in reference 4).

In reference 3 the author mentions that the `TestNbsp()` function in `property_test.go` can detect the `&nbsp;` character in a page. I used the following statement to replace it.

```go
strings.Replace(output, "\u00a0", "&nbsp;", -1)
```

Besides `\u00a0`, characters like the copyright sign `©` also need converting back.

Presumably goquery renders the page once when `Html()` is called, turning entities for characters GBK cannot represent into `©`, `\u00a0` and the like; we have to convert them all back before writing to file.

...damn, this took 3 days.
--------------------------------------------------------------------------------
/doc/goquery页面编码处理.md:
--------------------------------------------------------------------------------
# goquery page encoding handling

1. [goquery - Handle Non-UTF8 html Pages](https://github.com/PuerkitoBio/goquery/wiki/Tips-and-tricks)

2. [Adding GBK support to goquery](https://blog.csdn.net/jrainbow/article/details/52712685)

3. [An introduction to character encodings in Golang](https://www.cnblogs.com/yinzhengjie/p/7956689.html)

4. [Garbled pages when scraping with colly](https://studygolang.com/topics/6745)

5. [[go] "rune not supported by encoding" when converting utf8 to sjis](https://teratail.com/questions/106106)

6. [Best way to translate UTF-8 to ISO8859-1 in Go](https://stackoverflow.com/questions/47660160/best-way-to-translate-utf-8-to-iso8859-1-in-go)

goquery's `NewDocumentXXX()` functions only accept utf-8 page content by default; encoding conversion is left to the user. (From a quick check, colly, another dom parsing tool, behaves the same.)

Someone raised the issue of `GBK` and `CJK (Chinese, Japanese, Korean)` support; the goquery author's reply is in the wiki, reference 1 here. It suggests the [iconv-go](https://github.com/djimenez/iconv-go) package, which actually wraps the C `iconv` function and therefore needs cgo; I could not install it on Windows, never tried it on Linux, and gave up.

The pages I crawl only come in a handful of encodings:

```go
var CharsetMap = map[string]encoding.Encoding{
	"utf-8":   unicode.UTF8,
	"gbk":     simplifiedchinese.GBK,
	"gb2312":  simplifiedchinese.GB18030,
	"gb18030": simplifiedchinese.GB18030,
	"big5":    traditionalchinese.Big5,
}
```

So, following references 2 and 3, I tried the straightforward encode/decode operations of `golang.org/x/text`.

```go
// DecodeToUTF8 decodes the input bytes from the given charset and returns the UTF-8 content.
func DecodeToUTF8(input []byte, charset encoding.Encoding) (output []byte, err error) {
	reader := transform.NewReader(bytes.NewReader(input), charset.NewDecoder())
	output, err = ioutil.ReadAll(reader)
	if err != nil {
		return
	}
	return
}

// EncodeFromUTF8 encodes the input UTF-8 bytes into the given charset and returns the result
func EncodeFromUTF8(input []byte, charset encoding.Encoding) (output []byte, err error) {
	reader := transform.NewReader(bytes.NewReader(input), charset.NewEncoder())
	output, err = ioutil.ReadAll(reader)
	if err != nil {
		return
	}
	return
}
```

In isolated experiments this behaved fine: encoding, decoding and file I/O all worked.

But after building the dom with goquery's `NewDocument()` from the http response (already decoded to utf-8) and getting the page content via `Html()`, I wanted to write that content back in the page's original encoding.

The `EncodeFromUTF8()` call then failed with `encoding: rune not supported by encoding.`

The cause is a character that exists in utf-8 but not in gbk, see reference 5. It turned out to be the `&nbsp;` character in goquery-processed pages, failing at the point where it is converted back to the original encoding.

------

I considered switching to colly, but colly has the same problem. Reference 4, however, pointed to [mahonia](https://github.com/axgle/mahonia), and a quick trial went reasonably well.

Still, `&nbsp;` written to file in the original encoding became a box glyph...

```
□
```

Following the hint in reference 6, the `/x/text` package has a `ReplaceUnsupported()` function; `EncodeFromUTF8()` rewritten as follows

```go
// EncodeFromUTF8 encodes the input UTF-8 bytes into the given charset and returns the result
func EncodeFromUTF8(input []byte, charset encoding.Encoding) (output []byte, err error) {
	if charset == unicode.UTF8 {
		output = input
		return
	}
	reader := transform.NewReader(bytes.NewReader(input), encoding.ReplaceUnsupported(charset.NewEncoder()))
	output, err = ioutil.ReadAll(reader)
	if err != nil {
		return
	}
	return
}
```

This skips the unsupported characters -- the result is still □, but at least there is no error, and no more need for `mahonia`.
--------------------------------------------------------------------------------
/doc/sqlite3并发写入.md:
--------------------------------------------------------------------------------
# Concurrent writes with sqlite3

Before the database operations were guarded by a lock, multiple goroutines would take tasks from the channel queue and then delete each task's persisted record, yet some records ended up not deleted. The reason was baffling: querying right after the delete showed the record already gone, and the set of leftover records changed on every run.

I then tried putting a lock around each table's writes, even splitting the code into `page_task.go` and `asset_task.go` for it, but that produced `database is locked` errors at runtime...

Digging further: sqlite3 does not support table-level locks, only database-level locks.

What does that mean?

A connection created with `Open` (actually a connection pool) can only be used by a single goroutine at a time to operate on the database, even when different goroutines touch different tables.

This differs from regular multi-threaded database usage: open a pool, hold several real connections to the database, and let each goroutine grab a free connection from the pool (gorm supports multiple goroutines and is concurrency-safe), with no extra locking needed.

Then again, running postgres or mysql alongside such a small program is too heavy, so it stays like this for now -- at least it is safe.

...There is a distributed database built on sqlite and written in golang, [rqlite](https://github.com/rqlite/rqlite); perhaps it suits lightweight local storage, worth trying some day.
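
The fix adopted in this repository simply serializes every write through one shared mutex. A minimal sketch of the pattern, assuming the `URLRecord` model from `model/model.go` (`updateStatus` and `dbMutex` are illustrative names; the real calls live in the crawler, guarded by `Crawler.DBClientMutex`):

```go
package model

import (
	"sync"

	"github.com/jinzhu/gorm"
)

var dbMutex sync.Mutex

// updateStatus shows the pattern: every goroutine that writes through the
// shared *gorm.DB takes the same mutex first, so the single sqlite database
// is never written to concurrently.
func updateStatus(db *gorm.DB, url string, status int) error {
	dbMutex.Lock()
	defer dbMutex.Unlock()
	return db.Model(&URLRecord{}).Where("url = ?", url).UpdateColumn("status", status).Error
}
```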
--------------------------------------------------------------------------------
/doc/src/screenshot.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/generals-space/site-mirror-go/fce466e9cf568e7ab3b1e60030b300b04f8c88ec/doc/src/screenshot.jpg
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
version: '3'

services:
  nginx:
    image: nginx
    ports:
    - 8080:80
    volumes:
    - ./docker/nginx.conf:/etc/nginx/conf.d/main.conf
    - ./sites:/usr/share/nginx/html
--------------------------------------------------------------------------------
/docker/nginx.conf:
--------------------------------------------------------------------------------
server{
    listen 80;
    server_name _;

    root /usr/share/nginx/html;
    location / {
        try_files $uri $uri/ /index.html;
    }
}
--------------------------------------------------------------------------------
/docker/readme.md:
--------------------------------------------------------------------------------
windows

```
docker run -it --name site-mirror-go -v %gopath%/src/gitee.com/generals-space/site-mirror-go.git:/project generals/golang_node8 /bin/bash
```

linux

```
docker run -it --name site-mirror-go -v $GOPATH/src/gitee.com/generals-space/site-mirror-go.git:/project generals/golang_node8 /bin/bash
```
--------------------------------------------------------------------------------
/main.go:
--------------------------------------------------------------------------------
package main

import (
    "os"
    "os/signal"
    "syscall"

    "gitee.com/generals-space/site-mirror-go.git/crawler"
    "gitee.com/generals-space/site-mirror-go.git/util"
)

func main() {
    logger := util.NewLogger(os.Stdout)
    // logger.SetLevel("debug")

    config := crawler.NewConfig()
    config.StartPage = "https://www.lewenxiaoshuo.com/"
    config.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
    config.MaxDepth = 1

    c, err := crawler.NewCrawler(config, logger)
    if err != nil {
        panic(err)
    }
    c.Start()
    defer func() {
        logger.Info("canceled by user")
    }()
    // Wait for the user to cancel; the crawler cannot stop on its own yet.
    channel := make(chan os.Signal, 1)
    signal.Notify(channel, syscall.SIGINT, syscall.SIGTERM)
    logger.Info(<-channel)
}
--------------------------------------------------------------------------------
/model/model.go:
--------------------------------------------------------------------------------
package model

import (
    "github.com/jinzhu/gorm"
    _ "github.com/jinzhu/gorm/dialects/sqlite" // blank import registers the sqlite dialect; the comment keeps the editor from flagging it
)

const (
    // URLTaskStatusInit is the initial task status, 0
    URLTaskStatusInit = iota
    // URLTaskStatusPending marks a task taken off the queue whose outcome is not yet known.
    URLTaskStatusPending
    // URLTaskStatusSuccess marks a successful task, 2
    URLTaskStatusSuccess
    // URLTaskStatusFailed marks a failed task (404), 3
    URLTaskStatusFailed
)

const (
    // URLTypePage marks a page task
    URLTypePage int = iota
    // URLTypeAsset marks a static asset task
    URLTypeAsset
)

// URLRecord is the task record table
type URLRecord struct {
    gorm.Model
    URL         string `gorm:"unique;not null"`
    Refer       string
    Depth       int
    URLType     int
    FailedTimes int
    Status      int `gorm:"default:0"`
}

// GetDB opens the database connection
func GetDB(dbPath string) (db *gorm.DB, err error) {
    db, err = gorm.Open("sqlite3", dbPath)
    if err != nil {
        return
    }
    tables := []interface{}{
        &URLRecord{},
    }
    db.AutoMigrate(tables...)
    return
}
--------------------------------------------------------------------------------
/model/url_record.go:
--------------------------------------------------------------------------------
package model

import "github.com/jinzhu/gorm"

// isExistInURLRecord reports whether a task record for the given url already exists
func isExistInURLRecord(db *gorm.DB, url string) bool {
    var err error
    var count int
    err = db.Table("url_records").Where("url = ?", url).Count(&count).Error
    if err != nil || count == 0 {
        return false
    }
    return true
}

// queryUnfinishedTasks ...
func queryUnfinishedTasks(db *gorm.DB, urlType int) (tasks []*URLRecord, err error) {
    tasks = []*URLRecord{}
    err = db.Where("url_type = ? and status in (?)", urlType, []int{URLTaskStatusInit, URLTaskStatusPending}).Find(&tasks).Error
    return
}

// QueryUnfinishedPageTasks ...
func QueryUnfinishedPageTasks(db *gorm.DB) (tasks []*URLRecord, err error) {
    return queryUnfinishedTasks(db, URLTypePage)
}

// QueryUnfinishedAssetTasks ...
func QueryUnfinishedAssetTasks(db *gorm.DB) (tasks []*URLRecord, err error) {
    return queryUnfinishedTasks(db, URLTypeAsset)
}

// AddOrUpdateURLRecord creates a new URLRecord when a task is enqueued
// (or updates the failed_times and status fields if a record already exists)
func AddOrUpdateURLRecord(db *gorm.DB, task *URLRecord) (err error) {
    exist := isExistInURLRecord(db, task.URL)
    if exist {
        whereArgs := map[string]interface{}{
            "url": task.URL,
        }
        dataToBeUpdated := map[string]interface{}{
            "failed_times": task.FailedTimes,
            "status":       URLTaskStatusInit, // re-enqueued tasks go back to the init status
        }
        err = db.Model(&URLRecord{}).Where(whereArgs).Updates(dataToBeUpdated).Error
    } else {
        err = db.Create(task).Error
    }
    return
}

// UpdateURLRecordStatus updates the status of a url task record
func UpdateURLRecordStatus(db *gorm.DB, url string, status int) (err error) {
    urlRecord := &URLRecord{}
    err = db.Where("url = ?", url).First(urlRecord).Error
    if err != nil {
        return
    }

    err = db.Model(urlRecord).UpdateColumn("status", status).Error
    return
}
--------------------------------------------------------------------------------
/util/go_pool.go:
--------------------------------------------------------------------------------
package util
--------------------------------------------------------------------------------
/util/log.go:
--------------------------------------------------------------------------------
package util

// Package util includes logging related manipulations.
//
//   util.SetLevel("debug")
//   logger := util.NewLogger(os.Stdout)
//
//   logger.Trace("trace message")
//   logger.Debug("debug message")
//   logger.Info("info message")
//   logger.Warn("warning message")
//   logger.Error("error message")
//   logger.Fatal("fatal message")
//
//   logger.Errorf("formatted %s message", "error")

import (
    "fmt"
    "io"
    stdlog "log"
    "os"
    "strings"
)

// Logging level.
const (
    Off = iota
    Trace
    Debug
    Info
    Warn
    Error
    Fatal
)

// all loggers.
var loggers []*Logger

// the global default logging level, it will be used for creating logger.
var logLevel = Debug

// Logger represents a simple logger with level.
// The underlying logger is the standard Go logging "log".
type Logger struct {
    level  int
    logger *stdlog.Logger
}

// NewLogger creates a logger.
func NewLogger(out io.Writer) *Logger {
    ret := &Logger{level: logLevel, logger: stdlog.New(out, "", stdlog.Ldate|stdlog.Ltime|stdlog.Lshortfile)}

    loggers = append(loggers, ret)

    return ret
}

// SetLevel sets the logging level of all loggers.
func SetLevel(level string) {
    logLevel = getLevel(level)

    for _, l := range loggers {
        l.SetLevel(level)
    }
}

// getLevel gets logging level int value corresponding to the specified level.
func getLevel(level string) int {
    level = strings.ToLower(level)

    switch level {
    case "off":
        return Off
    case "trace":
        return Trace
    case "debug":
        return Debug
    case "info":
        return Info
    case "warn":
        return Warn
    case "error":
        return Error
    case "fatal":
        return Fatal
    default:
        return Info
    }
}

// SetLevel sets the logging level of a logger.
func (l *Logger) SetLevel(level string) {
    l.level = getLevel(level)
}

// IsTraceEnabled determines whether the trace level is enabled.
func (l *Logger) IsTraceEnabled() bool {
    return l.level <= Trace
}

// IsDebugEnabled determines whether the debug level is enabled.
func (l *Logger) IsDebugEnabled() bool {
    return l.level <= Debug
}

// IsWarnEnabled determines whether the warn level is enabled.
func (l *Logger) IsWarnEnabled() bool {
    return l.level <= Warn
}

// Trace prints trace level message.
func (l *Logger) Trace(v ...interface{}) {
    if Trace < l.level {
        return
    }

    l.logger.SetPrefix("T ")
    l.logger.Output(2, fmt.Sprint(v...))
}

// Tracef prints trace level message with format.
func (l *Logger) Tracef(format string, v ...interface{}) {
    if Trace < l.level {
        return
    }

    l.logger.SetPrefix("T ")
    l.logger.Output(2, fmt.Sprintf(format, v...))
}

// Debug prints debug level message.
func (l *Logger) Debug(v ...interface{}) {
    if Debug < l.level {
        return
    }

    l.logger.SetPrefix("D ")
    l.logger.Output(2, fmt.Sprint(v...))
}

// Debugf prints debug level message with format.
func (l *Logger) Debugf(format string, v ...interface{}) {
    if Debug < l.level {
        return
    }

    l.logger.SetPrefix("D ")
    l.logger.Output(2, fmt.Sprintf(format, v...))
}

// Info prints info level message.
func (l *Logger) Info(v ...interface{}) {
    if Info < l.level {
        return
    }

    l.logger.SetPrefix("I ")
    l.logger.Output(2, fmt.Sprint(v...))
}

// Infof prints info level message with format.
func (l *Logger) Infof(format string, v ...interface{}) {
    if Info < l.level {
        return
    }

    l.logger.SetPrefix("I ")
    l.logger.Output(2, fmt.Sprintf(format, v...))
}

// Warn prints warning level message.
func (l *Logger) Warn(v ...interface{}) {
    if Warn < l.level {
        return
    }

    l.logger.SetPrefix("W ")
    l.logger.Output(2, fmt.Sprint(v...))
}

// Warnf prints warning level message with format.
func (l *Logger) Warnf(format string, v ...interface{}) {
    if Warn < l.level {
        return
    }

    l.logger.SetPrefix("W ")
    l.logger.Output(2, fmt.Sprintf(format, v...))
}

// Error prints error level message.
func (l *Logger) Error(v ...interface{}) {
    if Error < l.level {
        return
    }

    l.logger.SetPrefix("E ")
    l.logger.Output(2, fmt.Sprint(v...))
}

// Errorf prints error level message with format.
func (l *Logger) Errorf(format string, v ...interface{}) {
    if Error < l.level {
        return
    }

    l.logger.SetPrefix("E ")
    l.logger.Output(2, fmt.Sprintf(format, v...))
}

// Fatal prints fatal level message.
func (l *Logger) Fatal(v ...interface{}) {
    if Fatal < l.level {
        return
    }

    l.logger.SetPrefix("F ")
    l.logger.Output(2, fmt.Sprint(v...))
    os.Exit(1)
}

// Fatalf prints fatal level message with format.
func (l *Logger) Fatalf(format string, v ...interface{}) {
    if Fatal < l.level {
        return
    }

    l.logger.SetPrefix("F ")
    l.logger.Output(2, fmt.Sprintf(format, v...))
    os.Exit(1)
}
--------------------------------------------------------------------------------