├── charset_test.go ├── extract ├── content_test.go ├── link_test.go ├── meta_test.go ├── icp_test.go ├── domain_test.go ├── domain.go ├── web_test.go ├── icp.go ├── meta.go ├── web.go ├── link.go └── content.go ├── .gitignore ├── banner.txt ├── go.mod ├── spider_news_test.go ├── http_test.go ├── http.go ├── lang_test.go ├── charset.go ├── go.sum ├── README.md ├── detect_test.go ├── spider.go ├── spider_news.go ├── detect.go ├── spider_test.go ├── lang.go └── LICENSE /charset_test.go: -------------------------------------------------------------------------------- 1 | package spider 2 | -------------------------------------------------------------------------------- /extract/content_test.go: -------------------------------------------------------------------------------- 1 | package extract 2 | -------------------------------------------------------------------------------- /extract/link_test.go: -------------------------------------------------------------------------------- 1 | package extract 2 | 3 | import ( 4 | "fmt" 5 | "regexp" 6 | "testing" 7 | ) 8 | 9 | func TestMatch(t *testing.T) { 10 | m := regexp.MustCompile(`\p{Han}`) 11 | allString := m.FindAllString("123你好,世界asdf", -1) 12 | fmt.Println(allString) 13 | } 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | 8 | # Test binary, built with `go test -c` 9 | *.test 10 | 11 | # Output of the go coverage tool, specifically when used with LiteIDE 12 | *.out 13 | 14 | # Dependency directories (remove the comment below to include it) 15 | # vendor/ 16 | 17 | .idea 18 | .vscode 19 | .setting -------------------------------------------------------------------------------- /extract/meta_test.go: -------------------------------------------------------------------------------- 1 | package extract 2 | 3 | import "testing" 4 | 5 | func TestHostMeta(t *testing.T) { 6 | hosts := []string{ 7 | "matichon.co.th", 8 | "wanbao.com.sg", 9 | "wanbao.com.sg", 10 | "waou.com.mo", 11 | "archives.gov.mo", 12 | "mfa.gov.sg", 13 | "nasa.gov", 14 | } 15 | 16 | for _, host := range hosts { 17 | t.Log(MetaFromHost(host, "")) 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /extract/icp_test.go: -------------------------------------------------------------------------------- 1 | package extract 2 | 3 | import "testing" 4 | 5 | func TestIcpFromText(t *testing.T) { 6 | texts := []string{ 7 | "粤ICP备17055554号", 8 | "粤ICP备17055554-34号", 9 | "沪ICP备05018492", 10 | "粤B2-20090059", 11 | "京公网安备31010402001073号", 12 | "京公网安备-31010-4020010-73号", 13 | "鲁ICP备05002386鲁公网安备37070502000027号", 14 | } 15 | 16 | for _, text := range texts { 17 | icp, loc := IcpFromText(text) 18 | t.Log(icp, loc) 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /banner.txt: -------------------------------------------------------------------------------- 1 | __ _ __ 2 | ____ _____ ____ / /______ _ _________ (_)___/ /__ _____ 3 | / __ `/ __ \______/ __ \/ //_/ __ `/_____/ ___/ __ \/ / __ / _ \/ ___/ 4 | / /_/ / /_/ /_____/ /_/ / ,< / /_/ /_____(__ ) /_/ / / /_/ / __/ / 5 | \__, /\____/ / .___/_/|_|\__, / /____/ .___/_/\__,_/\___/_/ 6 | /____/ /_/ /____/ /_/ 7 | 8 | 9 | -------------------------------------------------------------------------------- /go.mod: 
-------------------------------------------------------------------------------- 1 | module github.com/suosi-inc/go-pkg-spider 2 | 3 | go 1.18 4 | 5 | require ( 6 | github.com/PuerkitoBio/goquery v1.8.1 7 | github.com/microcosm-cc/bluemonday v1.0.26 8 | github.com/suosi-inc/chardet v0.1.0 9 | github.com/suosi-inc/lingua-go v1.0.51 10 | github.com/x-funs/go-fun v0.94.0 11 | golang.org/x/net v0.19.0 12 | ) 13 | 14 | require ( 15 | github.com/andybalholm/cascadia v1.3.2 // indirect 16 | github.com/aymerick/douceur v0.2.0 // indirect 17 | github.com/gorilla/css v1.0.1 // indirect 18 | github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect 19 | golang.org/x/text v0.17.0 // indirect 20 | ) 21 | -------------------------------------------------------------------------------- /extract/domain_test.go: -------------------------------------------------------------------------------- 1 | package extract 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | ) 7 | 8 | func TestDomainParse(t *testing.T) { 9 | domains := []string{ 10 | "www.net.cn", 11 | "hi.chinanews.com", 12 | "a.wh.cn", 13 | "siat.ac.cn", 14 | "abc.spring.io", 15 | "abc.spring.ai", 16 | "www.china-embassy.or.jp", 17 | "whszdj.wh.cn", 18 | "gk.wh.cn", 19 | "xwxc.mwr.cn", 20 | "legismac.safp.gov.mo", 21 | "dezhou.rcsd.cn", 22 | "www.gov.cn", 23 | "scopsr.gov.cn", 24 | "usa.gov", 25 | "bbc.co.uk", 26 | "dealer.auto.sohu.com", 27 | "bbs.sohu.com", 28 | } 29 | 30 | for _, domain := range domains { 31 | t.Log(DomainParse(domain)) 32 | } 33 | } 34 | 35 | func TestDomainTop(t *testing.T) { 36 | domains := []string{ 37 | "www.net.cn", 38 | "hi.chinanews.com", 39 | "a.wh.cn", 40 | "siat.ac.cn", 41 | "abc.spring.io", 42 | "abc.spring.ai", 43 | "www.china-embassy.or.jp", 44 | "whszdj.wh.cn", 45 | "gk.wh.cn", 46 | "xwxc.mwr.cn", 47 | "legismac.safp.gov.mo", 48 | "dezhou.rcsd.cn", 49 | "www.gov.cn", 50 | "scopsr.gov.cn", 51 | "usa.gov", 52 | "bbc.co.uk", 53 | } 54 | 55 | for _, domain := range domains { 56 | t.Log(DomainTop(domain)) 57 | } 58 | } 59 | 60 | func TestDomainTopFromUrl(t *testing.T) { 61 | fmt.Println(DomainTopFromUrl("https://www.google.com")) 62 | fmt.Println(DomainTopFromUrl("https://www.baidu.com/news")) 63 | fmt.Println(DomainTopFromUrl("http://szb.xnnews.com.cn/zhzx/202207/t20220722_2731400.htm")) 64 | } 65 | -------------------------------------------------------------------------------- /extract/domain.go: -------------------------------------------------------------------------------- 1 | package extract 2 | 3 | import ( 4 | "errors" 5 | "strings" 6 | 7 | "github.com/x-funs/go-fun" 8 | "golang.org/x/net/publicsuffix" 9 | ) 10 | 11 | type Domain struct { 12 | Subdomain, Domain, TLD string 13 | ICANN bool 14 | } 15 | 16 | // DomainTop 返回顶级域名 17 | func DomainTop(d string) string { 18 | if d, err := DomainParse(d); err == nil { 19 | return d.Domain + fun.DOT + d.TLD 20 | } 21 | 22 | return "" 23 | } 24 | 25 | // DomainTopFromUrl 解析 URL 返回顶级域名 26 | func DomainTopFromUrl(urlStr string) string { 27 | if d, err := DomainParseFromUrl(urlStr); err == nil { 28 | return d.Domain + "." 
+ d.TLD 29 | } 30 | 31 | return "" 32 | } 33 | 34 | // DomainParse 解析域名, 返回 Domain 35 | func DomainParse(domain string) (*Domain, error) { 36 | if fun.Blank(domain) { 37 | return nil, errors.New("domain is blank") 38 | } 39 | 40 | // etld+1 41 | etld1, err := publicsuffix.EffectiveTLDPlusOne(domain) 42 | _, icann := publicsuffix.PublicSuffix(strings.ToLower(domain)) 43 | if err != nil { 44 | return nil, err 45 | } 46 | 47 | // convert to domain name, and tld 48 | i := strings.Index(etld1, fun.DOT) 49 | domName := etld1[0:i] 50 | tld := etld1[i+1:] 51 | 52 | // and subdomain 53 | sub := "" 54 | if rest := strings.TrimSuffix(domain, "."+etld1); rest != domain { 55 | sub = rest 56 | } 57 | return &Domain{ 58 | Subdomain: sub, 59 | Domain: domName, 60 | TLD: tld, 61 | ICANN: icann, 62 | }, nil 63 | } 64 | 65 | // DomainParseFromUrl 解析域名, 返回 Domain 66 | func DomainParseFromUrl(urlStr string) (*Domain, error) { 67 | u, err := fun.UrlParse(urlStr) 68 | if err != nil { 69 | return nil, err 70 | } 71 | 72 | d := u.Hostname() 73 | 74 | return DomainParse(d) 75 | } 76 | -------------------------------------------------------------------------------- /extract/web_test.go: -------------------------------------------------------------------------------- 1 | package extract 2 | 3 | import ( 4 | "fmt" 5 | "net/url" 6 | "path" 7 | "testing" 8 | "unicode/utf8" 9 | 10 | "github.com/x-funs/go-fun" 11 | ) 12 | 13 | func TestTitleClean(t *testing.T) { 14 | strs := map[string]string{ 15 | "“暴徒试图杀死他!”阿拉木图市长在1月5日的暗杀企图中幸存_网易订阅": "zh", 16 | "“暴徒试图杀死他!”阿拉木图市长在1月5日的暗杀企图中幸存 - 网易订阅": "zh", 17 | "北极圈内最高温达到38℃ 北极熊还好吗?南极情况怎么样?_科技频道_中国青年网": "zh", 18 | "About the Project on Nuclear Issues | Center for Strategic and International Studies": "en", 19 | } 20 | 21 | for str, l := range strs { 22 | t.Log(WebTitleClean(str, l)) 23 | } 24 | } 25 | 26 | func TestUrlQuery(t *testing.T) { 27 | // urlStr := "https://people.com/tag/stories-to-make-you-smile/a/b/abc.html?a=1&b=2&c=3#ddd" 28 | urlStr := "https://vipmail.163.com/index.html?abc=123" 29 | u, err := url.Parse(urlStr) 30 | 31 | fmt.Println(err) 32 | fmt.Println(u.Path) 33 | fmt.Println(u.RawQuery) 34 | fmt.Println(path.Dir(u.Path)) 35 | // fmt.Println(path.Base(u.Path)) 36 | 37 | fmt.Println(utf8.RuneCountInString("https://adx.36kr.com/api/ad/click?sign=2eda7665240cec93f902311eb10c195a¶m.redirectUrl=aHR0cHM6Ly8zNmtyLmNvbS9wLzE4NTM5NTQ2NzgxMzIzNTI¶m.adsdk=Phid2i9VOob6U23ybkDx8q7cr1KbBDM4oiu1d_-C6gY5qf5SKxqBPsptEVMy_wtzqB5Yr08U7ioREUL7HLxIrQ")) 38 | } 39 | 40 | func TestFilterUrl(t *testing.T) { 41 | urlStr := "http://www.163.com/a/b/" 42 | baseUrl, _ := fun.UrlParse(urlStr) 43 | 44 | t.Log(filterUrl("./c/123.html", baseUrl, true)) 45 | t.Log(filterUrl("../c/123.html", baseUrl, true)) 46 | t.Log(filterUrl("/c/123.html", baseUrl, true)) 47 | t.Log(filterUrl("//www.163.com/c/123.html", baseUrl, true)) 48 | t.Log(filterUrl("//www.163.com/c/123.pdf?abc=1123", baseUrl, true)) 49 | } 50 | 51 | func BenchmarkFilterUrl(b *testing.B) { 52 | urlStr := "http://www.163.com/a/b/" 53 | baseUrl, _ := fun.UrlParse(urlStr) 54 | 55 | b.ResetTimer() 56 | 57 | for i := 0; i < b.N; i++ { 58 | filterUrl("https://www.163.com/news/article/HEAJM4F1000189FH.html", baseUrl, true) 59 | 60 | // url.Parse("https://www.163.com/news/article/HEAJM4F1000189FH.html") 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /spider_news_test.go: -------------------------------------------------------------------------------- 1 | package spider 2 | 3 | import ( 4 | 
"crypto/tls" 5 | "fmt" 6 | "net/http" 7 | "net/url" 8 | "testing" 9 | 10 | "github.com/x-funs/go-fun" 11 | ) 12 | 13 | var ( 14 | newUrl = "http://www.cankaoxiaoxi.com/" 15 | newUrl_domain = "cankaoxiaoxi.com" 16 | overseaUrl = "https://www.bbc.com/news" 17 | ) 18 | 19 | func TestNews_GetLinkRes_Noctx(t *testing.T) { 20 | n := NewNewsSpider(newUrl, 2, processLink, nil, WithRetryTime(1), WithTimeOut(10000)) 21 | n.GetLinkRes() 22 | } 23 | 24 | func TestNews_GetLinkRes(t *testing.T) { 25 | ctx := "getLinkRes" 26 | n := NewNewsSpider(newUrl, 2, processLink, ctx, WithRetryTime(1), WithTimeOut(10000)) 27 | n.RetryTime = 1 28 | n.Depth = 2 29 | n.GetLinkRes() 30 | } 31 | 32 | func TestNews_GetLinkRes_Clone(t *testing.T) { 33 | ctx := "getLinkRes" 34 | n := NewNewsSpider(newUrl, 2, processLink, ctx) 35 | 36 | nc := n.Clone().(*NewsSpider) 37 | nc.Ctx = "getLinkRes_Clone" 38 | nc.GetLinkRes() 39 | } 40 | 41 | func processLink(data ...any) { 42 | newsData := data[0].(*NewsData) 43 | 44 | if newsData.Error == nil { 45 | fmt.Println(newsData.ListUrl) 46 | fmt.Println(newsData.Depth) 47 | for i := range newsData.LinkRes.List { 48 | fmt.Println(data[1], i) 49 | } 50 | } 51 | } 52 | 53 | func TestNews_GetContentNews(t *testing.T) { 54 | ctx := "getContentNews" 55 | n := NewNewsSpider(newUrl, 1, processContent, ctx) 56 | n.GetContentNews() 57 | } 58 | 59 | func processContent(data ...any) { 60 | dd := data[0].(*NewsContent) 61 | fmt.Println(data[1], dd.Title, dd.Lang) 62 | } 63 | 64 | func TestNews_GetNewsWithProxy(t *testing.T) { 65 | transport := &http.Transport{ 66 | TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, 67 | DisableKeepAlives: true, 68 | } 69 | proxyString := "http://username:password@host:port" 70 | proxy, _ := url.Parse(proxyString) 71 | transport.Proxy = http.ProxyURL(proxy) 72 | 73 | req := &HttpReq{ 74 | HttpReq: &fun.HttpReq{ 75 | MaxContentLength: HttpDefaultMaxContentLength, 76 | MaxRedirect: 2, 77 | Transport: transport, 78 | Headers: map[string]string{ 79 | "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36", 80 | }, 81 | }, 82 | ForceTextContentType: true, 83 | } 84 | 85 | ctx := "getNewsWithProxy" 86 | n := NewNewsSpider(overseaUrl, 1, processContent, ctx, WithReq(req)) 87 | n.GetContentNews() 88 | } 89 | -------------------------------------------------------------------------------- /extract/icp.go: -------------------------------------------------------------------------------- 1 | package extract 2 | 3 | import ( 4 | "regexp" 5 | "strings" 6 | 7 | "github.com/PuerkitoBio/goquery" 8 | "github.com/x-funs/go-fun" 9 | ) 10 | 11 | var ( 12 | ProvinceShortMap = map[string]string{ 13 | "京": "北京", 14 | "津": "天津", 15 | "沪": "上海", 16 | "渝": "重庆", 17 | "黑": "黑龙江", 18 | "吉": "吉林", 19 | "辽": "辽宁", 20 | "冀": "河北", 21 | "豫": "河南", 22 | "鲁": "山东", 23 | "晋": "山西", 24 | "陕": "陕西", 25 | "秦": "陕西", 26 | "蒙": "内蒙古", 27 | "宁": "宁夏", 28 | "陇": "甘肃", 29 | "甘": "甘肃", 30 | "新": "新疆", 31 | "青": "青海", 32 | "藏": "西藏", 33 | "鄂": "湖北", 34 | "皖": "安徽", 35 | "苏": "江苏", 36 | "浙": "浙江", 37 | "闽": "福建", 38 | "湘": "湖南", 39 | "赣": "江西", 40 | "川": "四川", 41 | "蜀": "四川", 42 | "黔": "贵州", 43 | "贵": "贵州", 44 | "滇": "云南", 45 | "云": "云南", 46 | "粤": "广东", 47 | "桂": "广西", 48 | "琼": "海南", 49 | "港": "中国香港", 50 | "澳": "中国澳门", 51 | "台": "中国台湾", 52 | } 53 | ) 54 | 55 | const ( 56 | RegexIcp = `(?i)(京|津|冀|晋|蒙|辽|吉|黑|沪|苏|浙|皖|闽|赣|鲁|豫|鄂|湘|粤|桂|琼|川|蜀|贵|黔|云|滇|渝|藏|陇|甘|陕|秦|青|宁|新)ICP(备|证|备案)?[0-9]+` 57 | RegexIcpGa = 
`(?i)(京|津|冀|晋|蒙|辽|吉|黑|沪|苏|浙|皖|闽|赣|鲁|豫|鄂|湘|粤|桂|琼|川|蜀|贵|黔|云|滇|渝|藏|陇|甘|陕|秦|青|宁|新)公网安备[0-9]+` 58 | RegexIcpDx = `(?i)(京|津|冀|晋|蒙|辽|吉|黑|沪|苏|浙|皖|闽|赣|鲁|豫|鄂|湘|粤|桂|琼|川|蜀|贵|黔|云|滇|渝|藏|陇|甘|陕|秦|青|宁|新)B2-[0-9]+` 59 | ) 60 | 61 | var ( 62 | RegexIcpPattern = regexp.MustCompile(RegexIcp) 63 | RegexIcpGaPattern = regexp.MustCompile(RegexIcpGa) 64 | RegexIcpDxPattern = regexp.MustCompile(RegexIcpDx) 65 | ) 66 | 67 | // Icp 返回网站备案相关的信息 68 | func Icp(doc *goquery.Document) (string, string) { 69 | text := doc.Find("body").Text() 70 | 71 | text = fun.RemoveLines(text) 72 | 73 | text = strings.ReplaceAll(text, fun.TAB, "") 74 | text = strings.ReplaceAll(text, fun.SPACE, "") 75 | 76 | return IcpFromText(text) 77 | 78 | } 79 | 80 | // IcpFromText 提取文本中备案相关的信息 81 | func IcpFromText(text string) (string, string) { 82 | var icp, loc string 83 | 84 | // 优先匹配ICP 85 | matches := RegexIcpPattern.FindStringSubmatch(text) 86 | if len(matches) > 1 { 87 | icp = matches[0] 88 | loc = matches[1] 89 | } 90 | 91 | // 匹配公网安备 92 | if icp == "" { 93 | matches = RegexIcpGaPattern.FindStringSubmatch(text) 94 | if len(matches) > 1 { 95 | icp = matches[0] 96 | loc = matches[1] 97 | } 98 | } 99 | 100 | // 匹配电信增值业务 101 | if icp == "" { 102 | matches = RegexIcpDxPattern.FindStringSubmatch(text) 103 | if len(matches) > 1 { 104 | icp = matches[0] 105 | loc = matches[1] 106 | } 107 | } 108 | 109 | return icp, loc 110 | } 111 | -------------------------------------------------------------------------------- /http_test.go: -------------------------------------------------------------------------------- 1 | package spider 2 | 3 | import ( 4 | "bytes" 5 | "testing" 6 | 7 | "github.com/PuerkitoBio/goquery" 8 | "github.com/x-funs/go-fun" 9 | ) 10 | 11 | func TestHttpGetCharsetLang(t *testing.T) { 12 | var urlStrs = []string{ 13 | // "http://suosi.com.cn", 14 | // "https://www.163.com", 15 | // "https://english.news.cn", 16 | // "https://jp.news.cn", 17 | // "https://kr.news.cn", 18 | // "https://www.donga.com/", 19 | // "http://www.koreatimes.com/", 20 | // "https://arabic.news.cn", 21 | // "https://www.bbc.com", 22 | // "http://government.ru", 23 | "https://french.news.cn", 24 | // "https://www.gouvernement.fr", 25 | // "http://live.siammedia.org/", 26 | // "http://hanoimoi.com.vn", 27 | // "https://www.commerce.gov.mm", 28 | // "https://sanmarg.in/", 29 | // "https://www.rrdmyanmar.gov.mm", 30 | // "http://english.eastday.com/", 31 | // "http://jp.eastday.com/", 32 | // "https://mn.cctv.com/", 33 | } 34 | 35 | for _, urlStr := range urlStrs { 36 | 37 | resp, err := HttpGetResp(urlStr, nil, 30000) 38 | 39 | t.Log(urlStr) 40 | t.Log(err) 41 | t.Log(resp.Success) 42 | t.Log(resp.ContentLength) 43 | t.Log(resp.Headers) 44 | t.Log(resp.Charset) 45 | 46 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body)) 47 | doc.Find(DefaultDocRemoveTags).Remove() 48 | 49 | start := fun.Timestamp(true) 50 | lang := Lang(doc, resp.Charset.Charset, true) 51 | t.Log(lang) 52 | 53 | t.Log(fun.Timestamp(true) - start) 54 | } 55 | } 56 | 57 | func TestHttpGetCharsetLangURL(t *testing.T) { 58 | var urlStrs = []string{ 59 | "https://marriott.co.kr", 60 | } 61 | 62 | for _, urlStr := range urlStrs { 63 | 64 | resp, err := HttpGetResp(urlStr, nil, 30000) 65 | 66 | t.Log(urlStr) 67 | t.Log(err) 68 | t.Log(resp.Success) 69 | t.Log(resp.ContentLength) 70 | t.Log(resp.Headers) 71 | t.Log(resp.Charset) 72 | 73 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body)) 74 | doc.Find(DefaultDocRemoveTags).Remove() 75 | 76 | start := fun.Timestamp(true) 77 
| lang := Lang(doc, resp.Charset.Charset, true) 78 | t.Log(lang) 79 | 80 | t.Log(fun.Timestamp(true) - start) 81 | } 82 | } 83 | 84 | func TestHttpGet(t *testing.T) { 85 | var urlStr string 86 | 87 | urlStr = "http://www.niuchaoqun.com" 88 | // urlStr = "http://www.qq.com" 89 | 90 | resp, err := HttpGetResp(urlStr, nil, 10000) 91 | 92 | t.Log(urlStr) 93 | t.Log(err) 94 | t.Log(resp.Success) 95 | t.Log(resp.ContentLength) 96 | t.Log(resp.Headers) 97 | t.Log(resp.Charset) 98 | 99 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body)) 100 | doc.Find(DefaultDocRemoveTags).Remove() 101 | lang := Lang(doc, resp.Charset.Charset, true) 102 | t.Log(lang) 103 | 104 | t.Log(fun.String(resp.Body)) 105 | } 106 | 107 | func TestHttpGetContentType(t *testing.T) { 108 | var urlStr string 109 | 110 | urlStr = "https://mirrors.163.com/mysql/Downloads/MySQL-8.0/libmysqlclient-dev_8.0.27-1debian10_amd64.deb" 111 | 112 | req := &HttpReq{ 113 | ForceTextContentType: true, 114 | } 115 | resp, err := HttpGetResp(urlStr, req, 10000) 116 | 117 | t.Log(urlStr) 118 | t.Log(err) 119 | t.Log(resp.Success) 120 | t.Log(resp.ContentLength) 121 | t.Log(resp.Headers) 122 | t.Log(resp.Charset) 123 | 124 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body)) 125 | doc.Find(DefaultDocRemoveTags).Remove() 126 | lang := Lang(doc, resp.Charset.Charset, true) 127 | t.Log(lang) 128 | 129 | t.Log(fun.String(resp.Body)) 130 | } 131 | 132 | func TestHttpGetContentLength(t *testing.T) { 133 | var urlStr string 134 | 135 | urlStr = "http://suosi.com.cn" 136 | 137 | req := &HttpReq{ 138 | HttpReq: &fun.HttpReq{ 139 | MaxContentLength: 1000, 140 | }, 141 | } 142 | resp, err := HttpGetResp(urlStr, req, 10000) 143 | 144 | t.Log(urlStr) 145 | t.Log(err) 146 | t.Log(resp.Success) 147 | t.Log(resp.ContentLength) 148 | t.Log(resp.Headers) 149 | t.Log(resp.Charset) 150 | 151 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body)) 152 | doc.Find(DefaultDocRemoveTags).Remove() 153 | lang := Lang(doc, resp.Charset.Charset, true) 154 | t.Log(lang) 155 | 156 | t.Log(fun.String(resp.Body)) 157 | } 158 | -------------------------------------------------------------------------------- /extract/meta.go: -------------------------------------------------------------------------------- 1 | package extract 2 | 3 | import ( 4 | "strings" 5 | ) 6 | 7 | var HostGovCountryMap = map[string]string{ 8 | "hk": "中国", 9 | "tw": "中国", 10 | "mo": "中国", 11 | "jp": "日本", 12 | "kr": "韩国", 13 | "in": "印度", 14 | "uk": "英国", 15 | "us": "美国", 16 | "it": "意大利", 17 | "es": "西班牙", 18 | "ru": "俄罗斯", 19 | "de": "德国", 20 | "fr": "法国", 21 | "th": "泰国", 22 | "vn": "越南", 23 | "sg": "新加坡", 24 | "au": "澳大利亚", 25 | "ca": "加拿大", 26 | "il": "以色列", 27 | "mm": "缅甸", 28 | "dz": "阿尔及利亚", 29 | "pl": "波兰", 30 | "az": "南非", 31 | "ng": "尼日利亚", 32 | "kp": "朝鲜", 33 | "lb": "黎巴嫩", 34 | "ua": "乌克兰", 35 | "tr": "土耳其", 36 | "se": "瑞典", 37 | "lk": "斯里兰卡", 38 | "si": "斯洛文尼亚", 39 | "sk": "斯洛伐克", 40 | "ro": "罗马尼亚", 41 | "pt": "葡萄牙", 42 | "ph": "菲律宾", 43 | "pk": "巴基斯坦", 44 | "py": "巴拉圭", 45 | "np": "尼泊尔", 46 | "ma": "摩洛哥", 47 | "my": "马来西亚", 48 | "lt": "立陶宛", 49 | "ie": "爱尔兰", 50 | "iq": "伊拉克", 51 | "ir": "伊朗", 52 | "id": "印度尼西亚", 53 | "hu": "匈牙利", 54 | "gr": "希腊", 55 | "eg": "埃及", 56 | "cz": "捷克", 57 | "hr": "克罗地亚", 58 | "co": "哥伦比亚", 59 | "cl": "智利", 60 | "br": "巴西", 61 | "bg": "保加利亚", 62 | "be": "比利时", 63 | "bd": "孟加拉国", 64 | "aw": "阿鲁巴", 65 | "am": "亚美尼亚", 66 | "ai": "安圭拉", 67 | "ao": "安哥拉", 68 | "al": "阿尔巴尼亚", 69 | "af": "阿富汗", 70 | "sa": "沙特阿拉伯", 71 | "nl": 
"荷兰", 72 | } 73 | 74 | // MetaFromHost 根据域名尽可能返回一些固定信息 75 | func MetaFromHost(host string, lang string) (string, string, string) { 76 | var tld string 77 | var country string 78 | var province string 79 | var category string 80 | 81 | host = strings.ToLower(host) 82 | 83 | if domain, err := DomainParse(host); err == nil { 84 | tld = domain.TLD 85 | } else { 86 | return country, province, category 87 | } 88 | 89 | // 美国政府顶级域名 90 | if tld == "gov" { 91 | country = "美国" 92 | category = "政务" 93 | return country, province, category 94 | } 95 | 96 | // 判断是否是政府域名 97 | for c, zh := range HostGovCountryMap { 98 | gov := "gov." + c 99 | if tld == gov { 100 | country = zh 101 | category = "政务" 102 | 103 | if strings.HasSuffix(host, ".hk") && lang == "zh" { 104 | province = "中国香港" 105 | } 106 | if strings.HasSuffix(host, ".tw") && lang == "zh" { 107 | province = "中国台湾" 108 | } 109 | if strings.HasSuffix(host, ".mo") && lang == "zh" { 110 | province = "中国澳门" 111 | } 112 | return country, province, category 113 | } 114 | } 115 | 116 | if strings.HasSuffix(host, ".hk") && lang == "zh" { 117 | country = "中国" 118 | province = "中国香港" 119 | return country, province, category 120 | } 121 | 122 | if strings.HasSuffix(host, ".tw") && lang == "zh" { 123 | country = "中国" 124 | province = "中国台湾" 125 | return country, province, category 126 | } 127 | 128 | if strings.HasSuffix(host, ".mo") && lang == "zh" { 129 | country = "中国" 130 | province = "中国澳门" 131 | return country, province, category 132 | } 133 | 134 | if strings.HasSuffix(host, ".cn") && lang == "zh" { 135 | country = "中国" 136 | return country, province, category 137 | } 138 | 139 | if strings.HasSuffix(host, ".jp") && lang == "ja" { 140 | country = "日本" 141 | return country, province, category 142 | } 143 | 144 | if strings.HasSuffix(host, ".kr") && lang == "ko" { 145 | country = "韩国" 146 | return country, province, category 147 | } 148 | 149 | if strings.HasSuffix(host, ".uk") && lang == "en" { 150 | country = "英国" 151 | return country, province, category 152 | } 153 | 154 | if strings.HasSuffix(host, ".us") && lang == "en" { 155 | country = "美国" 156 | return country, province, category 157 | } 158 | 159 | if strings.HasSuffix(host, ".in") && lang == "hi" { 160 | country = "印度" 161 | return country, province, category 162 | } 163 | 164 | if strings.HasSuffix(host, ".es") && lang == "es" { 165 | country = "西班牙" 166 | return country, province, category 167 | } 168 | 169 | if strings.HasSuffix(host, ".ru") && lang == "ru" { 170 | country = "俄罗斯" 171 | return country, province, category 172 | } 173 | 174 | if strings.HasSuffix(host, ".de") && lang == "de" { 175 | country = "德国" 176 | return country, province, category 177 | } 178 | 179 | if strings.HasSuffix(host, ".fr") && lang == "fr" { 180 | country = "法国" 181 | return country, province, category 182 | } 183 | 184 | return country, province, category 185 | } 186 | -------------------------------------------------------------------------------- /http.go: -------------------------------------------------------------------------------- 1 | package spider 2 | 3 | import ( 4 | "crypto/tls" 5 | "errors" 6 | "net" 7 | "net/http" 8 | "time" 9 | 10 | "github.com/x-funs/go-fun" 11 | ) 12 | 13 | const ( 14 | HttpDefaultTimeOut = 10000 15 | HttpDefaultMaxContentLength = 10 * 1024 * 1024 16 | HttpDefaultUserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36" 17 | HttpDefaultAcceptEncoding = "gzip, deflate" 18 | ) 19 | 20 | var ( 21 | textContentTypes = 
[]string{ 22 | "text/plain", 23 | "text/html", 24 | "text/xml", 25 | "application/xml", 26 | "application/xhtml+xml", 27 | "application/json", 28 | } 29 | ) 30 | 31 | type HttpReq struct { 32 | // 嵌入 fun.HttpReq 33 | *fun.HttpReq 34 | 35 | // 禁止自动探测字符集和转换字符集 36 | DisableCharset bool 37 | 38 | // 强制 ContentType 为文本类型 39 | ForceTextContentType bool 40 | } 41 | 42 | type HttpResp struct { 43 | *fun.HttpResp 44 | 45 | // 字符集 46 | Charset CharsetRes 47 | } 48 | 49 | // HttpDefaultTransport 默认全局使用的 http.Transport 50 | var HttpDefaultTransport = &http.Transport{ 51 | DialContext: (&net.Dialer{Timeout: time.Second}).DialContext, 52 | DisableKeepAlives: true, 53 | IdleConnTimeout: 60 * time.Second, 54 | TLSHandshakeTimeout: 10 * time.Second, 55 | ExpectContinueTimeout: 1 * time.Second, 56 | TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, 57 | } 58 | 59 | // HttpGet 参数为请求地址 (HttpReq, 超时时间) 60 | // HttpGet(url)、HttpGet(url, HttpReq)、HttpGet(url, timeout)、HttpGet(url, HttpReq, timeout) 61 | // 返回 body, 错误信息 62 | func HttpGet(urlStr string, args ...any) ([]byte, error) { 63 | l := len(args) 64 | 65 | switch l { 66 | case 0: 67 | return HttpGetDo(urlStr, nil, 0) 68 | case 1: 69 | switch v := args[0].(type) { 70 | case int: 71 | timeout := fun.ToInt(args[0]) 72 | return HttpGetDo(urlStr, nil, timeout) 73 | case *HttpReq: 74 | return HttpGetDo(urlStr, v, 0) 75 | 76 | } 77 | case 2: 78 | timeout := fun.ToInt(args[1]) 79 | switch v := args[0].(type) { 80 | case *HttpReq: 81 | return HttpGetDo(urlStr, v, timeout) 82 | } 83 | 84 | } 85 | 86 | return nil, errors.New("http get params error") 87 | } 88 | 89 | // HttpGetDo Http Get 请求, 参数为请求地址, HttpReq, 超时时间(毫秒) 90 | // 返回 body, 错误信息 91 | func HttpGetDo(urlStr string, r *HttpReq, timeout int) ([]byte, error) { 92 | resp, err := HttpGetResp(urlStr, r, timeout) 93 | if err != nil { 94 | return nil, err 95 | } else { 96 | return resp.Body, nil 97 | } 98 | } 99 | 100 | // HttpGetResp Http Get 请求, 参数为请求地址, HttpReq, 超时时间(毫秒) 101 | // 返回 HttpResp, 错误信息 102 | func HttpGetResp(urlStr string, r *HttpReq, timeout int) (*HttpResp, error) { 103 | req, err := http.NewRequest(http.MethodGet, urlStr, nil) 104 | if err != nil { 105 | return nil, err 106 | } 107 | 108 | return HttpDoResp(req, r, timeout) 109 | } 110 | 111 | // HttpDo Http 请求, 参数为 http.Request, HttpReq, 超时时间(毫秒) 112 | // 返回 body, 错误信息 113 | func HttpDo(req *http.Request, r *HttpReq, timeout int) ([]byte, error) { 114 | resp, err := HttpDoResp(req, r, timeout) 115 | if err != nil { 116 | return nil, err 117 | } else { 118 | return resp.Body, nil 119 | } 120 | } 121 | 122 | // HttpDoResp Http 请求, 参数为 http.Request, HttpReq, 超时时间(毫秒) 123 | // 返回 HttpResp, 错误信息 124 | func HttpDoResp(req *http.Request, r *HttpReq, timeout int) (*HttpResp, error) { 125 | // 处理 Transport 126 | if r == nil { 127 | r = &HttpReq{ 128 | HttpReq: &fun.HttpReq{ 129 | Transport: HttpDefaultTransport, 130 | }, 131 | } 132 | } else if r.HttpReq == nil { 133 | r.HttpReq = &fun.HttpReq{ 134 | Transport: HttpDefaultTransport, 135 | } 136 | } else if r.Transport == nil { 137 | r.Transport = HttpDefaultTransport 138 | } 139 | 140 | // 强制文本类型 141 | if r != nil && r.ForceTextContentType { 142 | r.AllowedContentTypes = textContentTypes 143 | } 144 | 145 | // HttpResp 146 | var charset CharsetRes 147 | httpResp := &HttpResp{ 148 | Charset: charset, 149 | } 150 | 151 | resp, err := fun.HttpDoResp(req, r.HttpReq, timeout) 152 | httpResp.HttpResp = resp 153 | if err != nil { 154 | return httpResp, err 155 | } 156 | 157 | // 默认会自动进行探测编码和转码, 除非手动禁用 158 | 
if r == nil || !r.DisableCharset { 159 | charsetRes := Charset(httpResp.Body, httpResp.Headers) 160 | httpResp.Charset = charsetRes 161 | 162 | if charsetRes.Charset != "" && charsetRes.Charset != "UTF-8" { 163 | utf8Body, e := fun.ToUtf8(httpResp.Body, charsetRes.Charset) 164 | if e != nil { 165 | return httpResp, errors.New("ErrorCharset") 166 | } else { 167 | httpResp.Body = utf8Body 168 | } 169 | } 170 | } 171 | 172 | return httpResp, nil 173 | } 174 | -------------------------------------------------------------------------------- /lang_test.go: -------------------------------------------------------------------------------- 1 | package spider 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "regexp" 7 | "testing" 8 | 9 | "github.com/PuerkitoBio/goquery" 10 | "github.com/suosi-inc/lingua-go" 11 | "github.com/x-funs/go-fun" 12 | ) 13 | 14 | func TestLinguaText(t *testing.T) { 15 | text := "BEIJING, 10 août (Xinhua) -- Un porte-parole du Bureau du Travail du Comité central du Parti communiste chinois pour les affaires de Taiwan a fait mercredi des remarques sur un livre blanc nouvellement publié intitulé \"La question de Taiwan et la réunification de la Chine dans la nouvelle ère\"." 16 | 17 | start := fun.Timestamp(true) 18 | languages := []lingua.Language{ 19 | lingua.French, 20 | lingua.Spanish, 21 | lingua.Portuguese, 22 | lingua.German, 23 | } 24 | detector := lingua.NewLanguageDetectorBuilder(). 25 | FromLanguages(languages...). 26 | Build() 27 | 28 | if language, exists := detector.DetectLanguageOf(text); exists { 29 | t.Log(text) 30 | t.Log(language.IsoCode639_1()) 31 | fmt.Println(fun.Timestamp(true) - start) 32 | } 33 | } 34 | 35 | func BenchmarkLinguaTest(b *testing.B) { 36 | 37 | text := "BEIJING" 38 | 39 | languages := []lingua.Language{ 40 | lingua.French, 41 | lingua.Spanish, 42 | lingua.Portuguese, 43 | lingua.German, 44 | lingua.English, 45 | } 46 | detector := lingua.NewLanguageDetectorBuilder(). 47 | FromLanguages(languages...). 
48 | Build() 49 | 50 | b.ResetTimer() 51 | 52 | for i := 0; i < b.N; i++ { 53 | _, _ = detector.DetectLanguageOf(text) 54 | } 55 | } 56 | 57 | func TestLang(t *testing.T) { 58 | 59 | var urlStrs = []string{ 60 | 61 | "https://www.bbc.com", 62 | "https://www.ft.com/", 63 | 64 | "https://www.163.com/news/article/HEJGEVFT000189FH.html", 65 | "https://www.163.com", 66 | 67 | "https://english.news.cn", 68 | "https://jp.news.cn", 69 | "https://kr.news.cn", 70 | "https://german.news.cn/", 71 | "https://portuguese.news.cn/", 72 | "https://arabic.news.cn", 73 | "https://french.news.cn", 74 | 75 | "https://mn.cctv.com/", 76 | 77 | "http://government.ru", 78 | 79 | "https://www.gouvernement.fr", 80 | 81 | "http://live.siammedia.org/", 82 | "https://www.manchestereveningnews.co.uk/", 83 | 84 | "https://www.chinadaily.com.cn", 85 | "http://cn.chinadaily.com.cn/", 86 | "http://www.chinadaily.com.cn/chinawatch_fr/index.html", 87 | "https://d1ev.com/", 88 | "https://www.cngold.com.cn/", 89 | "https://china.guidechem.com/", 90 | "https://xdkb.net/", 91 | "https://www.lifeweek.com.cn/", 92 | "http://gxbsrd.gov.cn/", 93 | "https://defence24.com/", 94 | "http://www.gmp.or.kr/", 95 | "http://rdfmj.com/", 96 | "https://news.xmnn.cn/xmnn/2022/08/09/101067908.shtml", 97 | } 98 | 99 | for _, urlStr := range urlStrs { 100 | resp, _ := HttpGetResp(urlStr, nil, 10000) 101 | 102 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body)) 103 | 104 | doc.Find(DefaultDocRemoveTags).Remove() 105 | 106 | // 语言 107 | start := fun.Timestamp(true) 108 | langRes := Lang(doc, resp.Charset.Charset, true) 109 | 110 | t.Log(urlStr) 111 | t.Log(resp.Charset) 112 | t.Log(langRes) 113 | t.Log(fun.Timestamp(true) - start) 114 | } 115 | 116 | } 117 | 118 | func TestLangText(t *testing.T) { 119 | start := fun.Timestamp(true) 120 | text := "中文" 121 | t.Log(fun.Timestamp(true) - start) 122 | t.Log(LangText(text)) 123 | } 124 | 125 | func TestUnicode(t *testing.T) { 126 | text := "BEIJING, 9. August 2022 (Xinhuanet) -- In einem am Dienstag veröffentlichten Bericht über die Menschenrechtsverletzungen der USA wird darauf hingewiesen, dass die Vereinigten Staaten einen \"Konflikt der Zivilisationen\" geschaffen, Haft und Folter missbraucht sowie die Religionsfreiheit und Menschenwürde verletzt hätten.\n\nDer Bericht mit dem Titel ''Die USA begehen schwerwiegende Verbrechen der Menschenrechtsverletzungen im Nahen Osten und darüber hinaus'' wurde von der Chinesischen Gesellschaft für Menschenrechtsstudien veröffentlicht.\n\nIn dem Bericht heißt es, dass die Vereinigten Staaten keinen Respekt vor der Diversität der Zivilisationen zeigten, der islamischen Zivilisation feindlich gegenüberständen, das historische und kulturelle Erbe des Nahen Ostens zerstörten, Muslime rücksichtslos inhaftierten und folterten und die grundlegenden Menschenrechte der Bevölkerung im Nahen Osten und in anderen Gebieten schwer verletzten.\n\n\"Die Vereinigten Staaten haben die 'islamische Bedrohungstheorie' in der ganzen Welt verbreitet. Sie haben die Überlegenheit der westlichen und christlichen Zivilisation befürwortet, die nicht-westliche Zivilisation verachtet und die islamische Zivilisation stigmatisiert, indem sie sie als 'rückständig', 'terroristisch' und 'gewalttätig' bezeichneten\", heißt es in dem Bericht." 
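// 补充说明: 下面正则中的字符区间 [\u0080-\u00ff] 对应 Unicode Latin-1 补充区段,
// 用于粗略匹配德语等西欧语言中带变音符号的拉丁字母 (如 ä、ö、ü、ß)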
127 | // latinRex := regexp.MustCompile(`\p{Lo}`) 128 | latinRex := regexp.MustCompile("[\u0080-\u00ff]") 129 | latin := latinRex.FindAllString(text, -1) 130 | 131 | t.Log(latin) 132 | } 133 | -------------------------------------------------------------------------------- /charset.go: -------------------------------------------------------------------------------- 1 | package spider 2 | 3 | import ( 4 | "net/http" 5 | "regexp" 6 | "strings" 7 | "unicode/utf8" 8 | 9 | "github.com/suosi-inc/chardet" 10 | "github.com/x-funs/go-fun" 11 | ) 12 | 13 | const ( 14 | CharsetPosHeader = "header" 15 | CharsetPosHtml = "html" 16 | CharsetPosGuess = "guess" 17 | CharsetPosValid = "valid" 18 | ) 19 | 20 | const ( 21 | RegexCharset = "(?i)charset=\\s*([a-z][_\\-0-9a-z]*)" 22 | RegexCharsetHtml4 = "(?i)]*http-equiv=(\"|')?content-type(\"|')?[^>]*)>" 23 | RegexCharsetHtml5 = "(?i)]*>" 24 | ) 25 | 26 | var ( 27 | regexCharsetPattern = regexp.MustCompile(RegexCharset) 28 | regexCharsetHtml4Pattern = regexp.MustCompile(RegexCharsetHtml4) 29 | regexCharsetHtml5Pattern = regexp.MustCompile(RegexCharsetHtml5) 30 | ) 31 | 32 | type CharsetRes struct { 33 | Charset string 34 | CharsetPos string 35 | } 36 | 37 | // Charset 解析 HTTP body、http.Header 中的编码和语言, 如果未解析成功则尝试进行猜测 38 | func Charset(body []byte, headers *http.Header) CharsetRes { 39 | var charsetRes CharsetRes 40 | var guessCharset string 41 | 42 | // 优先检测是否是有效的 UTF-8 43 | valid := utf8.Valid(body) 44 | if valid { 45 | charsetRes.Charset = "UTF-8" 46 | charsetRes.CharsetPos = CharsetPosValid 47 | return charsetRes 48 | } 49 | 50 | // 根据 Content-Type、Body Html 标签探测编码 51 | charsetRes = CharsetFromHeaderHtml(body, headers) 52 | 53 | // 未识别到 charset 则使用 guess 54 | if charsetRes.Charset == "" { 55 | guessCharset = CharsetGuess(body) 56 | 57 | if guessCharset != "" { 58 | charsetRes.Charset = guessCharset 59 | charsetRes.CharsetPos = CharsetPosGuess 60 | } 61 | } 62 | 63 | return charsetRes 64 | } 65 | 66 | // CharsetFromHeaderHtml 解析 HTTP body、http.Header 中的 charset, 准确性高 67 | func CharsetFromHeaderHtml(body []byte, headers *http.Header) CharsetRes { 68 | var res CharsetRes 69 | 70 | cHeader := CharsetFromHeader(headers) 71 | 72 | cHtml := CharsetFromHtml(body) 73 | 74 | // 只有 Header 则使用 Header 75 | if cHeader != "" && cHtml == "" { 76 | res.Charset = cHeader 77 | res.CharsetPos = CharsetPosHeader 78 | return res 79 | } 80 | 81 | // 只有 Html 则使用 Html 82 | if cHeader == "" && cHtml != "" { 83 | res.Charset = cHtml 84 | res.CharsetPos = CharsetPosHtml 85 | return res 86 | } 87 | 88 | // 同时有 Header 和 Html, 根据情况使用 Header 或 Html 89 | if cHeader != "" && cHtml != "" { 90 | if cHeader == cHtml { 91 | res.Charset = cHeader 92 | res.CharsetPos = CharsetPosHeader 93 | return res 94 | } 95 | 96 | // Header 和 Html 不一致, 以下情况以 Html 为准 97 | if strings.HasPrefix(cHeader, "ISO") || strings.HasPrefix(cHeader, "WINDOWS") { 98 | res.Charset = cHtml 99 | res.CharsetPos = CharsetPosHtml 100 | return res 101 | } 102 | 103 | res.Charset = cHeader 104 | res.CharsetPos = CharsetPosHeader 105 | return res 106 | } 107 | 108 | return res 109 | } 110 | 111 | // CharsetFromHeader 解析 HTTP header 中的 charset 112 | func CharsetFromHeader(headers *http.Header) string { 113 | var charset string 114 | if headers != nil { 115 | contentType := headers.Get("Content-Type") 116 | if !fun.Blank(contentType) { 117 | matches := regexCharsetPattern.FindStringSubmatch(contentType) 118 | if len(matches) > 1 { 119 | charset = matches[1] 120 | } 121 | } 122 | } 123 | 124 | return convertCharset(charset) 125 | } 
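// 行为示例: 当响应头 Content-Type 为 "text/html; charset=gb2312" 时,
// 上面的 CharsetFromHeader 提取出 gb2312, 经 convertCharset 归一化后返回 "GBK"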
126 | 127 | // CharsetFromHtml 解析 Html 中的 charset 128 | func CharsetFromHtml(body []byte) string { 129 | var charset string 130 | 131 | if len(body) >= 0 { 132 | // 先检测 HTML 标签 133 | html := fun.String(body) 134 | 135 | // 匹配 HTML4 标签 136 | var charset4 string 137 | matches := regexCharsetHtml4Pattern.FindStringSubmatch(html) 138 | if len(matches) > 1 { 139 | matches = regexCharsetPattern.FindStringSubmatch(matches[1]) 140 | if len(matches) > 1 { 141 | charset4 = matches[1] 142 | } 143 | } 144 | 145 | // 匹配 HTML5 标签 146 | var charset5 string 147 | matches = regexCharsetHtml5Pattern.FindStringSubmatch(html) 148 | if len(matches) > 1 { 149 | charset5 = matches[1] 150 | } 151 | 152 | // 只有其中一个 153 | if charset4 != "" && charset5 == "" { 154 | charset = charset4 155 | } 156 | 157 | if charset4 == "" && charset5 != "" { 158 | charset = charset5 159 | } 160 | 161 | if charset4 != "" && charset5 != "" { 162 | // 竟然两个都有, 以最先出现的为准 163 | if charset4 == charset5 { 164 | charset = charset5 165 | } else { 166 | charset4Index := strings.Index(html, charset4) 167 | charset5Index := strings.Index(html, charset5) 168 | 169 | if charset4Index < charset5Index { 170 | charset = charset4 171 | } else { 172 | charset = charset5 173 | } 174 | } 175 | 176 | } 177 | } 178 | 179 | return convertCharset(charset) 180 | } 181 | 182 | // CharsetGuess 根据 HTTP body 猜测编码 183 | func CharsetGuess(body []byte) string { 184 | var guessCharset string 185 | 186 | detector := chardet.NewHtmlDetector() 187 | guess, err := detector.DetectBest(body) 188 | if err == nil { 189 | guessCharset = strings.ToUpper(guess.Charset) 190 | } 191 | 192 | return guessCharset 193 | } 194 | 195 | // convertCharset 格式化 charset 196 | func convertCharset(charset string) string { 197 | c := strings.ToUpper(strings.TrimSpace(charset)) 198 | 199 | if c != "" { 200 | // alias utf8 201 | if c == "UTF8" || c == "UTF_8" { 202 | return "UTF-8" 203 | } 204 | 205 | // alias gb2312, gb18030 206 | if strings.HasPrefix(c, "GB") { 207 | return "GBK" 208 | } 209 | 210 | // alias big5-hkscs.. 
211 | if strings.HasPrefix(c, "BIG5") { 212 | return "Big5" 213 | } 214 | 215 | // alias shift-jis 216 | if strings.HasPrefix(c, "SHIFT") { 217 | return "SHIFT_JIS" 218 | } 219 | } 220 | 221 | return c 222 | } 223 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAcwmWM= 2 | github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ= 3 | github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= 4 | github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= 5 | github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= 6 | github.com/aymerick/douceur v0.2.0 h1:Mv+mAeH1Q+n9Fr+oyamOlAkUNPWPlA8PPGR0QAaYuPk= 7 | github.com/aymerick/douceur v0.2.0/go.mod h1:wlT5vV2O3h55X9m7iVYN0TBM0NH/MmbLnd30/FjWUq4= 8 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 9 | github.com/gorilla/css v1.0.1 h1:ntNaBIghp6JmvWnxbZKANoLyuXTPZ4cAMlo6RyhlbO8= 10 | github.com/gorilla/css v1.0.1/go.mod h1:BvnYkspnSzMmwRK+b8/xgNPLiIuNZr6vbZBTPQ2A3b0= 11 | github.com/microcosm-cc/bluemonday v1.0.26 h1:xbqSvqzQMeEHCqMi64VAs4d8uy6Mequs3rQ0k/Khz58= 12 | github.com/microcosm-cc/bluemonday v1.0.26/go.mod h1:JyzOCs9gkyQyjs+6h10UEVSe02CGwkhd72Xdqh78TWs= 13 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 14 | github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca h1:NugYot0LIVPxTvN8n+Kvkn6TrbMyxQiuvKdEwFdR9vI= 15 | github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU= 16 | github.com/stretchr/objx v0.4.0 h1:M2gUjqZET1qApGOWNSnZ49BAIMX4F/1plDv3+l31EJ4= 17 | github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk= 18 | github.com/suosi-inc/chardet v0.1.0 h1:AmAXYaZKPAXCpwthMeQG/ABwYreonxjP/BCbhOa7jfw= 19 | github.com/suosi-inc/chardet v0.1.0/go.mod h1:dhKdJO4yQeuLYMyu1QFjoNITgMJ/zyLhs4zwIUnQTKI= 20 | github.com/suosi-inc/lingua-go v1.0.51 h1:+IhIKGPwLWVTxayQSEnMdTaSCUs2GWS0qVwafGSR0wQ= 21 | github.com/suosi-inc/lingua-go v1.0.51/go.mod h1:XDS0K21fYH99TkkUs71HxmJH03SEhPoc+RPi531aaX0= 22 | github.com/x-funs/go-fun v0.94.0 h1:claEwnVz4ybQYcdHLjm6DeDuVRntavqjOHh5dcHJG2g= 23 | github.com/x-funs/go-fun v0.94.0/go.mod h1:fYbm5aJU4EbzJkUQlodJUphsmjWgJ70iGvZNMakMSw4= 24 | github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= 25 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 26 | golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= 27 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= 28 | golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= 29 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 30 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= 31 | golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= 32 | golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= 33 | golang.org/x/net 
v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= 34 | golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= 35 | golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= 36 | golang.org/x/net v0.19.0 h1:zTwKpTd2XuCqf8huc7Fo2iSy+4RHPd10s4KzeTnVr1c= 37 | golang.org/x/net v0.19.0/go.mod h1:CfAk/cbD4CthTvqiEl8NpboMuiuOYsAr/7NOjZJtv1U= 38 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 39 | golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 40 | golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 41 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 42 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 43 | golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 44 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 45 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 46 | golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 47 | golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 48 | golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 49 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= 50 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= 51 | golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= 52 | golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= 53 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 54 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 55 | golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 56 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= 57 | golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= 58 | golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= 59 | golang.org/x/text v0.17.0 h1:XtiM5bkSOt+ewxlOE/aE/AKEHibwj/6gvWMl9Rsh0Qc= 60 | golang.org/x/text v0.17.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= 61 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 62 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 63 | golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= 64 | golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= 65 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 66 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 67 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ``` 2 | __ _ __ 3 | ____ _____ ____ / /______ _ _________ (_)___/ /__ _____ 4 | / __ `/ __ \______/ __ \/ //_/ __ `/_____/ ___/ __ \/ / __ / _ \/ ___/ 5 | / /_/ / 
/_/ /_____/ /_/ / ,< / /_/ /_____(__ ) /_/ / / /_/ / __/ / 6 | \__, /\____/ / .___/_/|_|\__, / /____/ .___/_/\__,_/\___/_/ 7 | /____/ /_/ /____/ /_/ 8 | 9 | ``` 10 | 11 | 一个 Golang 实现的相对智能、无需规则维护的通用新闻网站数据提取工具库。含域名探测、网页编码语种识别、网页链接分类提取、网页新闻要素抽取以及新闻正文抽取等组件。 12 | 13 | # 预览 14 | 15 | 前往 [go-pkg-spider-gui Releases](https://github.com/suosi-inc/go-pkg-spider-gui/releases) 下载支持 Windows、MacOS GUI 客户端,进行体验。 16 | 17 |
(GUI 客户端预览截图)
18 | 19 |

20 | 21 | # 使用 22 | 23 | ```shell 24 | go get -u github.com/suosi-inc/go-pkg-spider 25 | ``` 26 | 27 | # 介绍 28 | 29 | ## Http 客户端 30 | 31 | Http 客户端对 go-fun 中的 `fun.HttpGet` 相关函数进行了一些扩展,增加了以下功能: 32 | 33 | * 自动识别字符集和转换字符集,统一转换为 UTF-8 34 | * 响应文本类型限制 35 | 36 | - **`HttpGet(urlStr string, args ...any) ([]byte, error)`** Http Get 请求 37 | - **`HttpGetResp(urlStr string, r *HttpReq, timeout int) (*HttpResp, error)`** Http Get 请求, 返回 HttpResp 38 | 39 | ## 网页语种自动识别 40 | 41 | 当前支持以下主流语种:**中文、英语、日语、韩语、俄语、阿拉伯语、印地语、德语、法语、西班牙语、葡萄牙语、意大利语、泰语、越南语、缅甸语**。 42 | 43 | 语种识别通过 HTML 、文本特征、字符集统计规则优先识别中文、英语、日语、韩语。 44 | 45 | 同时辅助集成了 [lingua-go](https://github.com/pemistahl/lingua-go) n-gram model 语言识别模型,fork 并移除了很多语种和语料(因为完整包很大) 46 | 47 | - **`LangText(text string) (string, string)`** 识别纯文本语种 48 | - **`Lang(doc *goquery.Document, charset string, listMode bool) LangRes `** 识别 HTML 语种 49 | 50 | ### 示例 51 | 52 | 识别纯文本语种: 53 | 54 | ```go 55 | // 识别纯文本语种 56 | lang, langPos := spider.LangText(text) 57 | ``` 58 | 59 | 识别 HTML 语种: 60 | 61 | ```go 62 | // Http 请求获取响应 63 | resp, err := spider.HttpGetResp(urlStr, req, timeout) 64 | 65 | // 转换 goquery.*Document 66 | doc, docErr := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body)) 67 | 68 | // 根据字符集、页面类型返回 69 | langRes := spider.Lang(doc, resp.Charset.Charset, false) 70 | ``` 71 | 72 | ## 域名自动探测 73 | 74 | - **`DetectDomain(domain string, timeout int, retry int) (*DomainRes, error)`** 探测主域名基本信息 75 | - **`func DetectSubDomain(domain string, timeout int, retry int) (*DomainRes, error)`** 探测子域名基本信息 76 | 77 | 根据网站域名,尽可能的探测一些基本信息,基本信息包括: 78 | 79 | ```go 80 | type DomainRes struct { 81 | // 域名 82 | Domain string 83 | // 主页域名 84 | HomeDomain string 85 | // 协议 86 | Scheme string 87 | // 字符集 88 | Charset CharsetRes 89 | // 语种 90 | Lang LangRes 91 | // 国家 92 | Country string 93 | // 省份 94 | Province string 95 | // 分类 96 | Category string 97 | // 标题 98 | Title string 99 | // 描述 100 | Description string 101 | // ICP 102 | Icp string 103 | // 状态 104 | State bool 105 | // 状态码 106 | StatusCode int 107 | // 内容页链接数量 108 | ContentCount int 109 | // 列表页链接数量 110 | ListCount int 111 | // 子域名列表 112 | SubDomains map[string]bool 113 | } 114 | ``` 115 | 116 | ## 网页链接分类提取 117 | 118 | 根据页面内容,自动分析识别并提取页面上的内容页、列表页以及其他链接,支持传入自定义规则干扰最终结果 119 | 120 | 分类依据通过链接标题、URL特征、以及统计归纳的方式 121 | 122 | - **`GetLinkData(urlStr string, strictDomain bool, timeout int, retry int) (*LinkData, error)`** 获取页面链接分类数据 123 | 124 | ### 链接分类提取结果定义 125 | 126 | ```go 127 | type LinkData struct { 128 | LinkRes *extract.LinkRes 129 | // 过滤 130 | Filters map[string]string 131 | // 子域名 132 | SubDomains map[string]bool 133 | } 134 | 135 | type LinkRes struct { 136 | // 内容页 137 | Content map[string]string 138 | // 列表页 139 | List map[string]string 140 | // 未知链接 141 | Unknown map[string]string 142 | // 过滤链接 143 | None map[string]string 144 | } 145 | ``` 146 | 147 | ## 网页新闻提取 148 | 149 | 新闻最重要的三要素:标题、发布时间、正文。其中发布时间对精准度要求高,标题和正文更追求完整性。 150 | 151 | 体验下来,业内最强大的是: [diffbot](https://www.diffbot.com/) 公司,猜测它可能是基于网页视觉+深度学习来实现。 152 | 153 | 有不少新闻正文提取或新闻正文抽取的开源的方案,大都是基于规则或统计方法实现。如: 154 | 155 | * Python: [GeneralNewsExtractor](https://github.com/GeneralNewsExtractor/GeneralNewsExtractor) 156 | * Java: [WebCollector/ContentExtractor](https://github.com/CrawlScript/WebCollector) 157 | 158 | 更古老的还有:[python-goose](https://github.com/grangier/python-goose), [newspaper](https://github.com/codelucas/newspaper),甚至 Readability、Html2Article 等等。 159 | 160 | 其中:`WebCollector/ContentExtractor` 是 [基于标签路径特征融合新闻内容抽取的 CEPF 
算法](http://www.jos.org.cn/jos/article/abstract/4868) 的 Java 实现版本。 161 | 162 | go-pkg-spider 实现了 CEPF 算法的 Golang 版本,在此基础上做了大量优化,内置了一些通用规则,更精细地控制了标题和发布时间的提取与转换,并支持多语种新闻网站的要素提取。 163 | 164 | 165 | ### 新闻要素提取结果定义 166 | 167 | ```go 168 | type News struct { 169 | // 标题 170 | Title string 171 | // 标题提取依据 172 | TitlePos string 173 | // 发布时间 174 | TimeLocal string 175 | // 原始时间 176 | Time string 177 | // 发布时间提取依据 178 | TimePos string 179 | // 正文纯文本 180 | Content string 181 | // 正文 Node 节点 182 | ContentNode *html.Node 183 | // 提取用时(毫秒) 184 | Spend int64 185 | // 语种 186 | Lang string 187 | } 188 | ``` 189 | 190 | 可根据 `ContentNode *html.Node` 来重新定义需要清洗保留的标签,渲染方式可参考下文示例。 191 | 192 | ### 效果 193 | 194 |
(新闻提取效果截图)
195 | 196 |
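如需基于提取结果中的 `ContentNode` 自行渲染正文 HTML,可参考下面的最小示意(假设 `news` 为 `ExtractNews()` 的返回值,变量名仅为示例):

```go
// 需要引入 "bytes" 与 "golang.org/x/net/html"
var buf bytes.Buffer
if news.ContentNode != nil {
	// html.Render 将正文节点树重新序列化为 HTML
	_ = html.Render(&buf, news.ContentNode)
}
contentHtml := buf.String()
```

也可以先按需清洗 `ContentNode` 中的标签,再渲染输出。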

197 | 198 | ### 示例 199 | 200 | ```go 201 | // Http 请求获取响应 202 | resp, err := spider.HttpGetResp(urlStr, req, timeout) 203 | 204 | // 转换 goquery.*Document 205 | doc, docErr := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body)) 206 | 207 | // 基本清理 208 | doc.Find(spider.DefaultDocRemoveTags).Remove() 209 | 210 | // 语种 211 | langRes := Lang(doc, resp.Charset.Charset, false) 212 | 213 | // 新闻提取 214 | content := extract.NewContent(contentDoc, langRes.Lang, listTitle, urlStr) 215 | 216 | // 新闻提取结果 217 | news := content.ExtractNews() 218 | ``` 219 | 220 | 可以通过下面的已经封装好的方法完成以上步骤: 221 | 222 | - **`GetNews(urlStr string, title string, timeout int, retry int) (*extract.News, *HttpResp, error)`** 获取链接新闻数据 223 | 224 | # 免责声明 225 | 226 | 本项目是一个数据提取工具库,不是爬虫框架或采集软件,只限于技术交流,源码中请求目标网站的相关代码仅为功能测试需要。 227 | 228 | 请在符合法律法规和相关规定的情况下使用本项目,禁止使用本项目进行任何非法、侵权或者违反公序良俗的行为。 229 | 230 | 使用本项目造成的直接或间接的风险由用户自行承担。 231 | -------------------------------------------------------------------------------- /detect_test.go: -------------------------------------------------------------------------------- 1 | package spider 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "testing" 7 | 8 | "github.com/PuerkitoBio/goquery" 9 | "github.com/suosi-inc/go-pkg-spider/extract" 10 | "github.com/x-funs/go-fun" 11 | ) 12 | 13 | func TestDomainDetect(t *testing.T) { 14 | domains := []string{ 15 | // "china-nengyuan.com", 16 | // "suosi.com.cn", 17 | // "wanjiaxian.com", 18 | "thediplomat.com", 19 | } 20 | 21 | for _, domain := range domains { 22 | domainRes, err := DetectDomain(domain, 10000, 1) 23 | if err == nil { 24 | t.Log(domainRes.Title) 25 | t.Log(domainRes.TitleClean) 26 | t.Log(domainRes) 27 | } else { 28 | t.Log(err) 29 | t.Log(domainRes) 30 | } 31 | } 32 | } 33 | 34 | func BenchmarkLinkTitles(b *testing.B) { 35 | urlStr := "http://www.qq.com/" 36 | 37 | resp, _ := HttpGetResp(urlStr, nil, 30000) 38 | 39 | // 解析 HTML 40 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body)) 41 | doc.Find(DefaultDocRemoveTags).Remove() 42 | 43 | // 语言 44 | 45 | langRes := Lang(doc, resp.Charset.Charset, true) 46 | 47 | fmt.Println(langRes) 48 | 49 | var linkTitles map[string]string 50 | 51 | b.ResetTimer() 52 | 53 | for i := 0; i < b.N; i++ { 54 | // 标题 55 | linkTitles, _ = extract.WebLinkTitles(doc, resp.RequestURL, true) 56 | 57 | // 连接和子域名 58 | _, _ = extract.LinkTypes(linkTitles, langRes.Lang, nil) 59 | 60 | // rules := map[string][]string{ 61 | // "163.com": []string{ 62 | // "`\\w{16}\\.html`", 63 | // }, 64 | // } 65 | // _, _ = extract.LinkTypes(linkTitles, langRes.Lang, rules) 66 | } 67 | 68 | b.StopTimer() 69 | 70 | fmt.Println(langRes.Lang) 71 | fmt.Println(len(linkTitles)) 72 | 73 | } 74 | 75 | func TestLinkTitles(t *testing.T) { 76 | var urlStrs = []string{ 77 | "https://www.1905.com", 78 | // "https://www.people.com.cn", 79 | // "https://www.36kr.com", 80 | // "https://www.163.com", 81 | // "https://news.163.com/", 82 | // "http://jyj.suqian.gov.cn", 83 | // "https://www.huxiu.com/", 84 | // "http://www.news.cn/politicspro/", 85 | // "http://www.cankaoxiaoxi.com", 86 | // "http://www.bbc.com", 87 | // "https://www.ft.com", 88 | // "https://www.reuters.com/", 89 | // "https://nypost.com/", 90 | // "http://www.mengcheng.gov.cn/", 91 | // "https://www.chunichi.co.jp", 92 | // "https://www.donga.com/", 93 | // "https://people.com/", 94 | // "https://czql.gov.cn/", 95 | // "https://qiye.163.com/", 96 | // "https://www.washingtontimes.com/", 97 | // "https://www.gamersky.com/", 98 | // "https://www.cdns.com.tw/", 99 | // 
"http://www.163.com/", 100 | } 101 | 102 | for _, urlStr := range urlStrs { 103 | 104 | resp, err := HttpGetResp(urlStr, nil, 30000) 105 | 106 | t.Log(urlStr) 107 | t.Log(err) 108 | 109 | // 解析 HTML 110 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body)) 111 | doc.Find(DefaultDocRemoveTags).Remove() 112 | 113 | // 语言 114 | langRes := Lang(doc, resp.Charset.Charset, true) 115 | 116 | fmt.Println(resp.Charset) 117 | fmt.Println(langRes) 118 | 119 | // 标题 120 | linkTitles, filters := extract.WebLinkTitles(doc, resp.RequestURL, true) 121 | 122 | // 分类链接和子域名列表 123 | linkRes, domainRes := extract.LinkTypes(linkTitles, langRes.Lang, nil) 124 | 125 | // 分类链接和子域名列表, 规则 126 | // rules := map[string][]string{ 127 | // "cankaoxiaoxi.com": []string{ 128 | // "\\d{7}\\.shtml$", 129 | // }, 130 | // } 131 | // linkRes, domainRes := extract.LinkTypes(linkTitles, langRes.Lang, rules) 132 | 133 | fmt.Println("all:", len(linkTitles)) 134 | fmt.Println("content:", len(linkRes.Content)) 135 | fmt.Println("list:", len(linkRes.List)) 136 | fmt.Println("unknown:", len(linkRes.Unknown)) 137 | fmt.Println("none:", len(linkRes.None)) 138 | 139 | i := 0 140 | for a, title := range filters { 141 | i = i + 1 142 | fmt.Println(i, "filter:"+a+"\t=>\t"+title) 143 | } 144 | i = 0 145 | for subdomain := range domainRes { 146 | i = i + 1 147 | fmt.Println(i, "domain:"+subdomain) 148 | } 149 | i = 0 150 | for a, title := range linkRes.Content { 151 | i = i + 1 152 | fmt.Println(i, "content:"+a+"\t=>\t"+title) 153 | } 154 | i = 0 155 | for a, title := range linkRes.Unknown { 156 | i = i + 1 157 | fmt.Println(i, "unknown:"+a+"\t=>\t"+title) 158 | } 159 | i = 0 160 | for a, title := range linkRes.List { 161 | i = i + 1 162 | fmt.Println(i, "list:"+a+"\t=>\t"+title) 163 | } 164 | i = 0 165 | for a, title := range linkRes.None { 166 | i = i + 1 167 | fmt.Println(i, "none:"+a+"\t=>\t"+title) 168 | } 169 | 170 | } 171 | } 172 | 173 | func TestDetectIcp(t *testing.T) { 174 | var urlStrs = []string{ 175 | // "http://suosi.com.cn", 176 | "https://www.163.com", 177 | // "https://www.sohu.com", 178 | // "https://www.qq.com", 179 | // "https://www.hexun.com", 180 | // "https://www.wfmc.edu.cn/", 181 | // "https://www.cankaoxiaoxi.com/", 182 | } 183 | 184 | for _, urlStr := range urlStrs { 185 | 186 | resp, err := HttpGetResp(urlStr, nil, 30000) 187 | 188 | t.Log(err) 189 | t.Log(urlStr) 190 | 191 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body)) 192 | doc.Find(DefaultDocRemoveTags).Remove() 193 | icp, loc := extract.Icp(doc) 194 | t.Log(icp, loc) 195 | } 196 | } 197 | 198 | func TestLangFromUtf8Body(t *testing.T) { 199 | var urlStrs = []string{ 200 | // "https://www.163.com", 201 | // "https://english.news.cn", 202 | // "https://jp.news.cn", 203 | // "https://kr.news.cn", 204 | // "https://arabic.news.cn", 205 | // "https://www.bbc.com", 206 | // "http://government.ru", 207 | // "https://french.news.cn", 208 | // "https://www.gouvernement.fr", 209 | // "http://live.siammedia.org/", 210 | // "http://hanoimoi.com.vn", 211 | // "https://www.commerce.gov.mm", 212 | // "https://www.rrdmyanmar.gov.mm", 213 | "https://czql.gov.cn/", 214 | } 215 | 216 | for _, urlStr := range urlStrs { 217 | resp, _ := fun.HttpGetResp(urlStr, nil, 30000) 218 | 219 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body)) 220 | doc.Find(DefaultDocRemoveTags).Remove() 221 | 222 | start := fun.Timestamp(true) 223 | lang, pos := LangFromUtf8Body(doc, false) 224 | t.Log(urlStr) 225 | t.Log(lang) 226 | t.Log(pos) 227 | 
t.Log(fun.Timestamp(true) - start) 228 | 229 | } 230 | } 231 | 232 | func TestDetectFriendDomainDo(t *testing.T) { 233 | var domains = []string{ 234 | "northnews.cn", 235 | } 236 | 237 | for _, domain := range domains { 238 | friendDomains, err := DetectFriendDomainDo(domain, 10000) 239 | 240 | t.Log(err) 241 | t.Log(friendDomains) 242 | } 243 | } 244 | -------------------------------------------------------------------------------- /spider.go: -------------------------------------------------------------------------------- 1 | package spider 2 | 3 | import ( 4 | "bytes" 5 | "errors" 6 | "regexp" 7 | "strings" 8 | 9 | "github.com/PuerkitoBio/goquery" 10 | "github.com/suosi-inc/go-pkg-spider/extract" 11 | "github.com/x-funs/go-fun" 12 | ) 13 | 14 | const ( 15 | RegexHostnameIp = `\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}` 16 | 17 | RegexMetaRefresh = `(?i)url=(.+)` 18 | ) 19 | 20 | var ( 21 | DefaultDocRemoveTags = "script,noscript,style,iframe,br,link,svg" 22 | 23 | RegexHostnameIpPattern = regexp.MustCompile(RegexHostnameIp) 24 | 25 | regexMetaRefreshPattern = regexp.MustCompile(RegexMetaRefresh) 26 | ) 27 | 28 | type LinkData struct { 29 | LinkRes *extract.LinkRes 30 | Filters map[string]string 31 | SubDomains map[string]bool 32 | } 33 | 34 | // GetLinkData 获取页面链接数据 35 | func GetLinkData(urlStr string, strictDomain bool, timeout int, retry int) (*LinkData, error) { 36 | if retry <= 0 { 37 | retry = 1 38 | } 39 | 40 | errs := make([]string, 0) 41 | 42 | for i := 0; i < retry; i++ { 43 | linkData, err := GetLinkDataDo(urlStr, strictDomain, nil, nil, timeout) 44 | if err == nil { 45 | return linkData, err 46 | } else { 47 | errs = append(errs, err.Error()) 48 | } 49 | } 50 | 51 | return nil, errors.New("ErrorLinkRes" + fun.ToString(errs)) 52 | } 53 | 54 | // GetLinkDataWithReq 获取页面链接数据 55 | func GetLinkDataWithReq(urlStr string, strictDomain bool, req *HttpReq, timeout int, retry int) (*LinkData, error) { 56 | if retry <= 0 { 57 | retry = 1 58 | } 59 | 60 | errs := make([]string, 0) 61 | 62 | for i := 0; i < retry; i++ { 63 | linkData, err := GetLinkDataDo(urlStr, strictDomain, nil, req, timeout) 64 | if err == nil { 65 | return linkData, err 66 | } else { 67 | errs = append(errs, err.Error()) 68 | } 69 | } 70 | 71 | return nil, errors.New("ErrorLinkRes" + fun.ToString(errs)) 72 | } 73 | 74 | // GetLinkDataWithReqAndRule 获取页面链接数据 75 | func GetLinkDataWithReqAndRule(urlStr string, strictDomain bool, rules extract.LinkTypeRule, req *HttpReq, timeout int, retry int) (*LinkData, error) { 76 | if retry <= 0 { 77 | retry = 1 78 | } 79 | 80 | errs := make([]string, 0) 81 | 82 | for i := 0; i < retry; i++ { 83 | linkData, err := GetLinkDataDo(urlStr, strictDomain, rules, req, timeout) 84 | if err == nil { 85 | return linkData, err 86 | } else { 87 | errs = append(errs, err.Error()) 88 | } 89 | } 90 | 91 | return nil, errors.New("ErrorLinkRes" + fun.ToString(errs)) 92 | } 93 | 94 | // GetLinkDataWithRule 获取页面链接数据 95 | func GetLinkDataWithRule(urlStr string, strictDomain bool, rules extract.LinkTypeRule, timeout int, retry int) (*LinkData, error) { 96 | if retry <= 0 { 97 | retry = 1 98 | } 99 | 100 | errs := make([]string, 0) 101 | 102 | for i := 0; i < retry; i++ { 103 | linkData, err := GetLinkDataDo(urlStr, strictDomain, rules, nil, timeout) 104 | if err == nil { 105 | return linkData, err 106 | } else { 107 | errs = append(errs, err.Error()) 108 | } 109 | } 110 | 111 | return nil, errors.New("ErrorLinkRes" + fun.ToString(errs)) 112 | } 113 | 114 | // GetLinkDataDo 获取页面链接数据 115 | func 
GetLinkDataDo(urlStr string, strictDomain bool, rules extract.LinkTypeRule, req *HttpReq, timeout int) (*LinkData, error) { 116 | if timeout == 0 { 117 | timeout = 10000 118 | } 119 | 120 | if req == nil { 121 | req = &HttpReq{ 122 | HttpReq: &fun.HttpReq{ 123 | MaxContentLength: HttpDefaultMaxContentLength, 124 | MaxRedirect: 3, 125 | }, 126 | ForceTextContentType: true, 127 | } 128 | } 129 | 130 | resp, err := HttpGetResp(urlStr, req, timeout) 131 | if resp != nil && err == nil && resp.Success { 132 | // 解析 HTML 133 | doc, docErr := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body)) 134 | if docErr == nil { 135 | linkData := &LinkData{} 136 | 137 | doc.Find(DefaultDocRemoveTags).Remove() 138 | 139 | // 语言 140 | langRes := Lang(doc, resp.Charset.Charset, true) 141 | 142 | // 站内链接 143 | linkTitles, filters := extract.WebLinkTitles(doc, resp.RequestURL, strictDomain) 144 | 145 | // 链接分类 146 | linkRes, subDomains := extract.LinkTypes(linkTitles, langRes.Lang, rules) 147 | 148 | linkData.LinkRes = linkRes 149 | linkData.Filters = filters 150 | linkData.SubDomains = subDomains 151 | 152 | return linkData, nil 153 | } else { 154 | return nil, errors.New("ErrorDocParse") 155 | } 156 | } 157 | 158 | return nil, errors.New("ErrorRequest") 159 | } 160 | 161 | // GetNews 获取链接新闻数据 162 | func GetNews(urlStr string, title string, timeout int, retry int) (*extract.News, *HttpResp, error) { 163 | if retry <= 0 { 164 | retry = 1 165 | } 166 | 167 | errs := make([]string, 0) 168 | 169 | for i := 0; i < retry; i++ { 170 | news, resp, err := GetNewsDo(urlStr, title, nil, timeout) 171 | if err == nil { 172 | return news, resp, nil 173 | } else { 174 | errs = append(errs, err.Error()) 175 | } 176 | } 177 | 178 | return nil, nil, errors.New("ErrorRequest" + fun.ToString(errs)) 179 | } 180 | 181 | // GetNewsWithReq 获取链接新闻数据 182 | func GetNewsWithReq(urlStr string, title string, req *HttpReq, timeout int, retry int) (*extract.News, *HttpResp, error) { 183 | if retry <= 0 { 184 | retry = 1 185 | } 186 | 187 | errs := make([]string, 0) 188 | 189 | for i := 0; i < retry; i++ { 190 | news, resp, err := GetNewsDo(urlStr, title, req, timeout) 191 | if err == nil { 192 | return news, resp, nil 193 | } else { 194 | errs = append(errs, err.Error()) 195 | } 196 | } 197 | 198 | return nil, nil, errors.New("ErrorRequest" + fun.ToString(errs)) 199 | } 200 | 201 | // GetNewsDo 获取链接新闻数据 202 | func GetNewsDo(urlStr string, title string, req *HttpReq, timeout int) (*extract.News, *HttpResp, error) { 203 | return getNewsDoTop(urlStr, title, req, timeout, true) 204 | } 205 | 206 | // getNewsDoTop 获取链接新闻数据 207 | func getNewsDoTop(urlStr string, title string, req *HttpReq, timeout int, top bool) (*extract.News, *HttpResp, error) { 208 | if timeout == 0 { 209 | timeout = HttpDefaultTimeOut 210 | } 211 | 212 | if req == nil { 213 | req = &HttpReq{ 214 | HttpReq: &fun.HttpReq{ 215 | MaxContentLength: HttpDefaultMaxContentLength, 216 | MaxRedirect: 2, 217 | }, 218 | ForceTextContentType: true, 219 | } 220 | } 221 | 222 | resp, err := HttpGetResp(urlStr, req, timeout) 223 | 224 | if resp != nil && err == nil && resp.Success { 225 | doc, docErr := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body)) 226 | if docErr == nil { 227 | contentDoc := goquery.CloneDocument(doc) 228 | doc.Find(DefaultDocRemoveTags).Remove() 229 | 230 | // 具有 HTML 跳转属性, 如果为本域名下, 则跳转一次 231 | if top { 232 | if refresh, exists := doc.Find("meta[http-equiv='refresh' i]").Attr("content"); exists { 233 | refreshMatch := 
regexMetaRefreshPattern.FindStringSubmatch(refresh) 234 | if len(refreshMatch) > 1 { 235 | requestHostname := resp.RequestURL.Hostname() 236 | requestTopDomain := extract.DomainTop(requestHostname) 237 | refreshUrl := strings.TrimSpace(refreshMatch[1]) 238 | if r, err := fun.UrlParse(refreshUrl); err == nil { 239 | refreshHostname := r.Hostname() 240 | refreshTopDomain := extract.DomainTop(refreshHostname) 241 | if refreshTopDomain != "" && refreshTopDomain == requestTopDomain { 242 | return getNewsDoTop(refreshUrl, title, req, timeout, false) 243 | } 244 | } 245 | } 246 | } 247 | } 248 | 249 | // 语言 250 | langRes := Lang(doc, resp.Charset.Charset, false) 251 | 252 | // 正文抽取 253 | content := extract.NewContent(contentDoc, langRes.Lang, title, urlStr) 254 | news := content.ExtractNews() 255 | 256 | return news, resp, nil 257 | } else { 258 | return nil, resp, errors.New("ErrorDocParse") 259 | } 260 | } 261 | 262 | return nil, nil, errors.New("ErrorRequest") 263 | } 264 | -------------------------------------------------------------------------------- /spider_news.go: -------------------------------------------------------------------------------- 1 | package spider 2 | 3 | import ( 4 | "strings" 5 | "sync" 6 | "time" 7 | 8 | "github.com/x-funs/go-fun" 9 | ) 10 | 11 | // 新闻采集器结构体 12 | type NewsSpider struct { 13 | Url string // 根链接 14 | Depth uint8 // 采集页面深度 15 | seen map[string]bool // 是否已采集 16 | IsSub bool // 是否采集子域名 17 | linkChan chan *NewsData // NewsData 通道共享 18 | contentChan chan *NewsContent // NewsContent 通道共享 19 | ProcessFunc func(...any) // 处理函数 20 | RetryTime int // 请求重试次数 21 | TimeOut int // 请求响应时间 22 | wg *sync.WaitGroup // 同步等待组 23 | Req *HttpReq // 请求体 24 | Ctx any // 任务详情上下文,传入ProcessFunc函数中 25 | } 26 | 27 | // 新闻内容结构体 28 | type NewsContent struct { 29 | Url string // 链接 30 | Title string // 标题 31 | Time string // 发布时间 32 | Content string // 正文纯文本 33 | Lang string // 语种 34 | } 35 | 36 | // 新闻LinkData总数据 37 | type NewsData struct { 38 | *LinkData 39 | Depth uint8 // 采集深度溯源 40 | ListUrl string // 列表页溯源 41 | Error error 42 | } 43 | 44 | // 自定义配置函数 45 | type Option func(*NewsSpider) 46 | 47 | // 原型链接口 48 | type Prototype interface { 49 | Clone() Prototype 50 | } 51 | 52 | // NewNewsSpider 初始化 53 | func NewNewsSpider(url string, depth uint8, pf func(...any), ctx any, options ...Option) *NewsSpider { 54 | n := &NewsSpider{ 55 | Url: url, 56 | Depth: depth, 57 | seen: map[string]bool{}, 58 | IsSub: false, 59 | linkChan: make(chan *NewsData), 60 | contentChan: make(chan *NewsContent), 61 | ProcessFunc: pf, 62 | RetryTime: 2, 63 | TimeOut: 20000, 64 | wg: &sync.WaitGroup{}, 65 | Req: nil, 66 | Ctx: ctx, 67 | } 68 | 69 | // 函数式选项模式 70 | for _, option := range options { 71 | option(n) 72 | } 73 | 74 | return n 75 | } 76 | 77 | func WithRetryTime(retryTime int) Option { 78 | return func(n *NewsSpider) { 79 | n.RetryTime = retryTime 80 | } 81 | } 82 | 83 | func WithTimeOut(timeout int) Option { 84 | return func(n *NewsSpider) { 85 | n.TimeOut = timeout 86 | } 87 | } 88 | 89 | func WithReq(req *HttpReq) Option { 90 | return func(n *NewsSpider) { 91 | n.Req = req 92 | } 93 | } 94 | 95 | func WithIsSub(isSub bool) Option { 96 | return func(n *NewsSpider) { 97 | n.IsSub = isSub 98 | } 99 | } 100 | 101 | // 原型链结构体拷贝 102 | func (n *NewsSpider) Clone() Prototype { 103 | nc := *n 104 | 105 | // 拷贝时需重置chan和wg等字段 106 | nc.seen = map[string]bool{} 107 | nc.linkChan = make(chan *NewsData) 108 | nc.contentChan = make(chan *NewsContent) 109 | nc.wg = &sync.WaitGroup{} 110 | 111 | return &nc 112 | } 113 
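// ---- Illustrative usage sketch (not part of the original source) ----
// A minimal caller for NewsSpider; the URL, depth, timeout and retry values
// below are placeholders, not values taken from this repository. ProcessFunc
// receives the channel payload plus Ctx: *NewsContent when driven by
// GetContentNews, *NewsData when driven by GetLinkRes.
//
//	handler := func(v ...any) {
//		switch data := v[0].(type) {
//		case *NewsContent:
//			fmt.Println("news:", data.Url, data.Title, data.Lang)
//		case *NewsData:
//			fmt.Println("links:", data.ListUrl, data.Error)
//		}
//	}
//	s := NewNewsSpider("https://www.example.com", 2, handler, nil,
//		WithTimeOut(15000), WithRetryTime(2), WithIsSub(false))
//	s.GetContentNews() // blocks until all depths have been crawled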
| 114 | // GetNews 开始采集 115 | func (n *NewsSpider) GetNews(linksHandleFunc func(*NewsData)) { 116 | // 初始化列表页和内容页切片 117 | var ( 118 | listSlice []string 119 | listSliceTemp []string 120 | subDomainSlice []string 121 | ) 122 | 123 | // 获取首页url和协议 124 | scheme, indexUrl := GetIndexUrl(n.Url) 125 | 126 | // 首次添加当前页 127 | listSliceTemp = append(listSliceTemp, n.Url) 128 | 129 | if n.IsSub { 130 | // 先探测出首页url的所有子域名 131 | subDomains, _ := GetSubdomains(indexUrl, n.Req, n.TimeOut, n.RetryTime*100) 132 | 133 | for subDomain := range subDomains { 134 | subDomainSlice = append(subDomainSlice, subDomain) 135 | listSliceTemp = append(listSliceTemp, subDomain) 136 | } 137 | } 138 | 139 | // 深度优先循环遍历获取页面列表页和内容页 140 | for i := 0; i < int(n.Depth); i++ { 141 | listS, _ := n.GetNewsLinkRes(linksHandleFunc, scheme, listSliceTemp, uint8(i+1), n.TimeOut, n.RetryTime) 142 | listSlice = append(listSlice, listS...) 143 | 144 | // 重置循环列表页 145 | if len(listS) == 0 { 146 | break 147 | } 148 | listSliceTemp = listS 149 | } 150 | } 151 | 152 | // GetNewsLinkRes 获取news页面链接分组, 仅返回列表页和内容页 153 | func (n *NewsSpider) GetNewsLinkRes(linksHandleFunc func(*NewsData), scheme string, urls []string, depth uint8, timeout int, retry int) ([]string, error) { 154 | listSlice := []string{} 155 | 156 | for _, url := range urls { 157 | if !strings.Contains(url, "http") { 158 | url = scheme + url 159 | } 160 | 161 | if linkData, err := GetLinkDataWithReq(url, true, n.Req, timeout, retry); err == nil { 162 | for l := range linkData.LinkRes.List { 163 | if !n.seen[l] { 164 | n.seen[l] = true 165 | listSlice = append(listSlice, l) 166 | } 167 | } 168 | 169 | newsData := &NewsData{linkData, depth, url, nil} 170 | 171 | n.wg.Add(1) 172 | go linksHandleFunc(newsData) 173 | 174 | } else { 175 | // 报错空的LinkData也需要push 176 | newsData := &NewsData{nil, depth, url, err} 177 | 178 | n.wg.Add(1) 179 | go linksHandleFunc(newsData) 180 | 181 | // return nil, errors.New("GetNewsLinkRes Err") 182 | } 183 | } 184 | 185 | return listSlice, nil 186 | } 187 | 188 | // CrawlLinkRes 直接推送列表页内容页 189 | func (n *NewsSpider) CrawlLinkRes(l *NewsData) { 190 | defer n.wg.Done() 191 | // defer n.sleep() 192 | 193 | n.PushLinks(l) 194 | } 195 | 196 | // GetContentNews 解析内容页详情数据 197 | func (n *NewsSpider) CrawlContentNews(l *NewsData) { 198 | defer n.wg.Done() 199 | // defer n.sleep() 200 | 201 | if l.Error == nil { 202 | for c, v := range l.LinkRes.Content { 203 | if !n.seen[c] { 204 | n.seen[c] = true 205 | cc := map[string]string{} 206 | cc[c] = v 207 | 208 | n.wg.Add(1) 209 | go n.ReqContentNews(cc) 210 | } 211 | } 212 | } 213 | } 214 | 215 | // ReqContentNews 获取内容页详情数据 216 | func (n *NewsSpider) ReqContentNews(content map[string]string) { 217 | defer n.wg.Done() 218 | 219 | time.Sleep(time.Duration(fun.RandomInt(10, 100)) * time.Millisecond) 220 | 221 | for url, title := range content { 222 | if news, _, err := GetNews(url, title, n.TimeOut, n.RetryTime); err == nil { 223 | newsData := &NewsContent{} 224 | newsData.Url = url 225 | newsData.Title = news.Title 226 | newsData.Content = news.Content 227 | newsData.Time = news.TimeLocal 228 | newsData.Lang = news.Lang 229 | 230 | n.PushContentNews(newsData) 231 | } 232 | } 233 | } 234 | 235 | // PushLinks 推送links数据 236 | func (n *NewsSpider) PushLinks(data *NewsData) { 237 | n.linkChan <- data 238 | } 239 | 240 | // PushContentNews 推送详情页数据 241 | func (n *NewsSpider) PushContentNews(data *NewsContent) { 242 | n.contentChan <- data 243 | } 244 | 245 | // Wait wg阻塞等待退出 246 | func (n *NewsSpider) Wait() { 247 | 
n.wg.Wait() 248 | } 249 | 250 | // Close 关闭Chan 251 | func (n *NewsSpider) Close() { 252 | close(n.linkChan) 253 | close(n.contentChan) 254 | } 255 | 256 | // process 处理chan data函数 257 | func (n *NewsSpider) process(processFunc func(...any)) { 258 | for { 259 | select { 260 | case data, ok := <-n.linkChan: 261 | if !ok { 262 | return 263 | } 264 | processFunc(data, n.Ctx) 265 | case data, ok := <-n.contentChan: 266 | if !ok { 267 | return 268 | } 269 | processFunc(data, n.Ctx) 270 | } 271 | } 272 | } 273 | 274 | // GetLinkRes 回调获取LinkRes数据 275 | func (n *NewsSpider) GetLinkRes() { 276 | n.GetNews(n.CrawlLinkRes) 277 | 278 | go n.process(n.ProcessFunc) 279 | 280 | n.Wait() 281 | defer n.Close() 282 | } 283 | 284 | // GetContentNews 回调获取内容页数据 285 | func (n *NewsSpider) GetContentNews() { 286 | n.GetNews(n.CrawlContentNews) 287 | 288 | go n.process(n.ProcessFunc) 289 | 290 | n.Wait() 291 | defer n.Close() 292 | } 293 | 294 | // GetSubdomains 获取subDomain 295 | func GetSubdomains(url string, req *HttpReq, timeout int, retry int) (map[string]bool, error) { 296 | if linkData, err := GetLinkDataWithReq(url, true, req, timeout, retry); err == nil { 297 | return linkData.SubDomains, nil 298 | } else { 299 | return nil, err 300 | } 301 | } 302 | 303 | // GetIndexUrl 获取首页url 304 | func GetIndexUrl(url string) (string, string) { 305 | urlSlice := strings.Split(url, "/") 306 | if len(urlSlice) == 1 { 307 | // domain 308 | return "https://", "https://www." + url 309 | } 310 | scheme := urlSlice[0] + "//" 311 | indexUrl := scheme + urlSlice[2] 312 | return scheme, indexUrl 313 | } 314 | 315 | // sleep depth只有一层时,需要等待几秒,避免wg done后直接退出,导致select来不及取出数据 316 | func (n *NewsSpider) sleep() { 317 | if n.Depth == 1 { 318 | time.Sleep(2 * time.Second) 319 | } 320 | } 321 | -------------------------------------------------------------------------------- /extract/web.go: -------------------------------------------------------------------------------- 1 | package extract 2 | 3 | import ( 4 | "errors" 5 | "net/url" 6 | "path" 7 | "regexp" 8 | "strings" 9 | 10 | "github.com/PuerkitoBio/goquery" 11 | "github.com/x-funs/go-fun" 12 | ) 13 | 14 | const ( 15 | RegexHostnameIp = `\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}` 16 | ) 17 | 18 | var ( 19 | filterUrlSuffix = []string{ 20 | ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".txt", ".xml", 21 | ".pdf", ".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx", 22 | ".zip", ".rar", ".7z", ".gz", ".apk", ".cgi", ".exe", ".bz2", ".play", 23 | ".rss", ".sig", ".sgf", 24 | ".mp3", ".mp4", ".rm", ".rmvb", ".mov", ".ogv", ".flv", 25 | } 26 | 27 | invalidUrlCharsets = []string{"{", "}", "[", "]", "@", "$", "<", ">", "\""} 28 | 29 | titleZhSplits = []string{"_", "|", "-", "-", "|", "—", "*", ":", ",", ",", ":", "·", ">>", "="} 30 | 31 | titleZhContentSplits = []string{"_", "|", "-", "-", "|", "—"} 32 | 33 | titleEnSplits = []string{" - ", " | ", ":"} 34 | 35 | RegexHostnameIpPattern = regexp.MustCompile(RegexHostnameIp) 36 | ) 37 | 38 | // WebTitle 返回网页标题, 最大 128 个字符 39 | func WebTitle(doc *goquery.Document, maxLength int) string { 40 | var title string 41 | titleNode := doc.Find("title") 42 | if titleNode.Size() > 1 { 43 | // 竟然有多个 title, 只取第一个 44 | title = titleNode.First().Text() 45 | } else { 46 | title = titleNode.Text() 47 | } 48 | 49 | title = fun.RemoveLines(title) 50 | title = strings.TrimSpace(title) 51 | 52 | if maxLength > 0 && maxLength < 128 { 53 | return fun.SubString(title, 0, maxLength) 54 | } else { 55 | return fun.SubString(title, 0, 128) 56 | } 57 | } 58 | 59 | // WebTitleClean 
返回尽量清洗后的网页标题 60 | func WebTitleClean(title string, lang string) string { 61 | // 中文网站, 查找中文网站的分割标记, 找到任意一个, 从尾部循环删除后返回 62 | if lang == "zh" { 63 | 64 | for _, split := range titleZhSplits { 65 | if fun.HasPrefixCase(title, split) { 66 | title = fun.RemovePrefix(title, split) 67 | } 68 | } 69 | 70 | // 去除首页开头 71 | if fun.HasPrefixCase(title, "首页") { 72 | title = regexp.MustCompile("首页([ |\\-_-—|])*").ReplaceAllString(title, "") 73 | } 74 | 75 | titleClean := title 76 | for _, split := range titleZhSplits { 77 | var exists bool 78 | end := strings.LastIndex(titleClean, split) 79 | if end != -1 { 80 | exists = true 81 | for { 82 | titleClean = strings.TrimSpace(titleClean[:end]) 83 | end = strings.LastIndex(titleClean, split) 84 | 85 | if end == -1 { 86 | break 87 | } 88 | } 89 | if exists { 90 | break 91 | } 92 | } 93 | } 94 | 95 | // 去除尾巴 96 | if titleClean != "首页" { 97 | titleClean = fun.RemoveSuffix(titleClean, "首页") 98 | } 99 | 100 | titleClean = fun.RemoveSign(titleClean) 101 | 102 | return titleClean 103 | 104 | } else { 105 | // 其他, 查找英文分割标记, 如果找到, 从尾部删除一次返回 106 | for _, split := range titleEnSplits { 107 | end := strings.LastIndex(title, split) 108 | if end != -1 { 109 | titleClean := strings.TrimSpace(title[:end]) 110 | return titleClean 111 | } 112 | } 113 | } 114 | 115 | return title 116 | } 117 | 118 | // WebContentTitleClean 返回内容页尽量清洗后的网页标题 119 | func WebContentTitleClean(title string, lang string) string { 120 | // 中文网站, 查找中文网站的分割标记, 找到任意一个, 从尾部循环删除后返回 121 | if lang == "zh" { 122 | for _, split := range titleZhContentSplits { 123 | if fun.HasPrefixCase(title, split) { 124 | title = fun.RemovePrefix(title, split) 125 | } 126 | } 127 | 128 | titleClean := title 129 | for _, split := range titleZhContentSplits { 130 | var exists bool 131 | end := strings.LastIndex(titleClean, split) 132 | if end != -1 { 133 | exists = true 134 | for { 135 | titleClean = strings.TrimSpace(titleClean[:end]) 136 | end = strings.LastIndex(titleClean, split) 137 | 138 | if end == -1 { 139 | break 140 | } 141 | } 142 | if exists { 143 | break 144 | } 145 | } 146 | } 147 | 148 | return titleClean 149 | 150 | } else { 151 | // 其他, 查找英文分割标记, 如果找到, 从尾部删除一次返回 152 | for _, split := range titleEnSplits { 153 | end := strings.LastIndex(title, split) 154 | if end != -1 { 155 | titleClean := strings.TrimSpace(title[:end]) 156 | return titleClean 157 | } 158 | } 159 | } 160 | 161 | return title 162 | } 163 | 164 | // WebKeywords 返回网页 Keyword 165 | func WebKeywords(doc *goquery.Document) string { 166 | keywords := doc.Find("meta[name='keywords' i]").AttrOr("content", "") 167 | keywords = fun.RemoveLines(keywords) 168 | keywords = strings.TrimSpace(keywords) 169 | 170 | return keywords 171 | } 172 | 173 | // WebDescription 返回网页描述, 最大 384 个字符 174 | func WebDescription(doc *goquery.Document, maxLength int) string { 175 | description := doc.Find("meta[name='description' i]").AttrOr("content", "") 176 | description = fun.RemoveLines(description) 177 | description = strings.TrimSpace(description) 178 | 179 | if maxLength > 0 && maxLength < 384 { 180 | return fun.SubString(description, 0, maxLength) 181 | } else { 182 | return fun.SubString(description, 0, 384) 183 | } 184 | } 185 | 186 | // WebLinkTitles 返回网页链接和锚文本 187 | func WebLinkTitles(doc *goquery.Document, baseUrl *url.URL, strictDomain bool) (map[string]string, map[string]string) { 188 | var linkTitles = make(map[string]string) 189 | var filters = make(map[string]string) 190 | 191 | // 当前请求的 urlStr 192 | if baseUrl == nil { 193 | return linkTitles, filters 194 | } 
195 | 196 | // 获取所有 a 链接 197 | aTags := doc.Find("a") 198 | if aTags.Size() > 0 { 199 | var tmpLinks = make(map[string]string) 200 | 201 | // 提取所有的 a 链接 202 | aTags.Each(func(i int, s *goquery.Selection) { 203 | tmpLink, exists := s.Attr("href") 204 | if exists { 205 | tmpLink = fun.RemoveLines(tmpLink) 206 | tmpLink = strings.TrimSpace(tmpLink) 207 | 208 | tmpTitle := s.Text() 209 | tmpTitle = fun.NormaliseSpace(tmpTitle) 210 | tmpTitle = strings.TrimSpace(tmpTitle) 211 | if tmpLink != "" && tmpTitle != "" { 212 | // 如果链接已存在, 保留长标题 213 | if _, exists := tmpLinks[tmpLink]; exists { 214 | oldTitle := tmpLinks[tmpLink] 215 | if len(oldTitle) < len(tmpTitle) { 216 | tmpLinks[tmpLink] = tmpTitle 217 | } 218 | } else { 219 | tmpLinks[tmpLink] = tmpTitle 220 | } 221 | } 222 | } 223 | }) 224 | 225 | // 过滤链接 226 | tmpLinkLen := len(tmpLinks) 227 | if tmpLinkLen > 0 { 228 | for link, title := range tmpLinks { 229 | if a, err := filterUrl(link, baseUrl, strictDomain); err == nil { 230 | linkTitles[a] = title 231 | } else { 232 | filters[a] = err.Error() 233 | } 234 | } 235 | } 236 | } 237 | 238 | return linkTitles, filters 239 | } 240 | 241 | // filterUrl 过滤 url 242 | func filterUrl(link string, baseUrl *url.URL, strictDomain bool) (string, error) { 243 | var urlStr string 244 | 245 | // 过滤掉链接中包含特殊字符的 246 | if fun.ContainsAny(link, invalidUrlCharsets...) { 247 | return link, errors.New("invalid url with illegal characters") 248 | } 249 | 250 | // 转换为绝对路径 251 | if !fun.HasPrefixCase(link, "http") && !fun.HasPrefixCase(link, "https") { 252 | if l, err := baseUrl.Parse(link); err == nil { 253 | urlStr = l.String() 254 | } else { 255 | return link, errors.New("invalid url with baseUrl parse error") 256 | } 257 | } else { 258 | urlStr = link 259 | } 260 | 261 | // 解析验证 262 | u, err := fun.UrlParse(urlStr) 263 | if err != nil { 264 | return urlStr, errors.New("invalid url with parse error") 265 | } 266 | 267 | // 验证转换后是否是绝对路径 268 | if !u.IsAbs() { 269 | return urlStr, errors.New("invalid url with not absolute url") 270 | } 271 | 272 | // 验证非常规端口 273 | if u.Port() != "" { 274 | return urlStr, errors.New("invalid url with not 80 port") 275 | } 276 | 277 | // 验证主机名 278 | if RegexHostnameIpPattern.MatchString(u.Hostname()) { 279 | return urlStr, errors.New("invalid url with ip hostname") 280 | } 281 | 282 | // 过滤掉明显错误的后缀 283 | ext := path.Ext(u.Path) 284 | if strings.Contains(ext, ".") { 285 | ext = strings.ToLower(ext) 286 | if fun.SliceContains(filterUrlSuffix, ext) { 287 | return urlStr, errors.New("invalid url with suffix") 288 | } 289 | } 290 | 291 | // 过滤掉站外链接 292 | if strictDomain { 293 | hostname := u.Hostname() 294 | domainTop := DomainTop(hostname) 295 | baseDomainTop := DomainTop(baseUrl.Hostname()) 296 | if domainTop != baseDomainTop { 297 | return urlStr, errors.New("invalid url with strict domain") 298 | } 299 | } 300 | 301 | return urlStr, nil 302 | } 303 | -------------------------------------------------------------------------------- /detect.go: -------------------------------------------------------------------------------- 1 | package spider 2 | 3 | import ( 4 | "bytes" 5 | "errors" 6 | "net/url" 7 | "strings" 8 | 9 | "github.com/PuerkitoBio/goquery" 10 | "github.com/suosi-inc/go-pkg-spider/extract" 11 | "github.com/x-funs/go-fun" 12 | ) 13 | 14 | type DomainRes struct { 15 | // 域名 16 | Domain string 17 | // 主页域名 18 | HomeDomain string 19 | // 协议 20 | Scheme string 21 | // 字符集 22 | Charset CharsetRes 23 | // 语种 24 | Lang LangRes 25 | // 国家 26 | Country string 27 | // 省份 28 | Province 
string 29 | // 分类 30 | Category string 31 | // 标题 32 | Title string 33 | // 标题 34 | TitleClean string 35 | // 描述 36 | Description string 37 | // ICP 38 | Icp string 39 | // 状态 40 | State bool 41 | // 状态码 42 | StatusCode int 43 | // 内容页链接数量 44 | ContentCount int 45 | // 列表页链接数量 46 | ListCount int 47 | // 子域名列表 48 | SubDomains map[string]bool 49 | } 50 | 51 | // DetectDomain 域名探测 52 | // DomainRes.State true 和 err nil 表示探测成功 53 | // DomainRes.State true 可能会返回 err, 如 doc 解析失败 54 | // DomainRes.State false 时根据 StatusCode 判断是请求是否成功或请求成功但响应失败(如404) 55 | func DetectDomain(domain string, timeout int, retry int) (*DomainRes, error) { 56 | if retry == 0 { 57 | retry = 1 58 | } 59 | 60 | for i := 0; i < retry; i++ { 61 | domainRes, err := DetectDomainDo(domain, true, timeout) 62 | if domainRes.StatusCode != 0 || err == nil { 63 | return domainRes, err 64 | } 65 | } 66 | 67 | var charset CharsetRes 68 | var lang LangRes 69 | domainRes := &DomainRes{ 70 | Charset: charset, 71 | Lang: lang, 72 | } 73 | return domainRes, errors.New("ErrorDomainDetect") 74 | } 75 | 76 | // DetectSubDomain 子域名探测 77 | // DomainRes.State true 和 err nil 表示探测成功 78 | // DomainRes.State true 可能会返回 err, 如 doc 解析失败 79 | // DomainRes.State false 时根据 StatusCode 判断是请求是否成功或请求成功但响应失败(如404) 80 | func DetectSubDomain(domain string, timeout int, retry int) (*DomainRes, error) { 81 | if retry == 0 { 82 | retry = 1 83 | } 84 | 85 | for i := 0; i < retry; i++ { 86 | domainRes, err := DetectDomainDo(domain, false, timeout) 87 | if domainRes.StatusCode != 0 || err == nil { 88 | return domainRes, err 89 | } 90 | } 91 | 92 | var charset CharsetRes 93 | var lang LangRes 94 | domainRes := &DomainRes{ 95 | Charset: charset, 96 | Lang: lang, 97 | } 98 | return domainRes, errors.New("ErrorDomainDetect") 99 | } 100 | 101 | func DetectDomainDo(domain string, isTop bool, timeout int) (*DomainRes, error) { 102 | if timeout == 0 { 103 | timeout = 10000 104 | } 105 | 106 | domainRes := &DomainRes{} 107 | 108 | req := &HttpReq{ 109 | HttpReq: &fun.HttpReq{ 110 | MaxContentLength: 10 * 1024 * 1024, 111 | MaxRedirect: 3, 112 | }, 113 | ForceTextContentType: true, 114 | } 115 | 116 | scheme := "http" 117 | 118 | // 是否进行首页探测 119 | var homes []string 120 | if isTop { 121 | homes = []string{"www", ""} 122 | } else { 123 | homes = []string{""} 124 | } 125 | 126 | for _, home := range homes { 127 | 128 | var urlStr string 129 | var homeDomain string 130 | if home != "" { 131 | homeDomain = home + fun.DOT + domain 132 | urlStr = scheme + "://" + homeDomain 133 | } else { 134 | homeDomain = domain 135 | urlStr = scheme + "://" + homeDomain 136 | } 137 | 138 | resp, err := HttpGetResp(urlStr, req, timeout) 139 | 140 | if resp != nil && err == nil && resp.Success { 141 | domainRes.Domain = domain 142 | domainRes.StatusCode = resp.StatusCode 143 | 144 | // 如果发生 HTTP 跳转, 则重新设置 homeDomain, 判断跳转后是否是同一个主域名, 如果域名改变则记录并返回错误 145 | domainRes.HomeDomain = homeDomain 146 | requestHostname := resp.RequestURL.Hostname() 147 | if domainRes.HomeDomain != requestHostname { 148 | requestTopDomain := extract.DomainTop(requestHostname) 149 | if requestTopDomain != "" && requestTopDomain != domain { 150 | // 验证主机名 151 | if RegexHostnameIpPattern.MatchString(requestHostname) { 152 | return domainRes, errors.New("ErrorRedirectHost") 153 | } 154 | // 验证非常规端口 155 | if resp.RequestURL.Port() != "" { 156 | return domainRes, errors.New("ErrorRedirectHost") 157 | } 158 | 159 | return domainRes, errors.New("ErrorRedirect:" + requestTopDomain) 160 | } 161 | 162 | domainRes.HomeDomain = 
requestHostname 163 | } 164 | 165 | // 如果发生了协议跳转, 则重新设置 scheme 166 | domainRes.Scheme = scheme 167 | if domainRes.Scheme != resp.RequestURL.Scheme { 168 | domainRes.Scheme = resp.RequestURL.Scheme 169 | } 170 | 171 | // 字符集 172 | domainRes.Charset = resp.Charset 173 | 174 | // 解析 HTML 175 | u, _ := url.Parse(urlStr) 176 | doc, docErr := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body)) 177 | if docErr == nil { 178 | doc.Find(DefaultDocRemoveTags).Remove() 179 | 180 | // 具有 HTML 跳转属性, HTTP 无法自动处理永远返回错误, 判断跳转后是否是同一个主域名, 记录并返回 181 | if refresh, exists := doc.Find("meta[http-equiv='refresh' i]").Attr("content"); exists { 182 | refreshMatch := regexMetaRefreshPattern.FindStringSubmatch(refresh) 183 | if len(refreshMatch) > 1 { 184 | refreshUrl := refreshMatch[1] 185 | if r, err := fun.UrlParse(refreshUrl); err == nil { 186 | refreshHostname := r.Hostname() 187 | refreshTopDomain := extract.DomainTop(refreshHostname) 188 | if refreshTopDomain != "" && refreshTopDomain != domain { 189 | // 验证主机名 190 | if RegexHostnameIpPattern.MatchString(refreshHostname) { 191 | return domainRes, errors.New("ErrorMetaJumpHost") 192 | } 193 | // 验证非常规端口 194 | if r.Port() != "" { 195 | return domainRes, errors.New("ErrorMetaJumpHost") 196 | } 197 | 198 | return domainRes, errors.New("ErrorMetaJump:" + refreshTopDomain) 199 | } 200 | } 201 | return domainRes, errors.New("ErrorMetaJump") 202 | } 203 | } 204 | 205 | // 中国 ICP 解析 206 | icp, province := extract.Icp(doc) 207 | if icp != "" && province != "" { 208 | domainRes.Country = "中国" 209 | domainRes.Icp = icp 210 | domainRes.Province = extract.ProvinceShortMap[province] 211 | } 212 | 213 | // 语言 214 | langRes := Lang(doc, resp.Charset.Charset, true) 215 | domainRes.Lang = langRes 216 | 217 | // 尽可能的探测一些信息国家/省份/类别 218 | if domainRes.Country == "" { 219 | country, province, category := extract.MetaFromHost(u.Hostname(), langRes.Lang) 220 | domainRes.Country = country 221 | domainRes.Province = province 222 | domainRes.Category = category 223 | } 224 | 225 | // 标题摘要 226 | domainRes.Title = extract.WebTitle(doc, 0) 227 | domainRes.TitleClean = extract.WebTitleClean(domainRes.Title, langRes.Lang) 228 | domainRes.Description = extract.WebDescription(doc, 0) 229 | 230 | // 站内链接 231 | linkTitles, _ := extract.WebLinkTitles(doc, resp.RequestURL, true) 232 | 233 | // 链接分类 234 | links, subDomains := extract.LinkTypes(linkTitles, langRes.Lang, nil) 235 | 236 | domainRes.ContentCount = len(links.Content) 237 | domainRes.ListCount = len(links.List) 238 | domainRes.SubDomains = subDomains 239 | 240 | domainRes.State = true 241 | 242 | return domainRes, nil 243 | } else { 244 | return domainRes, errors.New("ErrorDocParse") 245 | } 246 | } else { 247 | if resp != nil { 248 | domainRes.StatusCode = resp.StatusCode 249 | } 250 | } 251 | } 252 | 253 | return domainRes, errors.New("ErrorDomainDetect") 254 | } 255 | 256 | func DetectFriendDomain(domain string, timeout int, retry int) (map[string]string, error) { 257 | if retry == 0 { 258 | retry = 1 259 | } 260 | 261 | friendDomains := make(map[string]string, 0) 262 | 263 | for i := 0; i < retry; i++ { 264 | friendDomains, err := DetectFriendDomainDo(domain, timeout) 265 | if err == nil { 266 | return friendDomains, err 267 | } 268 | } 269 | 270 | return friendDomains, errors.New("ErrorDomainDetect") 271 | } 272 | 273 | func DetectFriendDomainDo(domain string, timeout int) (map[string]string, error) { 274 | if timeout == 0 { 275 | timeout = 10000 276 | } 277 | 278 | friendDomains := make(map[string]string, 0) 279 | 280 | req := 
&HttpReq{ 281 | HttpReq: &fun.HttpReq{ 282 | MaxContentLength: 10 * 1024 * 1024, 283 | MaxRedirect: 3, 284 | }, 285 | ForceTextContentType: true, 286 | } 287 | 288 | scheme := "http" 289 | homes := []string{"www", ""} 290 | 291 | for _, home := range homes { 292 | 293 | var urlStr string 294 | var homeDomain string 295 | if home != "" { 296 | homeDomain = home + fun.DOT + domain 297 | urlStr = scheme + "://" + homeDomain 298 | } else { 299 | homeDomain = domain 300 | urlStr = scheme + "://" + homeDomain 301 | } 302 | 303 | resp, err := HttpGetResp(urlStr, req, timeout) 304 | 305 | if resp != nil && err == nil && resp.Success { 306 | 307 | doc, docErr := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body)) 308 | if docErr == nil { 309 | doc.Find(DefaultDocRemoveTags).Remove() 310 | 311 | // 非限制域名所有链接 312 | linkTitles, _ := extract.WebLinkTitles(doc, resp.RequestURL, false) 313 | 314 | if len(linkTitles) > 0 { 315 | for link, title := range linkTitles { 316 | if link == "" || title == "" { 317 | continue 318 | } 319 | 320 | u, e := fun.UrlParse(link) 321 | if e != nil { 322 | continue 323 | } 324 | 325 | // 验证非常规端口 326 | if u.Port() != "" { 327 | continue 328 | } 329 | 330 | // 验证主机名 331 | if fun.Matches(`\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}`, u.Hostname()) { 332 | continue 333 | } 334 | 335 | pathDir := strings.TrimSpace(u.Path) 336 | if pathDir == "" || pathDir == fun.SLASH || pathDir == "/index.html" || pathDir == "/index.htm" || pathDir == "/index.shtml" { 337 | hostname := u.Hostname() 338 | domainTop := extract.DomainTop(hostname) 339 | baseDomainTop := domain 340 | if domainTop != baseDomainTop { 341 | friendDomains[domainTop] = title 342 | } 343 | } 344 | } 345 | } 346 | 347 | return friendDomains, nil 348 | } else { 349 | return friendDomains, errors.New("ErrorDocParse") 350 | } 351 | } else { 352 | return friendDomains, err 353 | } 354 | } 355 | 356 | return friendDomains, errors.New("ErrorDomainDetect") 357 | } 358 | -------------------------------------------------------------------------------- /spider_test.go: -------------------------------------------------------------------------------- 1 | package spider 2 | 3 | import ( 4 | "bytes" 5 | "crypto/tls" 6 | "fmt" 7 | "net/http" 8 | "net/url" 9 | "regexp" 10 | "strconv" 11 | "testing" 12 | "unicode/utf8" 13 | 14 | "github.com/PuerkitoBio/goquery" 15 | "github.com/microcosm-cc/bluemonday" 16 | "github.com/suosi-inc/go-pkg-spider/extract" 17 | "github.com/x-funs/go-fun" 18 | ) 19 | 20 | func BenchmarkHtmlParse(b *testing.B) { 21 | 22 | resp, _ := fun.HttpGetResp("https://www.163.com", nil, 30000) 23 | 24 | b.ResetTimer() 25 | for i := 0; i < b.N; i++ { 26 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body)) 27 | doc.Find(DefaultDocRemoveTags).Remove() 28 | } 29 | } 30 | 31 | func TestGoquery(t *testing.T) { 32 | body, _ := HttpGet("https://jp.news.cn/index.htm") 33 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(body)) 34 | 35 | // lang, exist := doc.Find("html").Attr("id") 36 | 37 | doc.Find("script,noscript,style,iframe,br,link,svg,textarea").Remove() 38 | text := doc.Find("body").Text() 39 | text = fun.RemoveSign(text) 40 | 41 | fmt.Println(text) 42 | } 43 | 44 | func TestRegex(t *testing.T) { 45 | str := ",.!,,D_NAME。!;‘’”“《》**dfs#%^&()-+我1431221 中国123漢字かどうかのjavaを<決定>$¥" 46 | r := regexp.MustCompile(`[\p{Hiragana}|\p{Katakana}]`) 47 | s := r.FindAllString(str, -1) 48 | t.Log(str) 49 | t.Log(s) 50 | } 51 | 52 | func TestUrlParse(t *testing.T) { 53 | var urlStrs = []string{ 54 | 
"https://www.163.com", 55 | "https://www.163.com/", 56 | "https://www.163.com/a", 57 | "https://www.163.com/aa.html", 58 | "https://www.163.com/a/b", 59 | "https://www.163.com/a/bb.html", 60 | "https://www.163.com/a/b/", 61 | "https://www.163.com/a/b/c", 62 | "https://www.163.com/a/b/cc.html", 63 | } 64 | 65 | for _, urlStr := range urlStrs { 66 | u, _ := url.Parse(urlStr) 67 | link := "javascript:;" 68 | absolute, err := u.Parse(link) 69 | t.Log(err) 70 | 71 | _, err = url.Parse(absolute.String()) 72 | if err != nil { 73 | t.Log(err) 74 | } 75 | 76 | t.Log(urlStr + " + " + link + " => " + absolute.String()) 77 | } 78 | 79 | } 80 | 81 | func TestCount(t *testing.T) { 82 | fmt.Println(regexLangHtmlPattern.MatchString("zh")) 83 | fmt.Println(regexLangHtmlPattern.MatchString("en")) 84 | fmt.Println(regexLangHtmlPattern.MatchString("zh-cn")) 85 | fmt.Println(regexLangHtmlPattern.MatchString("utf-8")) 86 | 87 | fmt.Println(utf8.RuneCountInString("https://khmers.cn/2022/05/23/%e6%b4%aa%e6%a3%ae%e6%80%bb%e7%90%86%ef%bc%9a%e6%9f%ac%e5%9f%94%e5%af%a8%e7%b4%af%e8%ae%a1%e8%8e%b7%e5%be%97%e8%b6%85%e8%bf%875200%e4%b8%87%e5%89%82%e6%96%b0%e5%86%a0%e7%96%ab%e8%8b%97%ef%bc%8c/")) 88 | } 89 | 90 | func TestGetLinkData(t *testing.T) { 91 | var urlStrs = []string{ 92 | // "https://www.1905.com", 93 | // "https://www.people.com.cn", 94 | // "https://www.36kr.com", 95 | // "https://www.163.com", 96 | // "https://news.163.com/", 97 | // "http://jyj.suqian.gov.cn", 98 | // "https://www.huxiu.com/", 99 | // "http://www.news.cn/politicspro/", 100 | // "http://www.cankaoxiaoxi.com", 101 | // "http://www.bbc.com", 102 | // "https://www.ft.com", 103 | // "https://www.reuters.com/", 104 | // "https://nypost.com/", 105 | // "http://www.mengcheng.gov.cn/", 106 | // "https://www.chunichi.co.jp", 107 | // "https://www.donga.com/", 108 | // "https://people.com/", 109 | // "https://czql.gov.cn/", 110 | // "https://qiye.163.com/", 111 | // "https://www.washingtontimes.com/", 112 | // "https://www.gamersky.com/", 113 | // "https://www.cdns.com.tw/", 114 | // "http://www.163.com/", 115 | 116 | // "https://data.163.com", 117 | // "https://www.sensetime.com/cn/news-index", 118 | // "", 119 | "https://www.sis.gov.eg/section/7413/7410?lang=zh-cn", 120 | } 121 | 122 | for _, urlStr := range urlStrs { 123 | 124 | if linkData, err := GetLinkData(urlStr, false, 10000, 1); err == nil { 125 | fmt.Println("subDomain:", len(linkData.SubDomains)) 126 | fmt.Println("content:", len(linkData.LinkRes.Content)) 127 | fmt.Println("list:", len(linkData.LinkRes.List)) 128 | fmt.Println("unknown:", len(linkData.LinkRes.Unknown)) 129 | fmt.Println("none:", len(linkData.LinkRes.None)) 130 | 131 | i := 0 132 | for a, title := range linkData.Filters { 133 | i = i + 1 134 | fmt.Println(i, "filter:"+a+"\t=>\t"+title) 135 | } 136 | i = 0 137 | for a, title := range linkData.SubDomains { 138 | i = i + 1 139 | fmt.Println(i, "subDomain:"+a+"\t=>\t"+strconv.FormatBool(title)) 140 | } 141 | i = 0 142 | for a, title := range linkData.LinkRes.Content { 143 | i = i + 1 144 | fmt.Println(i, "content:"+a+"\t=>\t"+title) 145 | } 146 | i = 0 147 | for a, title := range linkData.LinkRes.Unknown { 148 | i = i + 1 149 | fmt.Println(i, "unknown:"+a+"\t=>\t"+title) 150 | } 151 | i = 0 152 | for a, title := range linkData.LinkRes.List { 153 | i = i + 1 154 | fmt.Println(i, "list:"+a+"\t=>\t"+title) 155 | } 156 | i = 0 157 | for a, title := range linkData.LinkRes.None { 158 | i = i + 1 159 | fmt.Println(i, "none:"+a+"\t=>\t"+title) 160 | } 161 | } 162 | } 163 | } 164 | 165 
| func TestGetNews(t *testing.T) { 166 | 167 | var urlStrs = []string{ 168 | // "http://www.cankaoxiaoxi.com/finance/20220831/2489264.shtml", 169 | // "https://www.163.com/news/article/HG3DE7AQ000189FH.html", 170 | // "http://suosi.com.cn/", 171 | // "http://www.cankaoxiaoxi.com/world/20220831/2489267.shtml", 172 | // "http://www.cankaoxiaoxi.com/photo/20220901/2489404.shtml", 173 | // "http://column.cankaoxiaoxi.com/2022/0831/2489330.shtml", 174 | // "http://www.gov.cn/xinwen/2022-08/31/content_5707661.htm", 175 | // "http://suosi.com.cn/2019/14.shtml", 176 | // "https://www.wangan.com/p/7fy78317feb66b37", 177 | // "https://www.wangan.com/news/7fy78y38c7207bf0", 178 | // "http://env.people.com.cn/n1/2022/0901/c1010-32516651.html", 179 | "http://com.gd.gov.cn/go/article.php?typeid=40&contentId=23262", 180 | // "http://www.changzhou.gov.cn/ns_news/827166202029392", 181 | // "https://www.163.com/money/article/HG4TRBL1002580S6.html?clickfrom=w_yw_money", 182 | // "https://mp.weixin.qq.com/s?__biz=MzUxODkxNTYxMA==&mid=2247484842&idx=1&sn=d9822ee4662523609aee7441066c2a96&chksm=f980d6dfcef75fc93cb1e7942cb16ec82a7fb7ec3c2d857c307766daff667bd63ab1b4941abd&exportkey=AXWfguuAyJjlOJgCHf10io8%3D&acctmode=0&pass_ticket=8eXqj", 183 | // "https://www.bbc.com/news/world-asia-62744522", 184 | // "https://www.sohu.com/a/581634395_121284943", 185 | // "https://edition.cnn.com/2022/01/30/europe/lithuania-took-on-china-intl-cmd/index.html", 186 | // "https://www.36kr.com/p/1897541916043649", 187 | // "https://www.huxiu.com/article/651531.html", 188 | // "http://www.news.cn/politics/2022-09/02/c_1128969463.htm", 189 | // "https://www.ccdi.gov.cn/yaowenn/202209/t20220901_215343.html", 190 | // "https://new.qq.com/omn/20200701/20200701A04H7500", 191 | // "http://v.china.com.cn/2022-09/06/content_78407150.html", 192 | // "http://www.chinagwy.org.cn/content-cat-10/143162.html", 193 | // "https://news.52pk.com/xwlm/201912/7366710.shtml", 194 | // "https://www.business-standard.com/article/finance/govt-rbi-propose-action-plan-for-facilitating-special-rupee-accounts-122090701260_1.html", 195 | // "https://www.squirepattonboggs.com/en/news/2022/09/squire-patton-boggs-advises-new-wave-group-ab-on-uk-acquisition", 196 | // "https://www.thebulletin.be/number-road-deaths-belgium-rises-sharply", 197 | // "https://www.dailyexpress.com.my/read/4840/ma63-zero-without-equitable-economic-partnership/", 198 | // "https://news.cgtn.com/news/2022-08-20/CGTN-documentary-Remote-Killing-released-1cE7t7RD104/index.html", 199 | // "https://www.sensetime.com/en/news-detail/51164633?categoryId=1072", 200 | } 201 | 202 | for _, urlStr := range urlStrs { 203 | if news, resp, err := GetNews(urlStr, "", 10000, 1); err == nil { 204 | t.Log(resp.Charset) 205 | t.Log(news.Lang) 206 | t.Log(news.Spend) 207 | t.Log(news.Title) 208 | t.Log(news.TitlePos) 209 | t.Log(news.TimeLocal) 210 | t.Log(news.Time) 211 | t.Log(news.TimePos) 212 | t.Log(news.Content) 213 | 214 | if news.ContentNode != nil { 215 | // 内容 html 节点 216 | node := goquery.NewDocumentFromNode(news.ContentNode) 217 | contentHtml, _ := node.Html() 218 | t.Log(fun.NormaliseLine(contentHtml)) 219 | 220 | // 内容 html 节点清理, 仅保留 p img 标签 221 | p := bluemonday.NewPolicy() 222 | p.AllowElements("p") 223 | p.AllowImages() 224 | html := p.Sanitize(contentHtml) 225 | t.Log(fun.NormaliseLine(html)) 226 | } 227 | } 228 | } 229 | } 230 | 231 | func TestGetNewsWithReq(t *testing.T) { 232 | transport := &http.Transport{ 233 | TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, 234 | 
DisableKeepAlives: true, 235 | } 236 | proxyString := "http://username:password@host:port" 237 | proxy, _ := url.Parse(proxyString) 238 | transport.Proxy = http.ProxyURL(proxy) 239 | 240 | req := &HttpReq{ 241 | HttpReq: &fun.HttpReq{ 242 | MaxContentLength: HttpDefaultMaxContentLength, 243 | MaxRedirect: 2, 244 | Transport: transport, 245 | }, 246 | ForceTextContentType: true, 247 | } 248 | 249 | var urlStrs = []string{ 250 | "https://www.bbc.com/news/world-asia-62744522", 251 | } 252 | 253 | for _, urlStr := range urlStrs { 254 | if news, resp, err := GetNewsWithReq(urlStr, "", req, 10000, 1); err == nil { 255 | t.Log(resp.Charset) 256 | t.Log(news.Spend) 257 | t.Log(news.Title) 258 | t.Log(news.TitlePos) 259 | t.Log(news.TimeLocal) 260 | t.Log(news.Time) 261 | t.Log(news.TimePos) 262 | t.Log(news.Content) 263 | 264 | if news.ContentNode != nil { 265 | // 内容 html 节点 266 | node := goquery.NewDocumentFromNode(news.ContentNode) 267 | contentHtml, _ := node.Html() 268 | t.Log(fun.NormaliseLine(contentHtml)) 269 | 270 | // 内容 html 节点清理, 仅保留 p img 标签 271 | p := bluemonday.NewPolicy() 272 | p.AllowElements("p") 273 | p.AllowImages() 274 | html := p.Sanitize(contentHtml) 275 | t.Log(fun.NormaliseLine(html)) 276 | } 277 | } 278 | } 279 | } 280 | 281 | func TestDemo(t *testing.T) { 282 | a := "2022-05-26 17:00:57 UTC" 283 | findString := regexp.MustCompile(extract.RegexPublishDate).FindStringSubmatch(a) 284 | t.Log(findString) 285 | t.Log(fun.Date(fun.StrToTime("2022-04-10T18:24:00"))) 286 | } 287 | -------------------------------------------------------------------------------- /lang.go: -------------------------------------------------------------------------------- 1 | package spider 2 | 3 | import ( 4 | "regexp" 5 | "strings" 6 | "unicode/utf8" 7 | 8 | "github.com/PuerkitoBio/goquery" 9 | "github.com/suosi-inc/go-pkg-spider/extract" 10 | "github.com/suosi-inc/lingua-go" 11 | "github.com/x-funs/go-fun" 12 | ) 13 | 14 | const ( 15 | LangPosCharset = "charset" 16 | LangPosHtmlTag = "html" 17 | LangPosBody = "body" 18 | LangPosLingua = "lingua" 19 | LangPosTitleZh = "title" 20 | BodyChunkSize = 2048 21 | BodyMinSize = 64 22 | 23 | RegexLangHtml = "^(?i)([a-z]{2}|[a-z]{2}\\-[a-z]+)$" 24 | ) 25 | 26 | var ( 27 | CharsetLangMap = map[string]string{ 28 | "GBK": "zh", 29 | "Big5": "zh", 30 | "ISO-2022-CN": "zh", 31 | "SHIFT_JIS": "ja", 32 | "KOI8-R": "ru", 33 | "EUC-JP": "ja", 34 | "EUC-KR": "ko", 35 | "EUC-CN": "zh", 36 | "ISO-2022-JP": "ja", 37 | "ISO-2022-KR": "ko", 38 | } 39 | 40 | LangEnZhMap = map[string]string{ 41 | "zh": "中文", 42 | "en": "英语", 43 | "ja": "日语", 44 | "ru": "俄语", 45 | "ko": "韩语", 46 | "ar": "阿拉伯语", 47 | "hi": "印地语", 48 | "de": "德语", 49 | "fr": "法语", 50 | "es": "西班牙语", 51 | "pt": "葡萄牙语", 52 | "it": "意大利语", 53 | "th": "泰语", 54 | "vi": "越南语", 55 | "my": "缅甸语", 56 | } 57 | 58 | LangZhEnMap = map[string]string{ 59 | "中文": "zh", 60 | "英语": "en", 61 | "日语": "ja", 62 | "俄语": "ru", 63 | "韩语": "ko", 64 | "阿拉伯语": "ar", 65 | "印地语": "hi", 66 | "德语": "de", 67 | "法语": "fr", 68 | "西班牙语": "es", 69 | "葡萄牙语": "pt", 70 | "意大利语": "it", 71 | "泰语": "th", 72 | "越南语": "vi", 73 | "缅甸语": "my", 74 | } 75 | 76 | langMetaSelectors = []string{ 77 | "meta[http-equiv='content-language' i]", 78 | "meta[name='lang' i]", 79 | } 80 | 81 | linguaLanguages = []lingua.Language{ 82 | lingua.Arabic, 83 | lingua.Russian, 84 | lingua.Hindi, 85 | lingua.Korean, 86 | } 87 | 88 | linguaLatinLanguages = []lingua.Language{ 89 | lingua.French, 90 | lingua.German, 91 | lingua.Spanish, 92 | lingua.Portuguese, 93 | lingua.English, 
94 | } 95 | 96 | linguaMap = map[string]string{ 97 | "arabic": "ar", 98 | "russian": "ru", 99 | "hindi": "hi", 100 | "korean": "ko", 101 | "french": "fr", 102 | "german": "de", 103 | "spanish": "es", 104 | "portuguese": "pt", 105 | "english": "en", 106 | } 107 | 108 | regexLangHtmlPattern = regexp.MustCompile(RegexLangHtml) 109 | regexPuncsPattern = regexp.MustCompile(`[\pP\pS]`) 110 | regexEnPattern = regexp.MustCompile(`[a-zA-Z]`) 111 | regexLatinPattern = regexp.MustCompile("[\u0080-\u00ff]") 112 | regexZhPattern = regexp.MustCompile(`\p{Han}`) 113 | regexJaPattern = regexp.MustCompile(`[\p{Hiragana}|\p{Katakana}]`) 114 | regexKoPattern = regexp.MustCompile(`\p{Hangul}`) 115 | ) 116 | 117 | type LangRes struct { 118 | Lang string 119 | LangPos string 120 | } 121 | 122 | // LangText 探测纯文本语种 123 | func LangText(text string) (string, string) { 124 | return langFromText(text) 125 | } 126 | 127 | // Lang 探测 HTML 语种 128 | func Lang(doc *goquery.Document, charset string, listMode bool) LangRes { 129 | var res LangRes 130 | var lang string 131 | 132 | // 如果存在特定语言的 charset 对照表, 则直接返回 133 | if charset != "" { 134 | if _, exist := CharsetLangMap[charset]; exist { 135 | res.Lang = CharsetLangMap[charset] 136 | res.LangPos = LangPosCharset 137 | return res 138 | } 139 | } 140 | 141 | // 优先判断Title是否包含中文, 辅助内容排除日韩 142 | titleLang, pos := LangFromTitle(doc, listMode) 143 | if titleLang != "" { 144 | res.Lang = titleLang 145 | res.LangPos = pos 146 | return res 147 | } 148 | 149 | // 解析 Html 语言属性, 当不为空不为 en 时可信度比较高, 直接返回 150 | lang = LangFromHtml(doc) 151 | if lang != "" && lang != "en" { 152 | res.Lang = lang 153 | res.LangPos = LangPosHtmlTag 154 | return res 155 | } 156 | 157 | // 当 utf 编码时, lang 为空或 en 可信度比较低, 进行基于内容语种的检测 158 | if strings.HasPrefix(charset, "UTF") && (lang == "" || lang == "en") { 159 | bodyLang, pos := LangFromUtf8Body(doc, listMode) 160 | if bodyLang != "" { 161 | res.Lang = bodyLang 162 | res.LangPos = pos 163 | } 164 | } 165 | 166 | return res 167 | } 168 | 169 | func LangFromHtml(doc *goquery.Document) string { 170 | var lang string 171 | 172 | // html lang 173 | if lang, exists := doc.Find("html").Attr("lang"); exists { 174 | lang = strings.TrimSpace(lang) 175 | if regexLangHtmlPattern.MatchString(lang) { 176 | lang = fun.SubString(lang, 0, 2) 177 | return lang 178 | } 179 | } 180 | if lang, exists := doc.Find("html").Attr("xml:lang"); exists { 181 | lang = strings.TrimSpace(lang) 182 | if regexLangHtmlPattern.MatchString(lang) { 183 | lang = fun.SubString(lang, 0, 2) 184 | return lang 185 | } 186 | 187 | } 188 | for _, selector := range langMetaSelectors { 189 | if lang, exists := doc.Find(selector).Attr("content"); exists { 190 | lang = strings.TrimSpace(lang) 191 | if regexLangHtmlPattern.MatchString(lang) { 192 | lang = fun.SubString(lang, 0, 2) 193 | return lang 194 | } 195 | } 196 | } 197 | 198 | return lang 199 | } 200 | func LangFromTitle(doc *goquery.Document, listMode bool) (string, string) { 201 | var lang string 202 | var text string 203 | 204 | // 获取 Title 205 | title := extract.WebTitle(doc, 0) 206 | text = fun.RemoveSign(title) 207 | text = strings.TrimSpace(text) 208 | 209 | if text != "" { 210 | // 首先判断标题是否包含汉字 211 | han := regexZhPattern.FindAllString(text, -1) 212 | if han != nil { 213 | hanCount := len(han) 214 | 215 | // 汉字数量 >=2 216 | if hanCount >= 2 { 217 | 218 | // 需要抽取内容验证包含有日语韩语, 如(日本語_新華網) 219 | bodyText := bodyTextForLang(doc, listMode) 220 | 221 | // 去除所有符号 222 | bodyText = fun.RemoveSign(bodyText) 223 | 224 | // 最大截取 BodyChunkSize 个字符 225 | 
bodyText = fun.SubString(bodyText, 0, BodyChunkSize) 226 | bodyText = strings.TrimSpace(bodyText) 227 | 228 | bodyTextCount := utf8.RuneCountInString(bodyText) 229 | 230 | // 包含一定的日语 231 | ja := regexJaPattern.FindAllString(bodyText, -1) 232 | if ja != nil { 233 | jaCount := len(ja) 234 | jaRate := float64(jaCount) / float64(bodyTextCount) 235 | 236 | // 日语出现比例 237 | if jaRate > 0.2 { 238 | lang = "ja" 239 | return lang, LangPosTitleZh 240 | } 241 | } 242 | 243 | // 包含一定的韩语 244 | ko := regexKoPattern.FindAllString(bodyText, -1) 245 | if ko != nil { 246 | koCount := len(ko) 247 | koRate := float64(koCount) / float64(bodyTextCount) 248 | 249 | // 韩语出现比例 250 | if koRate > 0.2 { 251 | lang = "ko" 252 | return lang, LangPosTitleZh 253 | } 254 | } 255 | 256 | lang = "zh" 257 | return lang, LangPosTitleZh 258 | } 259 | } 260 | } 261 | 262 | return lang, "" 263 | } 264 | 265 | func LangFromUtf8Body(doc *goquery.Document, listMode bool) (string, string) { 266 | var text string 267 | 268 | // 抽取内容 269 | text = bodyTextForLang(doc, listMode) 270 | 271 | return langFromText(text) 272 | } 273 | 274 | func langFromText(text string) (string, string) { 275 | var lang string 276 | 277 | // 去除换行(为了保留语义只替换多余的空格) 278 | text = fun.RemoveLines(text) 279 | text = strings.ReplaceAll(text, fun.TAB, "") 280 | text = strings.ReplaceAll(text, " ", "") 281 | 282 | // 去除符号 283 | text = regexPuncsPattern.ReplaceAllString(text, "") 284 | 285 | // 最大截取 BodyChunkSize 个字符 286 | text = fun.SubString(text, 0, BodyChunkSize) 287 | text = strings.TrimSpace(text) 288 | 289 | // 截取后的字符长度 290 | textCount := utf8.RuneCountInString(text) 291 | 292 | // 内容太少不足以判断语言, 放弃 293 | if textCount < BodyMinSize { 294 | return "", "" 295 | } 296 | 297 | // 首先判断是否包含汉字, 中文和日语 298 | han := regexZhPattern.FindAllString(text, -1) 299 | if han != nil { 300 | hanCount := len(han) 301 | hanRate := float64(hanCount) / float64(textCount) 302 | 303 | // 汉字比例 304 | if hanRate >= 0.3 { 305 | ja := regexJaPattern.FindAllString(text, -1) 306 | if ja != nil { 307 | jaCount := len(ja) 308 | jaRate := float64(jaCount) / float64(hanCount) 309 | 310 | // 日语在汉字中的占比 311 | if jaRate > 0.1 { 312 | lang = "ja" 313 | return lang, LangPosBody 314 | } 315 | } 316 | 317 | lang = "zh" 318 | return lang, LangPosBody 319 | } 320 | } 321 | 322 | // 其次判断拉丁语系, 分析主要的一些语种 323 | english := regexEnPattern.FindAllString(text, -1) 324 | if english != nil { 325 | englishCount := len(english) 326 | englishRate := float64(englishCount) / float64(textCount) 327 | if englishRate > 0.618 { 328 | 329 | // 包含拉丁补充字符集, 使用 lingua 分析主要的非英语拉丁语种 330 | latin := regexLatinPattern.FindAllString(text, -1) 331 | if latin != nil { 332 | latinCount := len(latin) 333 | 334 | if latinCount > 5 { 335 | detector := lingua.NewLanguageDetectorBuilder().FromLanguages(linguaLatinLanguages...).Build() 336 | if language, exists := detector.DetectLanguageOf(text); exists { 337 | key := strings.ToLower(language.String()) 338 | linguaLang := linguaMap[key] 339 | return linguaLang, LangPosLingua 340 | } 341 | } 342 | } 343 | 344 | return "en", LangPosBody 345 | } 346 | } 347 | 348 | // 最后, 使用 lingua 分析其他主要的非拉丁语种 349 | detector := lingua.NewLanguageDetectorBuilder().FromLanguages(linguaLanguages...).Build() 350 | if language, exists := detector.DetectLanguageOf(text); exists { 351 | 352 | key := strings.ToLower(language.String()) 353 | linguaLang := linguaMap[key] 354 | return linguaLang, LangPosLingua 355 | } 356 | 357 | return lang, "" 358 | } 359 | 360 | func bodyTextForLang(doc *goquery.Document, listMode bool) 
string { 361 | var text string 362 | 363 | // 列表页模式 364 | if listMode { 365 | // 优先获取网页中最多 64 个 a 标签, 如果没有 a 标签或过少,放弃 366 | aTag := doc.Find("a") 367 | aTagSize := aTag.Size() 368 | if aTagSize >= 16 { 369 | sliceMax := fun.Min(aTagSize, 64) 370 | text = aTag.Slice(0, sliceMax).Text() 371 | 372 | // 如果 a 标签中包含过多的 {} 可能是动态渲染, 放弃 373 | if strings.Count(text, "{") >= 5 && strings.Count(text, "}") >= 5 { 374 | return "" 375 | } 376 | } 377 | } else { 378 | // 内容页模式, 获取网页中最多 64 个 p 标签 379 | pTag := doc.Find("p") 380 | pTagSize := pTag.Size() 381 | sliceMax := fun.Min(pTagSize, 64) 382 | text = pTag.Slice(0, sliceMax).Text() 383 | 384 | // 如果内容太少, 获取全部 body 文本 385 | textCount := utf8.RuneCountInString(text) 386 | if textCount < BodyMinSize { 387 | text = doc.Find("body").Text() 388 | } 389 | } 390 | 391 | return text 392 | } 393 | -------------------------------------------------------------------------------- /extract/link.go: -------------------------------------------------------------------------------- 1 | package extract 2 | 3 | import ( 4 | "net/url" 5 | "path" 6 | "regexp" 7 | "strings" 8 | "unicode/utf8" 9 | 10 | "github.com/x-funs/go-fun" 11 | ) 12 | 13 | const ( 14 | LinkTypeNone LinkType = 0 15 | LinkTypeContent LinkType = 1 16 | LinkTypeList LinkType = 2 17 | LinkTypeUnknown LinkType = 3 18 | 19 | RegexUrlPublishDate = `(20[2-3]\d{1}[/]?(0[1-9]|1[0-2]|[1-9])[/]?(0[1-9]|[1-2][0-9]|3[0-1]|[1-9])?)` 20 | 21 | RegexIndexSuffix = `^/index\.(html|shtml|htm|php|asp|aspx|jsp)$` 22 | 23 | RegexTitleZhBlack = "(经营|制作|信息服务|出版|出版服务|演出|视听节目|新闻|视听|新网)许可证" 24 | ) 25 | 26 | var ( 27 | zhPuncs = []string{",", "。", ";", ":", "?", "!", "(", ")", "“", "”"} 28 | 29 | wordLangs = []string{"en", "ru", "ar", "de", "fr", "es", "pt"} 30 | 31 | zhEnTitles = []string{"nba", "cba", "5g", "ai", "it", "ipo"} 32 | 33 | regexUrlPublishDatePattern = regexp.MustCompile(RegexUrlPublishDate) 34 | 35 | regexZhPattern = regexp.MustCompile(`\p{Han}`) 36 | 37 | regexEnPattern = regexp.MustCompile(`[a-zA-Z]`) 38 | 39 | regexPuncPattern = regexp.MustCompile(`\pP`) 40 | 41 | regexTitleZhBlackPattern = regexp.MustCompile(RegexTitleZhBlack) 42 | 43 | regexIndexSuffixPattern = regexp.MustCompile(RegexIndexSuffix) 44 | ) 45 | 46 | type LinkType int 47 | 48 | type LinkTypeRule map[string][]string 49 | 50 | type LinkRes struct { 51 | // 内容页 52 | Content map[string]string 53 | // 列表页 54 | List map[string]string 55 | // 未知链接 56 | Unknown map[string]string 57 | // 过滤链接 58 | None map[string]string 59 | } 60 | 61 | // LinkTypes 返回链接分类结果 62 | func LinkTypes(linkTitles map[string]string, lang string, rules LinkTypeRule) (*LinkRes, map[string]bool) { 63 | linkRes := &LinkRes{ 64 | Content: make(map[string]string), 65 | List: make(map[string]string), 66 | Unknown: make(map[string]string), 67 | None: make(map[string]string), 68 | } 69 | 70 | subDomains := make(map[string]bool) 71 | 72 | // 统计数据 73 | var contentPublishCount int 74 | contentTopPaths := make(map[string]int) 75 | 76 | for link, title := range linkTitles { 77 | if linkUrl, err := fun.UrlParse(link); err == nil { 78 | hostname := linkUrl.Hostname() 79 | domainTop := DomainTop(hostname) 80 | if hostname != domainTop { 81 | subDomains[hostname] = true 82 | } 83 | 84 | // 无规则自动模式 85 | if rules == nil { 86 | linkType := LinkIsContentByTitle(linkUrl, title, lang) 87 | switch linkType { 88 | case LinkTypeContent: 89 | linkRes.Content[link] = title 90 | 91 | // 内容页 URL path 时间特征统计 92 | pathDir := path.Dir(strings.TrimSpace(linkUrl.Path)) 93 | pathClean := pathDirClean(pathDir) 94 | if 
regexUrlPublishDatePattern.MatchString(pathClean) { 95 | contentPublishCount++ 96 | } 97 | 98 | // 内容页 URL path 统计 99 | paths := fun.SplitTrim(pathDir, fun.SLASH) 100 | if len(paths) > 0 { 101 | pathIndex := paths[0] 102 | contentTopPaths[pathIndex]++ 103 | } 104 | case LinkTypeList: 105 | linkRes.List[link] = title 106 | case LinkTypeNone: 107 | linkRes.None[link] = title 108 | case LinkTypeUnknown: 109 | linkRes.Unknown[link] = title 110 | } 111 | } else { 112 | // 有规则匹配模式 113 | if LinkIsContentByRegex(linkUrl, rules) { 114 | linkRes.Content[link] = title 115 | } else { 116 | // 无 path 或者默认 path, 应当由 domain 处理 117 | pathDir := strings.TrimSpace(linkUrl.Path) 118 | if pathDir == "" || pathDir == fun.SLASH || regexIndexSuffixPattern.MatchString(pathDir) { 119 | linkRes.None[link] = title 120 | } else { 121 | linkRes.List[link] = title 122 | } 123 | } 124 | } 125 | } 126 | } 127 | 128 | // 基于内容页 URL path 特征统计与分类 129 | if rules == nil { 130 | linkRes = linkTypePathProcess(linkRes, contentTopPaths, contentPublishCount) 131 | } 132 | 133 | // 最后的清洗 134 | linkRes = linkClean(linkRes, lang) 135 | 136 | return linkRes, subDomains 137 | } 138 | 139 | func linkClean(linkRes *LinkRes, lang string) *LinkRes { 140 | if lang == "zh" { 141 | contentCount := len(linkRes.Content) 142 | if contentCount > 0 { 143 | for link, title := range linkRes.Content { 144 | if regexTitleZhBlackPattern.MatchString(title) { 145 | linkRes.None[link] = title 146 | delete(linkRes.Content, link) 147 | } 148 | } 149 | } 150 | } 151 | 152 | return linkRes 153 | } 154 | 155 | func linkTypePathProcess(linkRes *LinkRes, contentTopPaths map[string]int, contentPublishCount int) *LinkRes { 156 | // 统计 157 | contentCount := len(linkRes.Content) 158 | listCount := len(linkRes.List) 159 | unknownCount := len(linkRes.Unknown) 160 | 161 | // 内容页 URL path 发布时间特征比例 162 | publishProb := float32(contentPublishCount) / float32(contentCount) 163 | 164 | // 内容页 URL path 占比较多的特征, 只取 Top 2 165 | topPaths := make([]string, 0) 166 | if contentCount >= 8 { 167 | for topPath, stat := range contentTopPaths { 168 | if stat > 1 { 169 | prob := float32(stat) / float32(contentCount) 170 | if prob > 0.4 { 171 | topPaths = append(topPaths, topPath) 172 | } 173 | } 174 | } 175 | } 176 | 177 | // 内容页 URL path 具有明显的发布时间特征比例, 处理 List、Unknown 178 | if publishProb > 0.7 { 179 | if listCount > 0 { 180 | for link, title := range linkRes.List { 181 | linkUrl, _ := fun.UrlParse(link) 182 | pathDir := path.Dir(strings.TrimSpace(linkUrl.Path)) 183 | pathClean := pathDirClean(pathDir) 184 | if regexUrlPublishDatePattern.MatchString(pathClean) { 185 | // 判断下长度才加入 186 | titleLen := utf8.RuneCountInString(title) 187 | if titleLen >= 2 { 188 | linkRes.Content[link] = title 189 | delete(linkRes.List, link) 190 | } 191 | } 192 | } 193 | } 194 | if unknownCount > 0 { 195 | for link, title := range linkRes.Unknown { 196 | linkUrl, _ := fun.UrlParse(link) 197 | pathDir := path.Dir(strings.TrimSpace(linkUrl.Path)) 198 | pathClean := pathDirClean(pathDir) 199 | if regexUrlPublishDatePattern.MatchString(pathClean) { 200 | // 判断下长度才加入 201 | titleLen := utf8.RuneCountInString(title) 202 | if titleLen >= 2 { 203 | linkRes.Content[link] = title 204 | } else { 205 | linkRes.List[link] = title 206 | } 207 | } else { 208 | linkRes.List[link] = title 209 | } 210 | delete(linkRes.Unknown, link) 211 | } 212 | } 213 | } else if len(topPaths) > 0 && unknownCount > 0 { 214 | // 内容页 URL path 具有前缀特征, 处理 Unknown 215 | for link, title := range linkRes.Unknown { 216 | linkUrl, _ := 
fun.UrlParse(link) 217 | 218 | pathDir := path.Dir(strings.TrimSpace(linkUrl.Path)) 219 | paths := fun.SplitTrim(pathDir, fun.SLASH) 220 | if len(paths) > 0 { 221 | pathIndex := paths[0] 222 | if fun.SliceContains(topPaths, pathIndex) { 223 | // 判断下长度才加入 224 | titleLen := utf8.RuneCountInString(title) 225 | if titleLen >= 2 { 226 | linkRes.Content[link] = title 227 | } else { 228 | linkRes.List[link] = title 229 | } 230 | } else { 231 | linkRes.List[link] = title 232 | } 233 | delete(linkRes.Unknown, link) 234 | } 235 | } 236 | } 237 | 238 | // path 具有特征, 清洗一下内容页中无 path 的 239 | if contentCount > 0 && (publishProb > 0.7 || len(topPaths) > 0) { 240 | for link, title := range linkRes.Content { 241 | linkUrl, _ := fun.UrlParse(link) 242 | pathStr := strings.TrimSpace(linkUrl.Path) 243 | pathDir := path.Dir(pathStr) 244 | paths := fun.SplitTrim(pathDir, fun.SLASH) 245 | if pathStr == "" || pathStr == "/" || len(paths) == 0 { 246 | linkRes.Unknown[link] = title 247 | delete(linkRes.Content, link) 248 | } 249 | } 250 | } 251 | 252 | return linkRes 253 | } 254 | 255 | func LinkIsContentByRegex(linkUrl *url.URL, rules LinkTypeRule) bool { 256 | hostname := linkUrl.Hostname() 257 | domainTop := DomainTop(hostname) 258 | 259 | if _, exist := rules[hostname]; exist { 260 | for _, regex := range rules[hostname] { 261 | if fun.Matches(linkUrl.String(), regex) { 262 | return true 263 | } 264 | } 265 | } else if _, exist := rules[domainTop]; exist { 266 | for _, regex := range rules[domainTop] { 267 | if fun.Matches(linkUrl.String(), regex) { 268 | return true 269 | } 270 | } 271 | } 272 | 273 | return false 274 | } 275 | 276 | func LinkIsContentByTitle(linkUrl *url.URL, title string, lang string) LinkType { 277 | link := linkUrl.String() 278 | 279 | if utf8.RuneCountInString(link) > 255 { 280 | return LinkTypeNone 281 | } 282 | 283 | // 无 path 或者默认 path, 应当由 domain 处理 284 | pathDir := strings.TrimSpace(linkUrl.Path) 285 | if pathDir == "" || pathDir == fun.SLASH || regexIndexSuffixPattern.MatchString(pathDir) { 286 | return LinkTypeNone 287 | } 288 | 289 | if lang == "zh" { 290 | // 中文 291 | zhs := regexZhPattern.FindAllString(title, -1) 292 | hanCount := len(zhs) 293 | 294 | // 必须包含中文才可能是内容页 295 | if hanCount > 0 { 296 | // 内容页标题中文大于 5 297 | if hanCount > 5 { 298 | 299 | // 去掉空格 300 | title = strings.ReplaceAll(title, fun.SPACE, "") 301 | titleLen := utf8.RuneCountInString(title) 302 | 303 | // >= 8 判定为内容页 URL 304 | if titleLen >= 8 { 305 | return LinkTypeContent 306 | } else if titleLen < 8 { 307 | 308 | // 如果是中文, 判断是否包含常用标点 309 | if lang == "zh" { 310 | if fun.ContainsAny(title, zhPuncs...) 
{ 311 | return LinkTypeContent 312 | } 313 | } 314 | return LinkTypeUnknown 315 | } 316 | } else { 317 | return LinkTypeList 318 | } 319 | } else { 320 | // 没有中文, 简单匹配英文字典 321 | if fun.SliceContains(zhEnTitles, strings.ToLower(title)) { 322 | return LinkTypeList 323 | } 324 | 325 | return LinkTypeNone 326 | } 327 | 328 | } else if fun.SliceContains(wordLangs, lang) { 329 | // 英语等单词类的语种 330 | // 去掉所有标点 331 | title = regexPuncPattern.ReplaceAllString(title, "") 332 | 333 | ens := regexEnPattern.FindAllString(title, -1) 334 | enCount := len(ens) 335 | 336 | // 必须包含英文字母 337 | if enCount > 0 { 338 | // 按照空格切分计算长度 339 | words := fun.SplitTrim(title, fun.SPACE) 340 | 341 | // 大于等于5个单词 342 | if len(words) >= 5 { 343 | return LinkTypeContent 344 | } else { 345 | return LinkTypeList 346 | } 347 | } else { 348 | return LinkTypeNone 349 | } 350 | } else { 351 | // 其他语种, 去除标点, 计算长度 352 | title = regexPuncPattern.ReplaceAllString(title, "") 353 | 354 | titleLen := utf8.RuneCountInString(title) 355 | if titleLen >= 8 { 356 | return LinkTypeContent 357 | } else if titleLen < 8 { 358 | // TODO 其他规则 359 | return LinkTypeList 360 | } 361 | } 362 | 363 | return LinkTypeNone 364 | } 365 | 366 | func pathDirClean(pathDir string) string { 367 | pathClean := strings.ReplaceAll(pathDir, fun.DOT, "") 368 | pathClean = strings.ReplaceAll(pathClean, fun.DASH, "") 369 | pathClean = strings.ReplaceAll(pathClean, fun.UNDERSCORE, "") 370 | 371 | return pathClean 372 | } 373 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /extract/content.go: -------------------------------------------------------------------------------- 1 | // Package extract 新闻要素抽取, 在 CEPF 算法基础上做了大量的优化 2 | // Refer to: 基于标签路径特征融合新闻内容抽取的 CEPF 算法 (吴共庆等) http://www.jos.org.cn/jos/article/abstract/4868 3 | package extract 4 | 5 | import ( 6 | "bytes" 7 | "log" 8 | "math" 9 | "path" 10 | "regexp" 11 | "strings" 12 | "unicode/utf8" 13 | 14 | "github.com/PuerkitoBio/goquery" 15 | "github.com/x-funs/go-fun" 16 | "golang.org/x/net/html" 17 | ) 18 | 19 | const ( 20 | ContentRemoveTags = "script,noscript,style,iframe,br,link,svg,textarea" 21 | 22 | // RegexPublishDate 完整的发布时间正则 23 | RegexPublishDate = "(((20[1-3]\\d{1})[-/年.])(0[1-9]|1[0-2]|[1-9])[-/月.](0[1-9]|[1-2][0-9]|3[0-1]|[1-9])[日Tt]?[ ]{0,3}(([0-9]|[0-1][0-9]|2[0-3]|[1-9])[:点时]([0-5][0-9]|[0-9])[:分]?(([0-5][0-9]|[0-9])[秒]?)?((\\.\\d{3})?)(z|Z|[\\+-]\\d{2}[:]?\\d{2})?)?)" 24 | 25 | // RegexPublishShortDate 年份缩写发布时间正则, 如 22-09-02 11:11:11 26 | RegexPublishShortDate = "(((20[1-3]\\d{1}|[1-3]\\d{1})[-/年.])(0[1-9]|1[0-2]|[1-9])[-/月.](0[1-9]|[1-2][0-9]|3[0-1]|[1-9])[日Tt]?[ ]{0,3}(([0-9]|[0-1][0-9]|2[0-3]|[1-9])[:点时]([0-5][0-9]|[0-9])[:分]?(([0-5][0-9]|[0-9])[秒]?)?((\\.\\d{3})?)(z|Z|[\\+-]\\d{2}[:]?\\d{2})?)?)" 27 | 28 | // RegexPublishDateNoYear 不包含年的发布时间(优先级低), 09-02 29 | RegexPublishDateNoYear = "((0[1-9]|1[0-2]|[1-9])[-/月.](0[1-9]|[1-2][0-9]|3[0-1]|[1-9])[日Tt]?[ ]{0,3}(([0-9]|[0-1][0-9]|2[0-3]|[1-9])[:点时]([0-5][0-9]|[0-9])[:分]?(([0-5][0-9]|[0-9])[秒]?)?)?)" 30 | 31 | // RegexEnPublishDate1 英文格式的正则1, 如 02 Sep 2022 11:40:53 pm 32 | RegexEnPublishDate1 = "(?i)((?:(0[1-9]|[1-2][0-9]|3[0-1]|[1-9])(?:st|nd|rd|th)?)[, ]{0,4}(january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|may|jun|jul|aug|sept?|oct|nov|dec)[, ]{0,4}(20[1-3]\\d{1})([, ]{0,4}([0-9]|[0-1][0-9]|2[0-3]|[1-9])[:]([0-5][0-9]|[0-9])([:]([0-5][0-9]|[0-9]))?([, ]{0,4}(am|pm))?)?)" 33 | 34 | // RegexEnPublishDate2 英文格式的正则2, 如 Sep 02 2022 11:40:53 pm 35 | RegexEnPublishDate2 = "(?i)((january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|may|jun|jul|aug|sept?|oct|nov|dec)[, ]{0,4}(?:(0[1-9]|[1-2][0-9]|3[0-1]|[1-9])(?:st|nd|rd|th)?)[, ]{0,4}(20[1-3]\\d{1})([, ]{0,4}([0-9]|[0-1][0-9]|2[0-3]|[1-9])[:]([0-5][0-9]|[0-9])([:]([0-5][0-9]|[0-9]))?([, ]{0,4}(am|pm))?)?)" 36 | 37 | // RegexEnUsPublishDate 英文美式格式的正则3, 如 8/30/2022 11:11:11 38 | RegexEnUsPublishDate = "((0[1-9]|1[0-2]|[1-9])[-/.](0[1-9]|[1-2][0-9]|3[0-1]|[1-9])[-/.](20[1-3]\\d{1}|[1-3]\\d{1})[ ]{0,3}(([0-9]|[0-1][0-9]|2[0-3]|[1-9])[:]([0-5][0-9]|[0-9])[:]?(([0-5][0-9]|[0-9]))?)?)" 39 | 40 | // RegexTime 仅时间正则 41 | RegexTime = "([0-9]|[0-1][0-9]|2[0-3]|[1-9])[:点时]([0-5][0-9]|[0-9])[:分]?(([0-5][0-9]|[0-9])[秒]?)?" 
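// Editor's note (illustrative, not in the original source): RegexTime on its own matches bare time-of-day strings with no date part, e.g. "18:05:09" or "18点05分".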
42 | 43 | // RegexZhPublishPrefix 中文的发布时间前缀 44 | RegexZhPublishPrefix = "(?i)(发布|创建|出版|发表|编辑)?(时间|日期)" 45 | 46 | // RegexZhPublishDate 中文的固定格式, 如 发布时间: xxx 47 | RegexZhPublishDate = RegexZhPublishPrefix + "[\\pP ]{1,8}" + RegexPublishShortDate 48 | 49 | // RegexScriptTitle Script 中的标题 50 | RegexScriptTitle = `(?i)"title"[\t ]{0,4}:[\t ]{0,4}"(.*)"` 51 | 52 | // RegexScriptTime Script 中的发布时间 53 | RegexScriptTime = `(?i)"[\w_\-]*pub.*"[\t ]{0,4}:[\t ]{0,4}"(((20[1-3]\d{1})[-/年.])(0[1-9]|1[0-2]|[1-9])[-/月.](0[1-9]|[1-2][0-9]|3[0-1]|[1-9])[日Tt]?[ ]{0,3}(([0-9]|[0-1][0-9]|2[0-3]|[1-9])[:点时]([0-5][0-9]|[0-9])[:分]?(([0-5][0-9]|[0-9])[秒]?)?((\.\d{3})?)(z|Z|[\+-]\d{2}[:]?\d{2})?))"` 54 | 55 | // RegexWxScriptTime 微信 Script 中的发布时间 56 | RegexWxScriptTime = `(?i)ct[\t ]{0,4}=[\t ]{0,4}"(1[2-9]\d{8})"` 57 | 58 | // RegexContentUrlPublishDate 内容页URL中隐藏的时间, 必须是非常完整标准的时间 20221003 59 | RegexContentUrlPublishDate = `(20[2-3]\d{1}[/]?(0[1-9]|1[0-2])[/]?(0[1-9]|[1-2][0-9]|3[0-1]))` 60 | 61 | // RegexFormatTime3 错误的时间格式, 用于过滤 62 | RegexFormatTime3 = `[:分]\d{3}$` 63 | 64 | // RegexFormatTime4 错误的时间格式, 用于过滤 65 | RegexFormatTime4 = `[:分]\d{4}$` 66 | 67 | // RegexZone 错误的时区格式, 用于过滤 68 | RegexZone = `(([\+-]\d{2})[:]?\d{2})$` 69 | 70 | // TitleSimZh 中文相似度阈值 71 | TitleSimZh = 0.3 72 | 73 | // TitleSimWord 单词相似度阈值 74 | TitleSimWord = 0.5 75 | ) 76 | 77 | var ( 78 | contentMetaTitleSelectors = []string{ 79 | "meta[property='og:title' i]", 80 | "meta[property='twitter:title' i]", 81 | "meta[name='twitter:title' i]", 82 | } 83 | 84 | contentMetaDatetimeDicts = []string{"publish", "pubdate", "pubtime", "release", "dctermsdate"} 85 | 86 | regexPublishDatePattern = regexp.MustCompile(RegexPublishDate) 87 | 88 | regexPublishShortDatePattern = regexp.MustCompile(RegexPublishShortDate) 89 | 90 | regexPublishDateNoYearPattern = regexp.MustCompile(RegexPublishDateNoYear) 91 | 92 | regexZhPublishDatePattern = regexp.MustCompile(RegexZhPublishDate) 93 | 94 | regexEnPublishDatePattern1 = regexp.MustCompile(RegexEnPublishDate1) 95 | 96 | regexEnPublishDatePattern2 = regexp.MustCompile(RegexEnPublishDate2) 97 | 98 | regexEnUsPublishDatePattern = regexp.MustCompile(RegexEnUsPublishDate) 99 | 100 | regexTimePattern = regexp.MustCompile(RegexTime) 101 | 102 | regexScriptTitlePattern = regexp.MustCompile(RegexScriptTitle) 103 | 104 | regexScriptTimePattern = regexp.MustCompile(RegexScriptTime) 105 | 106 | regexWxScriptTimePattern = regexp.MustCompile(RegexWxScriptTime) 107 | 108 | regexContentUrlPublishDatePattern = regexp.MustCompile(RegexContentUrlPublishDate) 109 | 110 | regexFormatTime3 = regexp.MustCompile(RegexFormatTime3) 111 | 112 | regexFormatTime4 = regexp.MustCompile(RegexFormatTime4) 113 | 114 | regexZonePattern = regexp.MustCompile(RegexZone) 115 | ) 116 | 117 | type News struct { 118 | // 标题 119 | Title string 120 | // 标题提取依据 121 | TitlePos string 122 | // 发布时间 123 | TimeLocal string 124 | // 原始时间 125 | Time string 126 | // 发布时间时间提取依据 127 | TimePos string 128 | // 正文纯文本 129 | Content string 130 | // 正文 Node 节点 131 | ContentNode *html.Node 132 | // 提取用时(毫秒) 133 | Spend int64 134 | // 语种 135 | Lang string 136 | } 137 | 138 | type Content struct { 139 | // 原始 Doc 140 | OriginDoc *goquery.Document 141 | // Doc 142 | Doc *goquery.Document 143 | // 原始标题, 来自于上级页面 144 | OriginTitle string 145 | // 原始链接, 来自于上级页面 146 | OriginUrl string 147 | // 语种 148 | Lang string 149 | 150 | infoMap map[*html.Node]countInfo 151 | bodyNode *html.Node 152 | title string 153 | titlePos string 154 | titleSim float64 155 | timePos string 156 | 
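// Editor's note (descriptive, not in the original source): timeEnFormat marks that the publish time was matched by an English-format pattern, so formatTime skips the "T"/"Z" whitespace and timezone clean-up applied to other date strings.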
timeEnFormat bool 157 | } 158 |  159 | type countInfo struct { 160 | // 文本长度, 如 <p> 标签的文本 161 | TextCount int 162 | // 带有链接的文本长度, 如 <a> 标签中的文本 163 | LinkTextCount int 164 | // 标签数量 165 | TagCount int 166 | // 带有链接的标签数量 167 | LinkTagCount int 168 | // 密度 169 | Density float64 170 | // 密度统计 171 | DensitySum float64 172 | // <p> 标签数量 173 | PCount int 174 | // 叶子列表 175 | LeafList []int 176 | } 177 |  178 | func NewContent(docOrg *goquery.Document, lang string, originTitle string, originUrl string) *Content { 179 | originDoc := goquery.CloneDocument(docOrg) 180 | doc := goquery.CloneDocument(docOrg) 181 | doc.Find(ContentRemoveTags).Remove() 182 |  183 | // 标题相似度阈值判定 184 | titleSim := TitleSimZh 185 | if fun.SliceContains(wordLangs, lang) { 186 | titleSim = TitleSimWord 187 | } 188 |  189 | infoMap := make(map[*html.Node]countInfo, 0) 190 |  191 | return &Content{OriginDoc: originDoc, Doc: doc, OriginTitle: originTitle, OriginUrl: originUrl, Lang: lang, infoMap: infoMap, titleSim: titleSim} 192 | } 193 |  194 | func (c *Content) ExtractNews() *News { 195 | news := &News{} 196 |  197 | // 开始时间 198 | begin := fun.Timestamp(true) 199 |  200 | // 提取正文结点和正文 201 | contentNode := c.getContentNode() 202 | if contentNode != nil { 203 | news.ContentNode = contentNode 204 |  205 | content := c.formatContent(contentNode) 206 | news.Content = content 207 | } 208 |  209 | // 提取标题 210 | title := c.getTitle(contentNode) 211 | news.Title = title 212 | news.TitlePos = c.titlePos 213 | c.title = title 214 |  215 | // 提取发布时间 216 | time := c.getTime() 217 | if time != "" { 218 | // 格式化时间 219 | news.Time = time 220 | news.TimePos = c.timePos 221 | time = c.formatTime(time) 222 | ts := fun.StrToTime(time) 223 | if ts > 0 { 224 | news.TimeLocal = fun.Date(ts) 225 | } 226 | } 227 |  228 | news.Spend = fun.Timestamp(true) - begin 229 | news.Lang = c.Lang 230 |  231 | return news 232 | } 233 |  234 | // formatTime 时间格式化清洗(尽可能的) 235 | func (c *Content) formatTime(time string) string { 236 | if !c.timeEnFormat { 237 | // 当包含时区信息时格式化空格 238 | if fun.ContainsAny(time, "T", "t", "Z", "z") { 239 | time = strings.ReplaceAll(time, " ", "") 240 | } 241 | // 当包含时区T时又没有偏移, 按本地时间处理 242 | if fun.Contains(time, "T") && !fun.ContainsCase(time, "z") { 243 | if !regexZonePattern.MatchString(time) { 244 | time = strings.ReplaceAll(time, "T", " ") 245 | } 246 | } 247 | } 248 |  249 | // 错误的尾巴处理 250 | if fun.Contains(time, ":") && !fun.ContainsAny(time, "时", "点") { 251 | time = strings.TrimSuffix(time, "分") 252 | } 253 | return time 254 | } 255 |  256 | // formatContent 正文格式化, 处理 <p> 的换行, 最终将多个换行符和空格均合并为一个 257 | func (c *Content) formatContent(contentNode *html.Node) string { 258 | // 先提取 HTML 259 | node := goquery.NewDocumentFromNode(contentNode) 260 | contentHtml, _ := node.Html() 261 |  262 | // 给 </p> 则增加换行 \n 263 | contentHtml = strings.ReplaceAll(contentHtml, "</p>", "</p>\n") 264 | n, _ := goquery.NewDocumentFromReader(strings.NewReader(contentHtml)) 265 | str := n.Text() 266 |  267 | // 最后合并多余的换行 268 | lines := fun.SplitTrim(str, fun.LF) 269 | if len(lines) > 0 { 270 | for i, line := range lines { 271 | lines[i] = fun.NormaliseSpace(line) 272 | } 273 | str = strings.Join(lines, fun.LF) 274 | } else { 275 | str = fun.NormaliseSpace(str) 276 | } 277 |  278 | return str 279 | } 280 |  281 | func (c *Content) getContentNode() *html.Node { 282 | var maxScore float64 283 | var contentNode *html.Node 284 |  285 | // 取第一个 body 标签 286 | bodyNodes := c.Doc.Find("body").Nodes 287 | if len(bodyNodes) > 0 { 288 | bodyNode := bodyNodes[0] 289 | c.bodyNode = bodyNode 290 |  291 | // 递归遍历计算并统计, 最后找得分最高那个节点 292 | c.computeInfo(c.bodyNode) 293 |  294 | for node := range c.infoMap { 295 | if node.Data == "a" || node == bodyNode { 296 | continue 297 | } 298 |  299 | score := c.computeScore(node) 300 | if score > maxScore { 301 | maxScore = score 302 | contentNode = node 303 | } 304 | } 305 | } 306 |  307 | return contentNode 308 | } 309 |  310 | func (c *Content) getTime() string { 311 | // meta 312 | regexZhPatterns := []*regexp.Regexp{ 313 | regexPublishDatePattern, 314 | } 315 | metaZhTime := c.getTimeByMeta(regexZhPatterns) 316 | if metaZhTime != "" { 317 | c.timePos = "meta" 318 | return metaZhTime 319 | } 320 |  321 | // meta En 322 | if c.Lang != "zh" { 323 | regexEnPatterns := []*regexp.Regexp{ 324 | regexEnPublishDatePattern1, 325 | regexEnPublishDatePattern2, 326 | } 327 | metaEnTime := c.getTimeByMetaEn(regexEnPatterns) 328 | if metaEnTime != "" { 329 | c.timePos = "meta" 330 | c.timeEnFormat = true 331 | return metaEnTime 332 | } 333 | } 334 |  335 | //