├── charset_test.go
├── extract
│   ├── content_test.go
│   ├── link_test.go
│   ├── meta_test.go
│   ├── icp_test.go
│   ├── domain_test.go
│   ├── domain.go
│   ├── web_test.go
│   ├── icp.go
│   ├── meta.go
│   ├── web.go
│   ├── link.go
│   └── content.go
├── .gitignore
├── banner.txt
├── go.mod
├── spider_news_test.go
├── http_test.go
├── http.go
├── lang_test.go
├── charset.go
├── go.sum
├── README.md
├── detect_test.go
├── spider.go
├── spider_news.go
├── detect.go
├── spider_test.go
├── lang.go
└── LICENSE
/charset_test.go:
--------------------------------------------------------------------------------
1 | package spider
2 |
--------------------------------------------------------------------------------
/extract/content_test.go:
--------------------------------------------------------------------------------
1 | package extract
2 |
--------------------------------------------------------------------------------
/extract/link_test.go:
--------------------------------------------------------------------------------
1 | package extract
2 |
3 | import (
4 | "fmt"
5 | "regexp"
6 | "testing"
7 | )
8 |
9 | func TestMatch(t *testing.T) {
10 | m := regexp.MustCompile(`\p{Han}`)
11 | allString := m.FindAllString("123你好,世界asdf", -1)
12 | fmt.Println(allString)
13 | }
14 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Binaries for programs and plugins
2 | *.exe
3 | *.exe~
4 | *.dll
5 | *.so
6 | *.dylib
7 |
8 | # Test binary, built with `go test -c`
9 | *.test
10 |
11 | # Output of the go coverage tool, specifically when used with LiteIDE
12 | *.out
13 |
14 | # Dependency directories (remove the comment below to include it)
15 | # vendor/
16 |
17 | .idea
18 | .vscode
19 | .setting
--------------------------------------------------------------------------------
/extract/meta_test.go:
--------------------------------------------------------------------------------
1 | package extract
2 |
3 | import "testing"
4 |
5 | func TestHostMeta(t *testing.T) {
6 | hosts := []string{
7 | "matichon.co.th",
8 | "wanbao.com.sg",
9 | "wanbao.com.sg",
10 | "waou.com.mo",
11 | "archives.gov.mo",
12 | "mfa.gov.sg",
13 | "nasa.gov",
14 | }
15 |
16 | for _, host := range hosts {
17 | t.Log(MetaFromHost(host, ""))
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/extract/icp_test.go:
--------------------------------------------------------------------------------
1 | package extract
2 |
3 | import "testing"
4 |
5 | func TestIcpFromText(t *testing.T) {
6 | texts := []string{
7 | "粤ICP备17055554号",
8 | "粤ICP备17055554-34号",
9 | "沪ICP备05018492",
10 | "粤B2-20090059",
11 | "京公网安备31010402001073号",
12 | "京公网安备-31010-4020010-73号",
13 | "鲁ICP备05002386鲁公网安备37070502000027号",
14 | }
15 |
16 | for _, text := range texts {
17 | icp, loc := IcpFromText(text)
18 | t.Log(icp, loc)
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/banner.txt:
--------------------------------------------------------------------------------
1 | __ _ __
2 | ____ _____ ____ / /______ _ _________ (_)___/ /__ _____
3 | / __ `/ __ \______/ __ \/ //_/ __ `/_____/ ___/ __ \/ / __ / _ \/ ___/
4 | / /_/ / /_/ /_____/ /_/ / ,< / /_/ /_____(__ ) /_/ / / /_/ / __/ /
5 | \__, /\____/ / .___/_/|_|\__, / /____/ .___/_/\__,_/\___/_/
6 | /____/ /_/ /____/ /_/
7 |
8 |
9 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/suosi-inc/go-pkg-spider
2 |
3 | go 1.18
4 |
5 | require (
6 | github.com/PuerkitoBio/goquery v1.8.1
7 | github.com/microcosm-cc/bluemonday v1.0.26
8 | github.com/suosi-inc/chardet v0.1.0
9 | github.com/suosi-inc/lingua-go v1.0.51
10 | github.com/x-funs/go-fun v0.94.0
11 | golang.org/x/net v0.19.0
12 | )
13 |
14 | require (
15 | github.com/andybalholm/cascadia v1.3.2 // indirect
16 | github.com/aymerick/douceur v0.2.0 // indirect
17 | github.com/gorilla/css v1.0.1 // indirect
18 | github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect
19 | golang.org/x/text v0.17.0 // indirect
20 | )
21 |
--------------------------------------------------------------------------------
/extract/domain_test.go:
--------------------------------------------------------------------------------
1 | package extract
2 |
3 | import (
4 | "fmt"
5 | "testing"
6 | )
7 |
8 | func TestDomainParse(t *testing.T) {
9 | domains := []string{
10 | "www.net.cn",
11 | "hi.chinanews.com",
12 | "a.wh.cn",
13 | "siat.ac.cn",
14 | "abc.spring.io",
15 | "abc.spring.ai",
16 | "www.china-embassy.or.jp",
17 | "whszdj.wh.cn",
18 | "gk.wh.cn",
19 | "xwxc.mwr.cn",
20 | "legismac.safp.gov.mo",
21 | "dezhou.rcsd.cn",
22 | "www.gov.cn",
23 | "scopsr.gov.cn",
24 | "usa.gov",
25 | "bbc.co.uk",
26 | "dealer.auto.sohu.com",
27 | "bbs.sohu.com",
28 | }
29 |
30 | for _, domain := range domains {
31 | t.Log(DomainParse(domain))
32 | }
33 | }
34 |
35 | func TestDomainTop(t *testing.T) {
36 | domains := []string{
37 | "www.net.cn",
38 | "hi.chinanews.com",
39 | "a.wh.cn",
40 | "siat.ac.cn",
41 | "abc.spring.io",
42 | "abc.spring.ai",
43 | "www.china-embassy.or.jp",
44 | "whszdj.wh.cn",
45 | "gk.wh.cn",
46 | "xwxc.mwr.cn",
47 | "legismac.safp.gov.mo",
48 | "dezhou.rcsd.cn",
49 | "www.gov.cn",
50 | "scopsr.gov.cn",
51 | "usa.gov",
52 | "bbc.co.uk",
53 | }
54 |
55 | for _, domain := range domains {
56 | t.Log(DomainTop(domain))
57 | }
58 | }
59 |
60 | func TestDomainTopFromUrl(t *testing.T) {
61 | fmt.Println(DomainTopFromUrl("https://www.google.com"))
62 | fmt.Println(DomainTopFromUrl("https://www.baidu.com/news"))
63 | fmt.Println(DomainTopFromUrl("http://szb.xnnews.com.cn/zhzx/202207/t20220722_2731400.htm"))
64 | }
65 |
--------------------------------------------------------------------------------
/extract/domain.go:
--------------------------------------------------------------------------------
1 | package extract
2 |
3 | import (
4 | "errors"
5 | "strings"
6 |
7 | "github.com/x-funs/go-fun"
8 | "golang.org/x/net/publicsuffix"
9 | )
10 |
11 | type Domain struct {
12 | Subdomain, Domain, TLD string
13 | ICANN bool
14 | }
15 |
16 | // DomainTop returns the registrable top-level domain (eTLD+1)
17 | func DomainTop(d string) string {
18 | if d, err := DomainParse(d); err == nil {
19 | return d.Domain + fun.DOT + d.TLD
20 | }
21 |
22 | return ""
23 | }
24 |
25 | // DomainTopFromUrl parses a URL and returns its registrable top-level domain
26 | func DomainTopFromUrl(urlStr string) string {
27 | if d, err := DomainParseFromUrl(urlStr); err == nil {
28 | return d.Domain + "." + d.TLD
29 | }
30 |
31 | return ""
32 | }
33 |
34 | // DomainParse parses a domain name and returns a Domain
35 | func DomainParse(domain string) (*Domain, error) {
36 | if fun.Blank(domain) {
37 | return nil, errors.New("domain is blank")
38 | }
39 |
40 | // etld+1
41 | etld1, err := publicsuffix.EffectiveTLDPlusOne(domain)
42 | _, icann := publicsuffix.PublicSuffix(strings.ToLower(domain))
43 | if err != nil {
44 | return nil, err
45 | }
46 |
47 | // convert to domain name, and tld
48 | i := strings.Index(etld1, fun.DOT)
49 | domName := etld1[0:i]
50 | tld := etld1[i+1:]
51 |
52 | // and subdomain
53 | sub := ""
54 | if rest := strings.TrimSuffix(domain, "."+etld1); rest != domain {
55 | sub = rest
56 | }
57 | return &Domain{
58 | Subdomain: sub,
59 | Domain: domName,
60 | TLD: tld,
61 | ICANN: icann,
62 | }, nil
63 | }
64 |
65 | // DomainParseFromUrl parses the host from a URL and returns a Domain
66 | func DomainParseFromUrl(urlStr string) (*Domain, error) {
67 | u, err := fun.UrlParse(urlStr)
68 | if err != nil {
69 | return nil, err
70 | }
71 |
72 | d := u.Hostname()
73 |
74 | return DomainParse(d)
75 | }
76 |
--------------------------------------------------------------------------------
/extract/web_test.go:
--------------------------------------------------------------------------------
1 | package extract
2 |
3 | import (
4 | "fmt"
5 | "net/url"
6 | "path"
7 | "testing"
8 | "unicode/utf8"
9 |
10 | "github.com/x-funs/go-fun"
11 | )
12 |
13 | func TestTitleClean(t *testing.T) {
14 | strs := map[string]string{
15 | "“暴徒试图杀死他!”阿拉木图市长在1月5日的暗杀企图中幸存_网易订阅": "zh",
16 | "“暴徒试图杀死他!”阿拉木图市长在1月5日的暗杀企图中幸存 - 网易订阅": "zh",
17 | "北极圈内最高温达到38℃ 北极熊还好吗?南极情况怎么样?_科技频道_中国青年网": "zh",
18 | "About the Project on Nuclear Issues | Center for Strategic and International Studies": "en",
19 | }
20 |
21 | for str, l := range strs {
22 | t.Log(WebTitleClean(str, l))
23 | }
24 | }
25 |
26 | func TestUrlQuery(t *testing.T) {
27 | // urlStr := "https://people.com/tag/stories-to-make-you-smile/a/b/abc.html?a=1&b=2&c=3#ddd"
28 | urlStr := "https://vipmail.163.com/index.html?abc=123"
29 | u, err := url.Parse(urlStr)
30 |
31 | fmt.Println(err)
32 | fmt.Println(u.Path)
33 | fmt.Println(u.RawQuery)
34 | fmt.Println(path.Dir(u.Path))
35 | // fmt.Println(path.Base(u.Path))
36 |
37 | fmt.Println(utf8.RuneCountInString("https://adx.36kr.com/api/ad/click?sign=2eda7665240cec93f902311eb10c195a&param.redirectUrl=aHR0cHM6Ly8zNmtyLmNvbS9wLzE4NTM5NTQ2NzgxMzIzNTI&param.adsdk=Phid2i9VOob6U23ybkDx8q7cr1KbBDM4oiu1d_-C6gY5qf5SKxqBPsptEVMy_wtzqB5Yr08U7ioREUL7HLxIrQ"))
38 | }
39 |
40 | func TestFilterUrl(t *testing.T) {
41 | urlStr := "http://www.163.com/a/b/"
42 | baseUrl, _ := fun.UrlParse(urlStr)
43 |
44 | t.Log(filterUrl("./c/123.html", baseUrl, true))
45 | t.Log(filterUrl("../c/123.html", baseUrl, true))
46 | t.Log(filterUrl("/c/123.html", baseUrl, true))
47 | t.Log(filterUrl("//www.163.com/c/123.html", baseUrl, true))
48 | t.Log(filterUrl("//www.163.com/c/123.pdf?abc=1123", baseUrl, true))
49 | }
50 |
51 | func BenchmarkFilterUrl(b *testing.B) {
52 | urlStr := "http://www.163.com/a/b/"
53 | baseUrl, _ := fun.UrlParse(urlStr)
54 |
55 | b.ResetTimer()
56 |
57 | for i := 0; i < b.N; i++ {
58 | filterUrl("https://www.163.com/news/article/HEAJM4F1000189FH.html", baseUrl, true)
59 |
60 | // url.Parse("https://www.163.com/news/article/HEAJM4F1000189FH.html")
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/spider_news_test.go:
--------------------------------------------------------------------------------
1 | package spider
2 |
3 | import (
4 | "crypto/tls"
5 | "fmt"
6 | "net/http"
7 | "net/url"
8 | "testing"
9 |
10 | "github.com/x-funs/go-fun"
11 | )
12 |
13 | var (
14 | newUrl = "http://www.cankaoxiaoxi.com/"
15 | newUrl_domain = "cankaoxiaoxi.com"
16 | overseaUrl = "https://www.bbc.com/news"
17 | )
18 |
19 | func TestNews_GetLinkRes_Noctx(t *testing.T) {
20 | n := NewNewsSpider(newUrl, 2, processLink, nil, WithRetryTime(1), WithTimeOut(10000))
21 | n.GetLinkRes()
22 | }
23 |
24 | func TestNews_GetLinkRes(t *testing.T) {
25 | ctx := "getLinkRes"
26 | n := NewNewsSpider(newUrl, 2, processLink, ctx, WithRetryTime(1), WithTimeOut(10000))
27 | n.RetryTime = 1
28 | n.Depth = 2
29 | n.GetLinkRes()
30 | }
31 |
32 | func TestNews_GetLinkRes_Clone(t *testing.T) {
33 | ctx := "getLinkRes"
34 | n := NewNewsSpider(newUrl, 2, processLink, ctx)
35 |
36 | nc := n.Clone().(*NewsSpider)
37 | nc.Ctx = "getLinkRes_Clone"
38 | nc.GetLinkRes()
39 | }
40 |
41 | func processLink(data ...any) {
42 | newsData := data[0].(*NewsData)
43 |
44 | if newsData.Error == nil {
45 | fmt.Println(newsData.ListUrl)
46 | fmt.Println(newsData.Depth)
47 | for i := range newsData.LinkRes.List {
48 | fmt.Println(data[1], i)
49 | }
50 | }
51 | }
52 |
53 | func TestNews_GetContentNews(t *testing.T) {
54 | ctx := "getContentNews"
55 | n := NewNewsSpider(newUrl, 1, processContent, ctx)
56 | n.GetContentNews()
57 | }
58 |
59 | func processContent(data ...any) {
60 | dd := data[0].(*NewsContent)
61 | fmt.Println(data[1], dd.Title, dd.Lang)
62 | }
63 |
64 | func TestNews_GetNewsWithProxy(t *testing.T) {
65 | transport := &http.Transport{
66 | TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
67 | DisableKeepAlives: true,
68 | }
69 | proxyString := "http://username:password@host:port"
70 | proxy, _ := url.Parse(proxyString)
71 | transport.Proxy = http.ProxyURL(proxy)
72 |
73 | req := &HttpReq{
74 | HttpReq: &fun.HttpReq{
75 | MaxContentLength: HttpDefaultMaxContentLength,
76 | MaxRedirect: 2,
77 | Transport: transport,
78 | Headers: map[string]string{
79 | "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
80 | },
81 | },
82 | ForceTextContentType: true,
83 | }
84 |
85 | ctx := "getNewsWithProxy"
86 | n := NewNewsSpider(overseaUrl, 1, processContent, ctx, WithReq(req))
87 | n.GetContentNews()
88 | }
89 |
--------------------------------------------------------------------------------
/extract/icp.go:
--------------------------------------------------------------------------------
1 | package extract
2 |
3 | import (
4 | "regexp"
5 | "strings"
6 |
7 | "github.com/PuerkitoBio/goquery"
8 | "github.com/x-funs/go-fun"
9 | )
10 |
11 | var (
12 | ProvinceShortMap = map[string]string{
13 | "京": "北京",
14 | "津": "天津",
15 | "沪": "上海",
16 | "渝": "重庆",
17 | "黑": "黑龙江",
18 | "吉": "吉林",
19 | "辽": "辽宁",
20 | "冀": "河北",
21 | "豫": "河南",
22 | "鲁": "山东",
23 | "晋": "山西",
24 | "陕": "陕西",
25 | "秦": "陕西",
26 | "蒙": "内蒙古",
27 | "宁": "宁夏",
28 | "陇": "甘肃",
29 | "甘": "甘肃",
30 | "新": "新疆",
31 | "青": "青海",
32 | "藏": "西藏",
33 | "鄂": "湖北",
34 | "皖": "安徽",
35 | "苏": "江苏",
36 | "浙": "浙江",
37 | "闽": "福建",
38 | "湘": "湖南",
39 | "赣": "江西",
40 | "川": "四川",
41 | "蜀": "四川",
42 | "黔": "贵州",
43 | "贵": "贵州",
44 | "滇": "云南",
45 | "云": "云南",
46 | "粤": "广东",
47 | "桂": "广西",
48 | "琼": "海南",
49 | "港": "中国香港",
50 | "澳": "中国澳门",
51 | "台": "中国台湾",
52 | }
53 | )
54 |
55 | const (
56 | RegexIcp = `(?i)(京|津|冀|晋|蒙|辽|吉|黑|沪|苏|浙|皖|闽|赣|鲁|豫|鄂|湘|粤|桂|琼|川|蜀|贵|黔|云|滇|渝|藏|陇|甘|陕|秦|青|宁|新)ICP(备|证|备案)?[0-9]+`
57 | RegexIcpGa = `(?i)(京|津|冀|晋|蒙|辽|吉|黑|沪|苏|浙|皖|闽|赣|鲁|豫|鄂|湘|粤|桂|琼|川|蜀|贵|黔|云|滇|渝|藏|陇|甘|陕|秦|青|宁|新)公网安备[0-9]+`
58 | RegexIcpDx = `(?i)(京|津|冀|晋|蒙|辽|吉|黑|沪|苏|浙|皖|闽|赣|鲁|豫|鄂|湘|粤|桂|琼|川|蜀|贵|黔|云|滇|渝|藏|陇|甘|陕|秦|青|宁|新)B2-[0-9]+`
59 | )
60 |
61 | var (
62 | RegexIcpPattern = regexp.MustCompile(RegexIcp)
63 | RegexIcpGaPattern = regexp.MustCompile(RegexIcpGa)
64 | RegexIcpDxPattern = regexp.MustCompile(RegexIcpDx)
65 | )
66 |
67 | // Icp returns the site's ICP filing information from the document body
68 | func Icp(doc *goquery.Document) (string, string) {
69 | text := doc.Find("body").Text()
70 |
71 | text = fun.RemoveLines(text)
72 |
73 | text = strings.ReplaceAll(text, fun.TAB, "")
74 | text = strings.ReplaceAll(text, fun.SPACE, "")
75 |
76 | return IcpFromText(text)
77 |
78 | }
79 |
80 | // IcpFromText extracts ICP filing information from text
81 | func IcpFromText(text string) (string, string) {
82 | var icp, loc string
83 |
84 | // Match ICP filing numbers first
85 | matches := RegexIcpPattern.FindStringSubmatch(text)
86 | if len(matches) > 1 {
87 | icp = matches[0]
88 | loc = matches[1]
89 | }
90 |
91 | // Then match public security (公网安备) filing numbers
92 | if icp == "" {
93 | matches = RegexIcpGaPattern.FindStringSubmatch(text)
94 | if len(matches) > 1 {
95 | icp = matches[0]
96 | loc = matches[1]
97 | }
98 | }
99 |
100 | // Then match telecom value-added business (B2) license numbers
101 | if icp == "" {
102 | matches = RegexIcpDxPattern.FindStringSubmatch(text)
103 | if len(matches) > 1 {
104 | icp = matches[0]
105 | loc = matches[1]
106 | }
107 | }
108 |
109 | return icp, loc
110 | }
111 |
--------------------------------------------------------------------------------
/http_test.go:
--------------------------------------------------------------------------------
1 | package spider
2 |
3 | import (
4 | "bytes"
5 | "testing"
6 |
7 | "github.com/PuerkitoBio/goquery"
8 | "github.com/x-funs/go-fun"
9 | )
10 |
11 | func TestHttpGetCharsetLang(t *testing.T) {
12 | var urlStrs = []string{
13 | // "http://suosi.com.cn",
14 | // "https://www.163.com",
15 | // "https://english.news.cn",
16 | // "https://jp.news.cn",
17 | // "https://kr.news.cn",
18 | // "https://www.donga.com/",
19 | // "http://www.koreatimes.com/",
20 | // "https://arabic.news.cn",
21 | // "https://www.bbc.com",
22 | // "http://government.ru",
23 | "https://french.news.cn",
24 | // "https://www.gouvernement.fr",
25 | // "http://live.siammedia.org/",
26 | // "http://hanoimoi.com.vn",
27 | // "https://www.commerce.gov.mm",
28 | // "https://sanmarg.in/",
29 | // "https://www.rrdmyanmar.gov.mm",
30 | // "http://english.eastday.com/",
31 | // "http://jp.eastday.com/",
32 | // "https://mn.cctv.com/",
33 | }
34 |
35 | for _, urlStr := range urlStrs {
36 |
37 | resp, err := HttpGetResp(urlStr, nil, 30000)
38 |
39 | t.Log(urlStr)
40 | t.Log(err)
41 | t.Log(resp.Success)
42 | t.Log(resp.ContentLength)
43 | t.Log(resp.Headers)
44 | t.Log(resp.Charset)
45 |
46 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
47 | doc.Find(DefaultDocRemoveTags).Remove()
48 |
49 | start := fun.Timestamp(true)
50 | lang := Lang(doc, resp.Charset.Charset, true)
51 | t.Log(lang)
52 |
53 | t.Log(fun.Timestamp(true) - start)
54 | }
55 | }
56 |
57 | func TestHttpGetCharsetLangURL(t *testing.T) {
58 | var urlStrs = []string{
59 | "https://marriott.co.kr",
60 | }
61 |
62 | for _, urlStr := range urlStrs {
63 |
64 | resp, err := HttpGetResp(urlStr, nil, 30000)
65 |
66 | t.Log(urlStr)
67 | t.Log(err)
68 | t.Log(resp.Success)
69 | t.Log(resp.ContentLength)
70 | t.Log(resp.Headers)
71 | t.Log(resp.Charset)
72 |
73 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
74 | doc.Find(DefaultDocRemoveTags).Remove()
75 |
76 | start := fun.Timestamp(true)
77 | lang := Lang(doc, resp.Charset.Charset, true)
78 | t.Log(lang)
79 |
80 | t.Log(fun.Timestamp(true) - start)
81 | }
82 | }
83 |
84 | func TestHttpGet(t *testing.T) {
85 | var urlStr string
86 |
87 | urlStr = "http://www.niuchaoqun.com"
88 | // urlStr = "http://www.qq.com"
89 |
90 | resp, err := HttpGetResp(urlStr, nil, 10000)
91 |
92 | t.Log(urlStr)
93 | t.Log(err)
94 | t.Log(resp.Success)
95 | t.Log(resp.ContentLength)
96 | t.Log(resp.Headers)
97 | t.Log(resp.Charset)
98 |
99 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
100 | doc.Find(DefaultDocRemoveTags).Remove()
101 | lang := Lang(doc, resp.Charset.Charset, true)
102 | t.Log(lang)
103 |
104 | t.Log(fun.String(resp.Body))
105 | }
106 |
107 | func TestHttpGetContentType(t *testing.T) {
108 | var urlStr string
109 |
110 | urlStr = "https://mirrors.163.com/mysql/Downloads/MySQL-8.0/libmysqlclient-dev_8.0.27-1debian10_amd64.deb"
111 |
112 | req := &HttpReq{
113 | ForceTextContentType: true,
114 | }
115 | resp, err := HttpGetResp(urlStr, req, 10000)
116 |
117 | t.Log(urlStr)
118 | t.Log(err)
119 | t.Log(resp.Success)
120 | t.Log(resp.ContentLength)
121 | t.Log(resp.Headers)
122 | t.Log(resp.Charset)
123 |
124 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
125 | doc.Find(DefaultDocRemoveTags).Remove()
126 | lang := Lang(doc, resp.Charset.Charset, true)
127 | t.Log(lang)
128 |
129 | t.Log(fun.String(resp.Body))
130 | }
131 |
132 | func TestHttpGetContentLength(t *testing.T) {
133 | var urlStr string
134 |
135 | urlStr = "http://suosi.com.cn"
136 |
137 | req := &HttpReq{
138 | HttpReq: &fun.HttpReq{
139 | MaxContentLength: 1000,
140 | },
141 | }
142 | resp, err := HttpGetResp(urlStr, req, 10000)
143 |
144 | t.Log(urlStr)
145 | t.Log(err)
146 | t.Log(resp.Success)
147 | t.Log(resp.ContentLength)
148 | t.Log(resp.Headers)
149 | t.Log(resp.Charset)
150 |
151 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
152 | doc.Find(DefaultDocRemoveTags).Remove()
153 | lang := Lang(doc, resp.Charset.Charset, true)
154 | t.Log(lang)
155 |
156 | t.Log(fun.String(resp.Body))
157 | }
158 |
--------------------------------------------------------------------------------
/extract/meta.go:
--------------------------------------------------------------------------------
1 | package extract
2 |
3 | import (
4 | "strings"
5 | )
6 |
7 | var HostGovCountryMap = map[string]string{
8 | "hk": "中国",
9 | "tw": "中国",
10 | "mo": "中国",
11 | "jp": "日本",
12 | "kr": "韩国",
13 | "in": "印度",
14 | "uk": "英国",
15 | "us": "美国",
16 | "it": "意大利",
17 | "es": "西班牙",
18 | "ru": "俄罗斯",
19 | "de": "德国",
20 | "fr": "法国",
21 | "th": "泰国",
22 | "vn": "越南",
23 | "sg": "新加坡",
24 | "au": "澳大利亚",
25 | "ca": "加拿大",
26 | "il": "以色列",
27 | "mm": "缅甸",
28 | "dz": "阿尔及利亚",
29 | "pl": "波兰",
30 | "az": "阿塞拜疆",
31 | "ng": "尼日利亚",
32 | "kp": "朝鲜",
33 | "lb": "黎巴嫩",
34 | "ua": "乌克兰",
35 | "tr": "土耳其",
36 | "se": "瑞典",
37 | "lk": "斯里兰卡",
38 | "si": "斯洛文尼亚",
39 | "sk": "斯洛伐克",
40 | "ro": "罗马尼亚",
41 | "pt": "葡萄牙",
42 | "ph": "菲律宾",
43 | "pk": "巴基斯坦",
44 | "py": "巴拉圭",
45 | "np": "尼泊尔",
46 | "ma": "摩洛哥",
47 | "my": "马来西亚",
48 | "lt": "立陶宛",
49 | "ie": "爱尔兰",
50 | "iq": "伊拉克",
51 | "ir": "伊朗",
52 | "id": "印度尼西亚",
53 | "hu": "匈牙利",
54 | "gr": "希腊",
55 | "eg": "埃及",
56 | "cz": "捷克",
57 | "hr": "克罗地亚",
58 | "co": "哥伦比亚",
59 | "cl": "智利",
60 | "br": "巴西",
61 | "bg": "保加利亚",
62 | "be": "比利时",
63 | "bd": "孟加拉国",
64 | "aw": "阿鲁巴",
65 | "am": "亚美尼亚",
66 | "ai": "安圭拉",
67 | "ao": "安哥拉",
68 | "al": "阿尔巴尼亚",
69 | "af": "阿富汗",
70 | "sa": "沙特阿拉伯",
71 | "nl": "荷兰",
72 | }
73 |
74 | // MetaFromHost returns as much fixed information as possible based on the host name
75 | func MetaFromHost(host string, lang string) (string, string, string) {
76 | var tld string
77 | var country string
78 | var province string
79 | var category string
80 |
81 | host = strings.ToLower(host)
82 |
83 | if domain, err := DomainParse(host); err == nil {
84 | tld = domain.TLD
85 | } else {
86 | return country, province, category
87 | }
88 |
89 | // US government top-level domain
90 | if tld == "gov" {
91 | country = "美国"
92 | category = "政务"
93 | return country, province, category
94 | }
95 |
96 | // Check whether this is a government domain
97 | for c, zh := range HostGovCountryMap {
98 | gov := "gov." + c
99 | if tld == gov {
100 | country = zh
101 | category = "政务"
102 |
103 | if strings.HasSuffix(host, ".hk") && lang == "zh" {
104 | province = "中国香港"
105 | }
106 | if strings.HasSuffix(host, ".tw") && lang == "zh" {
107 | province = "中国台湾"
108 | }
109 | if strings.HasSuffix(host, ".mo") && lang == "zh" {
110 | province = "中国澳门"
111 | }
112 | return country, province, category
113 | }
114 | }
115 |
116 | if strings.HasSuffix(host, ".hk") && lang == "zh" {
117 | country = "中国"
118 | province = "中国香港"
119 | return country, province, category
120 | }
121 |
122 | if strings.HasSuffix(host, ".tw") && lang == "zh" {
123 | country = "中国"
124 | province = "中国台湾"
125 | return country, province, category
126 | }
127 |
128 | if strings.HasSuffix(host, ".mo") && lang == "zh" {
129 | country = "中国"
130 | province = "中国澳门"
131 | return country, province, category
132 | }
133 |
134 | if strings.HasSuffix(host, ".cn") && lang == "zh" {
135 | country = "中国"
136 | return country, province, category
137 | }
138 |
139 | if strings.HasSuffix(host, ".jp") && lang == "ja" {
140 | country = "日本"
141 | return country, province, category
142 | }
143 |
144 | if strings.HasSuffix(host, ".kr") && lang == "ko" {
145 | country = "韩国"
146 | return country, province, category
147 | }
148 |
149 | if strings.HasSuffix(host, ".uk") && lang == "en" {
150 | country = "英国"
151 | return country, province, category
152 | }
153 |
154 | if strings.HasSuffix(host, ".us") && lang == "en" {
155 | country = "美国"
156 | return country, province, category
157 | }
158 |
159 | if strings.HasSuffix(host, ".in") && lang == "hi" {
160 | country = "印度"
161 | return country, province, category
162 | }
163 |
164 | if strings.HasSuffix(host, ".es") && lang == "es" {
165 | country = "西班牙"
166 | return country, province, category
167 | }
168 |
169 | if strings.HasSuffix(host, ".ru") && lang == "ru" {
170 | country = "俄罗斯"
171 | return country, province, category
172 | }
173 |
174 | if strings.HasSuffix(host, ".de") && lang == "de" {
175 | country = "德国"
176 | return country, province, category
177 | }
178 |
179 | if strings.HasSuffix(host, ".fr") && lang == "fr" {
180 | country = "法国"
181 | return country, province, category
182 | }
183 |
184 | return country, province, category
185 | }
186 |
--------------------------------------------------------------------------------
/http.go:
--------------------------------------------------------------------------------
1 | package spider
2 |
3 | import (
4 | "crypto/tls"
5 | "errors"
6 | "net"
7 | "net/http"
8 | "time"
9 |
10 | "github.com/x-funs/go-fun"
11 | )
12 |
13 | const (
14 | HttpDefaultTimeOut = 10000
15 | HttpDefaultMaxContentLength = 10 * 1024 * 1024
16 | HttpDefaultUserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
17 | HttpDefaultAcceptEncoding = "gzip, deflate"
18 | )
19 |
20 | var (
21 | textContentTypes = []string{
22 | "text/plain",
23 | "text/html",
24 | "text/xml",
25 | "application/xml",
26 | "application/xhtml+xml",
27 | "application/json",
28 | }
29 | )
30 |
31 | type HttpReq struct {
32 | // Embedded fun.HttpReq
33 | *fun.HttpReq
34 |
35 | // Disable automatic charset detection and conversion
36 | DisableCharset bool
37 |
38 | // Force the ContentType to be treated as a text type
39 | ForceTextContentType bool
40 | }
41 |
42 | type HttpResp struct {
43 | *fun.HttpResp
44 |
45 | // Charset
46 | Charset CharsetRes
47 | }
48 |
49 | // HttpDefaultTransport is the default http.Transport used globally
50 | var HttpDefaultTransport = &http.Transport{
51 | DialContext: (&net.Dialer{Timeout: time.Second}).DialContext,
52 | DisableKeepAlives: true,
53 | IdleConnTimeout: 60 * time.Second,
54 | TLSHandshakeTimeout: 10 * time.Second,
55 | ExpectContinueTimeout: 1 * time.Second,
56 | TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
57 | }
58 |
59 | // HttpGet performs an HTTP GET request; the arguments are the request URL plus an optional HttpReq and timeout:
60 | // HttpGet(url), HttpGet(url, HttpReq), HttpGet(url, timeout), HttpGet(url, HttpReq, timeout)
61 | // It returns the body and an error
62 | func HttpGet(urlStr string, args ...any) ([]byte, error) {
63 | l := len(args)
64 |
65 | switch l {
66 | case 0:
67 | return HttpGetDo(urlStr, nil, 0)
68 | case 1:
69 | switch v := args[0].(type) {
70 | case int:
71 | timeout := fun.ToInt(args[0])
72 | return HttpGetDo(urlStr, nil, timeout)
73 | case *HttpReq:
74 | return HttpGetDo(urlStr, v, 0)
75 |
76 | }
77 | case 2:
78 | timeout := fun.ToInt(args[1])
79 | switch v := args[0].(type) {
80 | case *HttpReq:
81 | return HttpGetDo(urlStr, v, timeout)
82 | }
83 |
84 | }
85 |
86 | return nil, errors.New("http get params error")
87 | }
88 |
89 | // HttpGetDo performs an HTTP GET request; the arguments are the request URL, an HttpReq, and a timeout in milliseconds
90 | // It returns the body and an error
91 | func HttpGetDo(urlStr string, r *HttpReq, timeout int) ([]byte, error) {
92 | resp, err := HttpGetResp(urlStr, r, timeout)
93 | if err != nil {
94 | return nil, err
95 | } else {
96 | return resp.Body, nil
97 | }
98 | }
99 |
100 | // HttpGetResp performs an HTTP GET request; the arguments are the request URL, an HttpReq, and a timeout in milliseconds
101 | // It returns an HttpResp and an error
102 | func HttpGetResp(urlStr string, r *HttpReq, timeout int) (*HttpResp, error) {
103 | req, err := http.NewRequest(http.MethodGet, urlStr, nil)
104 | if err != nil {
105 | return nil, err
106 | }
107 |
108 | return HttpDoResp(req, r, timeout)
109 | }
110 |
111 | // HttpDo performs an HTTP request; the arguments are an http.Request, an HttpReq, and a timeout in milliseconds
112 | // It returns the body and an error
113 | func HttpDo(req *http.Request, r *HttpReq, timeout int) ([]byte, error) {
114 | resp, err := HttpDoResp(req, r, timeout)
115 | if err != nil {
116 | return nil, err
117 | } else {
118 | return resp.Body, nil
119 | }
120 | }
121 |
122 | // HttpDoResp performs an HTTP request; the arguments are an http.Request, an HttpReq, and a timeout in milliseconds
123 | // It returns an HttpResp and an error
124 | func HttpDoResp(req *http.Request, r *HttpReq, timeout int) (*HttpResp, error) {
125 | // Set up the Transport
126 | if r == nil {
127 | r = &HttpReq{
128 | HttpReq: &fun.HttpReq{
129 | Transport: HttpDefaultTransport,
130 | },
131 | }
132 | } else if r.HttpReq == nil {
133 | r.HttpReq = &fun.HttpReq{
134 | Transport: HttpDefaultTransport,
135 | }
136 | } else if r.Transport == nil {
137 | r.Transport = HttpDefaultTransport
138 | }
139 |
140 | // Force text content types
141 | if r != nil && r.ForceTextContentType {
142 | r.AllowedContentTypes = textContentTypes
143 | }
144 |
145 | // HttpResp
146 | var charset CharsetRes
147 | httpResp := &HttpResp{
148 | Charset: charset,
149 | }
150 |
151 | resp, err := fun.HttpDoResp(req, r.HttpReq, timeout)
152 | httpResp.HttpResp = resp
153 | if err != nil {
154 | return httpResp, err
155 | }
156 |
157 | // By default the charset is detected and the body transcoded automatically, unless explicitly disabled
158 | if r == nil || !r.DisableCharset {
159 | charsetRes := Charset(httpResp.Body, httpResp.Headers)
160 | httpResp.Charset = charsetRes
161 |
162 | if charsetRes.Charset != "" && charsetRes.Charset != "UTF-8" {
163 | utf8Body, e := fun.ToUtf8(httpResp.Body, charsetRes.Charset)
164 | if e != nil {
165 | return httpResp, errors.New("ErrorCharset")
166 | } else {
167 | httpResp.Body = utf8Body
168 | }
169 | }
170 | }
171 |
172 | return httpResp, nil
173 | }
174 |
--------------------------------------------------------------------------------
/lang_test.go:
--------------------------------------------------------------------------------
1 | package spider
2 |
3 | import (
4 | "bytes"
5 | "fmt"
6 | "regexp"
7 | "testing"
8 |
9 | "github.com/PuerkitoBio/goquery"
10 | "github.com/suosi-inc/lingua-go"
11 | "github.com/x-funs/go-fun"
12 | )
13 |
14 | func TestLinguaText(t *testing.T) {
15 | text := "BEIJING, 10 août (Xinhua) -- Un porte-parole du Bureau du Travail du Comité central du Parti communiste chinois pour les affaires de Taiwan a fait mercredi des remarques sur un livre blanc nouvellement publié intitulé \"La question de Taiwan et la réunification de la Chine dans la nouvelle ère\"."
16 |
17 | start := fun.Timestamp(true)
18 | languages := []lingua.Language{
19 | lingua.French,
20 | lingua.Spanish,
21 | lingua.Portuguese,
22 | lingua.German,
23 | }
24 | detector := lingua.NewLanguageDetectorBuilder().
25 | FromLanguages(languages...).
26 | Build()
27 |
28 | if language, exists := detector.DetectLanguageOf(text); exists {
29 | t.Log(text)
30 | t.Log(language.IsoCode639_1())
31 | fmt.Println(fun.Timestamp(true) - start)
32 | }
33 | }
34 |
35 | func BenchmarkLinguaTest(b *testing.B) {
36 |
37 | text := "BEIJING"
38 |
39 | languages := []lingua.Language{
40 | lingua.French,
41 | lingua.Spanish,
42 | lingua.Portuguese,
43 | lingua.German,
44 | lingua.English,
45 | }
46 | detector := lingua.NewLanguageDetectorBuilder().
47 | FromLanguages(languages...).
48 | Build()
49 |
50 | b.ResetTimer()
51 |
52 | for i := 0; i < b.N; i++ {
53 | _, _ = detector.DetectLanguageOf(text)
54 | }
55 | }
56 |
57 | func TestLang(t *testing.T) {
58 |
59 | var urlStrs = []string{
60 |
61 | "https://www.bbc.com",
62 | "https://www.ft.com/",
63 |
64 | "https://www.163.com/news/article/HEJGEVFT000189FH.html",
65 | "https://www.163.com",
66 |
67 | "https://english.news.cn",
68 | "https://jp.news.cn",
69 | "https://kr.news.cn",
70 | "https://german.news.cn/",
71 | "https://portuguese.news.cn/",
72 | "https://arabic.news.cn",
73 | "https://french.news.cn",
74 |
75 | "https://mn.cctv.com/",
76 |
77 | "http://government.ru",
78 |
79 | "https://www.gouvernement.fr",
80 |
81 | "http://live.siammedia.org/",
82 | "https://www.manchestereveningnews.co.uk/",
83 |
84 | "https://www.chinadaily.com.cn",
85 | "http://cn.chinadaily.com.cn/",
86 | "http://www.chinadaily.com.cn/chinawatch_fr/index.html",
87 | "https://d1ev.com/",
88 | "https://www.cngold.com.cn/",
89 | "https://china.guidechem.com/",
90 | "https://xdkb.net/",
91 | "https://www.lifeweek.com.cn/",
92 | "http://gxbsrd.gov.cn/",
93 | "https://defence24.com/",
94 | "http://www.gmp.or.kr/",
95 | "http://rdfmj.com/",
96 | "https://news.xmnn.cn/xmnn/2022/08/09/101067908.shtml",
97 | }
98 |
99 | for _, urlStr := range urlStrs {
100 | resp, _ := HttpGetResp(urlStr, nil, 10000)
101 |
102 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
103 |
104 | doc.Find(DefaultDocRemoveTags).Remove()
105 |
106 | // Language
107 | start := fun.Timestamp(true)
108 | langRes := Lang(doc, resp.Charset.Charset, true)
109 |
110 | t.Log(urlStr)
111 | t.Log(resp.Charset)
112 | t.Log(langRes)
113 | t.Log(fun.Timestamp(true) - start)
114 | }
115 |
116 | }
117 |
118 | func TestLangText(t *testing.T) {
119 | start := fun.Timestamp(true)
120 | text := "中文"
121 | t.Log(fun.Timestamp(true) - start)
122 | t.Log(LangText(text))
123 | }
124 |
125 | func TestUnicode(t *testing.T) {
126 | text := "BEIJING, 9. August 2022 (Xinhuanet) -- In einem am Dienstag veröffentlichten Bericht über die Menschenrechtsverletzungen der USA wird darauf hingewiesen, dass die Vereinigten Staaten einen \"Konflikt der Zivilisationen\" geschaffen, Haft und Folter missbraucht sowie die Religionsfreiheit und Menschenwürde verletzt hätten.\n\nDer Bericht mit dem Titel ''Die USA begehen schwerwiegende Verbrechen der Menschenrechtsverletzungen im Nahen Osten und darüber hinaus'' wurde von der Chinesischen Gesellschaft für Menschenrechtsstudien veröffentlicht.\n\nIn dem Bericht heißt es, dass die Vereinigten Staaten keinen Respekt vor der Diversität der Zivilisationen zeigten, der islamischen Zivilisation feindlich gegenüberständen, das historische und kulturelle Erbe des Nahen Ostens zerstörten, Muslime rücksichtslos inhaftierten und folterten und die grundlegenden Menschenrechte der Bevölkerung im Nahen Osten und in anderen Gebieten schwer verletzten.\n\n\"Die Vereinigten Staaten haben die 'islamische Bedrohungstheorie' in der ganzen Welt verbreitet. Sie haben die Überlegenheit der westlichen und christlichen Zivilisation befürwortet, die nicht-westliche Zivilisation verachtet und die islamische Zivilisation stigmatisiert, indem sie sie als 'rückständig', 'terroristisch' und 'gewalttätig' bezeichneten\", heißt es in dem Bericht."
127 | // latinRex := regexp.MustCompile(`\p{Lo}`)
128 | latinRex := regexp.MustCompile("[\u0080-\u00ff]")
129 | latin := latinRex.FindAllString(text, -1)
130 |
131 | t.Log(latin)
132 | }
133 |
--------------------------------------------------------------------------------
/charset.go:
--------------------------------------------------------------------------------
1 | package spider
2 |
3 | import (
4 | "net/http"
5 | "regexp"
6 | "strings"
7 | "unicode/utf8"
8 |
9 | "github.com/suosi-inc/chardet"
10 | "github.com/x-funs/go-fun"
11 | )
12 |
13 | const (
14 | CharsetPosHeader = "header"
15 | CharsetPosHtml = "html"
16 | CharsetPosGuess = "guess"
17 | CharsetPosValid = "valid"
18 | )
19 |
20 | const (
21 | RegexCharset = "(?i)charset=\\s*([a-z][_\\-0-9a-z]*)"
22 | RegexCharsetHtml4 = "(?i)<meta\\s+([^>]*http-equiv=(\"|')?content-type(\"|')?[^>]*)>"
23 | RegexCharsetHtml5 = "(?i)<meta\\s+charset\\s*=\\s*[\"']?([a-z][_\\-0-9a-z]*)[^>]*>"
24 | )
25 |
26 | var (
27 | regexCharsetPattern = regexp.MustCompile(RegexCharset)
28 | regexCharsetHtml4Pattern = regexp.MustCompile(RegexCharsetHtml4)
29 | regexCharsetHtml5Pattern = regexp.MustCompile(RegexCharsetHtml5)
30 | )
31 |
32 | type CharsetRes struct {
33 | Charset string
34 | CharsetPos string
35 | }
36 |
37 | // Charset detects the charset from the HTTP body and http.Header, and falls back to guessing when detection fails
38 | func Charset(body []byte, headers *http.Header) CharsetRes {
39 | var charsetRes CharsetRes
40 | var guessCharset string
41 |
42 | // First check whether the body is already valid UTF-8
43 | valid := utf8.Valid(body)
44 | if valid {
45 | charsetRes.Charset = "UTF-8"
46 | charsetRes.CharsetPos = CharsetPosValid
47 | return charsetRes
48 | }
49 |
50 | // Detect the charset from the Content-Type header and the HTML meta tags in the body
51 | charsetRes = CharsetFromHeaderHtml(body, headers)
52 |
53 | // Fall back to guessing when no charset was identified
54 | if charsetRes.Charset == "" {
55 | guessCharset = CharsetGuess(body)
56 |
57 | if guessCharset != "" {
58 | charsetRes.Charset = guessCharset
59 | charsetRes.CharsetPos = CharsetPosGuess
60 | }
61 | }
62 |
63 | return charsetRes
64 | }
65 |
66 | // CharsetFromHeaderHtml parses the charset from the HTTP body and http.Header, with high accuracy
67 | func CharsetFromHeaderHtml(body []byte, headers *http.Header) CharsetRes {
68 | var res CharsetRes
69 |
70 | cHeader := CharsetFromHeader(headers)
71 |
72 | cHtml := CharsetFromHtml(body)
73 |
74 | // Only the header has a charset: use the header
75 | if cHeader != "" && cHtml == "" {
76 | res.Charset = cHeader
77 | res.CharsetPos = CharsetPosHeader
78 | return res
79 | }
80 |
81 | // Only the HTML has a charset: use the HTML
82 | if cHeader == "" && cHtml != "" {
83 | res.Charset = cHtml
84 | res.CharsetPos = CharsetPosHtml
85 | return res
86 | }
87 |
88 | // Both the header and the HTML have a charset: pick one depending on the situation
89 | if cHeader != "" && cHtml != "" {
90 | if cHeader == cHtml {
91 | res.Charset = cHeader
92 | res.CharsetPos = CharsetPosHeader
93 | return res
94 | }
95 |
96 | // The header and the HTML disagree; in the following cases the HTML wins
97 | if strings.HasPrefix(cHeader, "ISO") || strings.HasPrefix(cHeader, "WINDOWS") {
98 | res.Charset = cHtml
99 | res.CharsetPos = CharsetPosHtml
100 | return res
101 | }
102 |
103 | res.Charset = cHeader
104 | res.CharsetPos = CharsetPosHeader
105 | return res
106 | }
107 |
108 | return res
109 | }
110 |
111 | // CharsetFromHeader parses the charset from the HTTP header
112 | func CharsetFromHeader(headers *http.Header) string {
113 | var charset string
114 | if headers != nil {
115 | contentType := headers.Get("Content-Type")
116 | if !fun.Blank(contentType) {
117 | matches := regexCharsetPattern.FindStringSubmatch(contentType)
118 | if len(matches) > 1 {
119 | charset = matches[1]
120 | }
121 | }
122 | }
123 |
124 | return convertCharset(charset)
125 | }
126 |
127 | // CharsetFromHtml parses the charset from the HTML
128 | func CharsetFromHtml(body []byte) string {
129 | var charset string
130 |
131 | if len(body) > 0 {
132 | // Check the HTML meta tags first
133 | html := fun.String(body)
134 |
135 | // Match the HTML4-style meta tag
136 | var charset4 string
137 | matches := regexCharsetHtml4Pattern.FindStringSubmatch(html)
138 | if len(matches) > 1 {
139 | matches = regexCharsetPattern.FindStringSubmatch(matches[1])
140 | if len(matches) > 1 {
141 | charset4 = matches[1]
142 | }
143 | }
144 |
145 | // Match the HTML5-style meta tag
146 | var charset5 string
147 | matches = regexCharsetHtml5Pattern.FindStringSubmatch(html)
148 | if len(matches) > 1 {
149 | charset5 = matches[1]
150 | }
151 |
152 | // Only one of the two is present
153 | if charset4 != "" && charset5 == "" {
154 | charset = charset4
155 | }
156 |
157 | if charset4 == "" && charset5 != "" {
158 | charset = charset5
159 | }
160 |
161 | if charset4 != "" && charset5 != "" {
162 | // Both are present: the one that appears first wins
163 | if charset4 == charset5 {
164 | charset = charset5
165 | } else {
166 | charset4Index := strings.Index(html, charset4)
167 | charset5Index := strings.Index(html, charset5)
168 |
169 | if charset4Index < charset5Index {
170 | charset = charset4
171 | } else {
172 | charset = charset5
173 | }
174 | }
175 |
176 | }
177 | }
178 |
179 | return convertCharset(charset)
180 | }
181 |
182 | // CharsetGuess guesses the charset from the HTTP body
183 | func CharsetGuess(body []byte) string {
184 | var guessCharset string
185 |
186 | detector := chardet.NewHtmlDetector()
187 | guess, err := detector.DetectBest(body)
188 | if err == nil {
189 | guessCharset = strings.ToUpper(guess.Charset)
190 | }
191 |
192 | return guessCharset
193 | }
194 |
195 | // convertCharset normalizes the charset name
196 | func convertCharset(charset string) string {
197 | c := strings.ToUpper(strings.TrimSpace(charset))
198 |
199 | if c != "" {
200 | // alias utf8
201 | if c == "UTF8" || c == "UTF_8" {
202 | return "UTF-8"
203 | }
204 |
205 | // alias gb2312, gb18030
206 | if strings.HasPrefix(c, "GB") {
207 | return "GBK"
208 | }
209 |
210 | // alias big5-hkscs..
211 | if strings.HasPrefix(c, "BIG5") {
212 | return "Big5"
213 | }
214 |
215 | // alias shift-jis
216 | if strings.HasPrefix(c, "SHIFT") {
217 | return "SHIFT_JIS"
218 | }
219 | }
220 |
221 | return c
222 | }
223 |
--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAcwmWM=
2 | github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ=
3 | github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
4 | github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
5 | github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
6 | github.com/aymerick/douceur v0.2.0 h1:Mv+mAeH1Q+n9Fr+oyamOlAkUNPWPlA8PPGR0QAaYuPk=
7 | github.com/aymerick/douceur v0.2.0/go.mod h1:wlT5vV2O3h55X9m7iVYN0TBM0NH/MmbLnd30/FjWUq4=
8 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
9 | github.com/gorilla/css v1.0.1 h1:ntNaBIghp6JmvWnxbZKANoLyuXTPZ4cAMlo6RyhlbO8=
10 | github.com/gorilla/css v1.0.1/go.mod h1:BvnYkspnSzMmwRK+b8/xgNPLiIuNZr6vbZBTPQ2A3b0=
11 | github.com/microcosm-cc/bluemonday v1.0.26 h1:xbqSvqzQMeEHCqMi64VAs4d8uy6Mequs3rQ0k/Khz58=
12 | github.com/microcosm-cc/bluemonday v1.0.26/go.mod h1:JyzOCs9gkyQyjs+6h10UEVSe02CGwkhd72Xdqh78TWs=
13 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
14 | github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca h1:NugYot0LIVPxTvN8n+Kvkn6TrbMyxQiuvKdEwFdR9vI=
15 | github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
16 | github.com/stretchr/objx v0.4.0 h1:M2gUjqZET1qApGOWNSnZ49BAIMX4F/1plDv3+l31EJ4=
17 | github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk=
18 | github.com/suosi-inc/chardet v0.1.0 h1:AmAXYaZKPAXCpwthMeQG/ABwYreonxjP/BCbhOa7jfw=
19 | github.com/suosi-inc/chardet v0.1.0/go.mod h1:dhKdJO4yQeuLYMyu1QFjoNITgMJ/zyLhs4zwIUnQTKI=
20 | github.com/suosi-inc/lingua-go v1.0.51 h1:+IhIKGPwLWVTxayQSEnMdTaSCUs2GWS0qVwafGSR0wQ=
21 | github.com/suosi-inc/lingua-go v1.0.51/go.mod h1:XDS0K21fYH99TkkUs71HxmJH03SEhPoc+RPi531aaX0=
22 | github.com/x-funs/go-fun v0.94.0 h1:claEwnVz4ybQYcdHLjm6DeDuVRntavqjOHh5dcHJG2g=
23 | github.com/x-funs/go-fun v0.94.0/go.mod h1:fYbm5aJU4EbzJkUQlodJUphsmjWgJ70iGvZNMakMSw4=
24 | github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
25 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
26 | golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
27 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
28 | golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
29 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
30 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
31 | golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
32 | golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
33 | golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
34 | golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
35 | golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
36 | golang.org/x/net v0.19.0 h1:zTwKpTd2XuCqf8huc7Fo2iSy+4RHPd10s4KzeTnVr1c=
37 | golang.org/x/net v0.19.0/go.mod h1:CfAk/cbD4CthTvqiEl8NpboMuiuOYsAr/7NOjZJtv1U=
38 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
39 | golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
40 | golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
41 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
42 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
43 | golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
44 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
45 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
46 | golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
47 | golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
48 | golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
49 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
50 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
51 | golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
52 | golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY=
53 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
54 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
55 | golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
56 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
57 | golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
58 | golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
59 | golang.org/x/text v0.17.0 h1:XtiM5bkSOt+ewxlOE/aE/AKEHibwj/6gvWMl9Rsh0Qc=
60 | golang.org/x/text v0.17.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY=
61 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
62 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
63 | golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
64 | golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
65 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
66 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
67 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ```
2 | __ _ __
3 | ____ _____ ____ / /______ _ _________ (_)___/ /__ _____
4 | / __ `/ __ \______/ __ \/ //_/ __ `/_____/ ___/ __ \/ / __ / _ \/ ___/
5 | / /_/ / /_/ /_____/ /_/ / ,< / /_/ /_____(__ ) /_/ / / /_/ / __/ /
6 | \__, /\____/ / .___/_/|_|\__, / /____/ .___/_/\__,_/\___/_/
7 | /____/ /_/ /____/ /_/
8 |
9 | ```
10 |
11 | A Golang library for relatively intelligent, rule-maintenance-free data extraction from general news websites. It includes components for domain detection, web page charset and language detection, link classification and extraction, news element extraction, and news body extraction.
12 |
13 | # Preview
14 |
15 | To try it out, download the Windows / macOS GUI client from [go-pkg-spider-gui Releases](https://github.com/suosi-inc/go-pkg-spider-gui/releases).
16 |
17 |
18 |
19 |
20 |
21 | # Usage
22 |
23 | ```shell
24 | go get -u github.com/suosi-inc/go-pkg-spider
25 | ```
26 |
27 | # Introduction
28 |
29 | ## HTTP Client
30 |
31 | The HTTP client extends the `fun.HttpGet` family of functions from go-fun with the following additions (see the sketch below the function list):
32 |
33 | * Automatic charset detection and conversion, normalizing everything to UTF-8
34 | * Restriction of responses to text content types
35 |
36 | - **`HttpGet(urlStr string, args ...any) ([]byte, error)`** HTTP GET request
37 | - **`HttpGetResp(urlStr string, r *HttpReq, timeout int) (*HttpResp, error)`** HTTP GET request, returns an HttpResp
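
A minimal sketch (assuming `urlStr` holds a full URL; timeouts are in milliseconds):

```go
// Plain GET; the body charset is detected and converted to UTF-8 automatically
body, err := spider.HttpGet(urlStr, 10000)
fmt.Println(len(body), err)

// Full response, including the detected charset and where it was found
resp, err := spider.HttpGetResp(urlStr, nil, 10000)
if err == nil {
	fmt.Println(resp.Charset.Charset, resp.Charset.CharsetPos)
}
```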
38 |
39 | ## Automatic Web Page Language Detection
40 |
41 | The following major languages are currently supported: **Chinese, English, Japanese, Korean, Russian, Arabic, Hindi, German, French, Spanish, Portuguese, Italian, Thai, Vietnamese, and Burmese**.
42 |
43 | Language detection first identifies Chinese, English, Japanese, and Korean through HTML attributes, text features, and charset-based statistical rules.
44 |
45 | It is further backed by the [lingua-go](https://github.com/pemistahl/lingua-go) n-gram language detection model, via a fork that drops many languages and corpora (the full package is quite large).
46 |
47 | - **`LangText(text string) (string, string)`** Detect the language of plain text
48 | - **`Lang(doc *goquery.Document, charset string, listMode bool) LangRes`** Detect the language of an HTML document
49 |
50 | ### Example
51 |
52 | Detect the language of plain text:
53 |
54 | ```go
55 | // Detect the language of plain text
56 | lang, langPos := spider.LangText(text)
57 | ```
58 |
59 | Detect the language of an HTML document:
60 |
61 | ```go
62 | // Make the HTTP request and get the response
63 | resp, err := spider.HttpGetResp(urlStr, req, timeout)
64 |
65 | // Convert to a *goquery.Document
66 | doc, docErr := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
67 |
68 | // Detect based on the charset and page type
69 | langRes := spider.Lang(doc, resp.Charset.Charset, false)
70 | ```
71 |
72 | ## Automatic Domain Detection
73 |
74 | - **`DetectDomain(domain string, timeout int, retry int) (*DomainRes, error)`** Probe basic information for a primary domain
75 | - **`DetectSubDomain(domain string, timeout int, retry int) (*DomainRes, error)`** Probe basic information for a subdomain
76 |
77 | Given a site's domain name, these probe as much basic information as possible; a usage sketch follows the struct definition below. The information includes:
78 |
79 | ```go
80 | type DomainRes struct {
81 | // Domain name
82 | Domain string
83 | // Home page domain
84 | HomeDomain string
85 | // Scheme
86 | Scheme string
87 | // Charset
88 | Charset CharsetRes
89 | // Language
90 | Lang LangRes
91 | // Country
92 | Country string
93 | // Province
94 | Province string
95 | // Category
96 | Category string
97 | // Title
98 | Title string
99 | // Description
100 | Description string
101 | // ICP
102 | Icp string
103 | // State
104 | State bool
105 | // Status code
106 | StatusCode int
107 | // Number of content-page links
108 | ContentCount int
109 | // Number of list-page links
110 | ListCount int
111 | // Subdomain set
112 | SubDomains map[string]bool
113 | }
114 | ```
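
A short usage sketch (the domain and parameters below are placeholders; the timeout is in milliseconds, followed by the retry count):

```go
domainRes, err := spider.DetectDomain("example.com", 10000, 1)
if err == nil {
	fmt.Println(domainRes.Scheme, domainRes.Country, domainRes.Title)
	fmt.Println(domainRes.ContentCount, domainRes.ListCount)
}
```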
115 |
116 | ## Web Page Link Classification and Extraction
117 |
118 | Based on the page content, links on a page are automatically analyzed and classified into content pages, list pages, and other links; custom rules can be passed in to influence the final result.
119 |
120 | Classification relies on link titles, URL features, and statistical induction.
121 |
122 | - **`GetLinkData(urlStr string, strictDomain bool, timeout int, retry int) (*LinkData, error)`** Get the classified link data for a page (a usage sketch follows the type definitions below)
123 |
124 | ### Link Classification Result Types
125 |
126 | ```go
127 | type LinkData struct {
128 | LinkRes *extract.LinkRes
129 | // Filter results
130 | Filters map[string]string
131 | // Subdomains
132 | SubDomains map[string]bool
133 | }
134 |
135 | type LinkRes struct {
136 | // Content pages
137 | Content map[string]string
138 | // List pages
139 | List map[string]string
140 | // Unknown links
141 | Unknown map[string]string
142 | // Filtered links
143 | None map[string]string
144 | }
145 | ```
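
A short usage sketch (assuming `strictDomain` restricts the result to links under the site's own domain):

```go
linkData, err := spider.GetLinkData("https://example.com", true, 10000, 1)
if err == nil {
	for link, title := range linkData.LinkRes.Content {
		fmt.Println(link, title)
	}
	fmt.Println(len(linkData.LinkRes.List), len(linkData.SubDomains))
}
```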
146 |
147 | ## Web Page News Extraction
148 |
149 | The three most important elements of a news article are the title, the publish time, and the body text. The publish time demands high precision, while the title and body aim for completeness.
150 |
151 | In our experience, the strongest product in this space is [diffbot](https://www.diffbot.com/); it presumably works from page vision combined with deep learning.
152 |
153 | There are quite a few open-source options for news body extraction, mostly based on rules or statistics, for example:
154 |
155 | * Python: [GeneralNewsExtractor](https://github.com/GeneralNewsExtractor/GeneralNewsExtractor)
156 | * Java: [WebCollector/ContentExtractor](https://github.com/CrawlScript/WebCollector)
157 |
158 | Older projects include [python-goose](https://github.com/grangier/python-goose) and [newspaper](https://github.com/codelucas/newspaper), and even Readability, Html2Article, and the like.
159 |
160 | Among them, `WebCollector/ContentExtractor` is the Java implementation of the [CEPF algorithm for news content extraction based on fused tag-path features](http://www.jos.org.cn/jos/article/abstract/4868).
161 |
162 | go-pkg-spider implements a Golang version of the CEPF algorithm and builds on it with extensive optimizations: built-in general-purpose rules, finer-grained control over title and publish-time extraction and conversion, and support for extracting elements from multilingual news sites.
163 |
164 |
165 | ### News Extraction Result Type
166 |
167 | ```go
168 | type News struct {
169 | // Title
170 | Title string
171 | // Where the title was extracted from
172 | TitlePos string
173 | // Publish time (normalized, local)
174 | TimeLocal string
175 | // Raw time string as found on the page
176 | Time string
177 | // Where the publish time was extracted from
178 | TimePos string
179 | // Body as plain text
180 | Content string
181 | // Body content node
182 | ContentNode *html.Node
183 | // Extraction time spent (milliseconds)
184 | Spend int64
185 | // Language
186 | Lang string
187 | }
188 | ```
189 |
190 | You can work from `ContentNode *html.Node` to redefine which tags are kept or cleaned; a minimal sketch follows.
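
For example, a minimal sketch that re-renders the extracted node to an HTML string with `golang.org/x/net/html` (assuming `news` is the result of `ExtractNews()`):

```go
// Render the extracted content node back to HTML for custom cleaning
var buf bytes.Buffer
if news.ContentNode != nil {
	if err := html.Render(&buf, news.ContentNode); err == nil {
		fmt.Println(buf.String())
	}
}
```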
191 |
192 | ### Results
193 |
194 |
195 |
196 |
197 |
198 | ### Example
199 |
200 | ```go
201 | // Make the HTTP request and get the response
202 | resp, err := spider.HttpGetResp(urlStr, req, timeout)
203 |
204 | // Convert to a *goquery.Document
205 | doc, docErr := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
206 |
207 | // Basic cleanup
208 | doc.Find(spider.DefaultDocRemoveTags).Remove()
209 |
210 | // Language
211 | langRes := spider.Lang(doc, resp.Charset.Charset, false)
212 |
213 | // News extraction
214 | content := extract.NewContent(contentDoc, langRes.Lang, listTitle, urlStr)
215 |
216 | // News extraction result
217 | news := content.ExtractNews()
218 | ```
219 |
220 | The steps above can also be completed with the following pre-packaged helper (a short sketch follows):
221 |
222 | - **`GetNews(urlStr string, title string, timeout int, retry int) (*extract.News, *HttpResp, error)`** Fetch a URL and extract its news data
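
A short sketch (assuming the second argument is an optional title hint taken from the list page; it may be left empty):

```go
news, resp, err := spider.GetNews(urlStr, "", 10000, 1)
if err == nil && resp.Success {
	fmt.Println(news.Title, news.TimeLocal, news.Lang)
	fmt.Println(news.Content)
}
```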
223 |
224 | # Disclaimer
225 |
226 | This project is a data extraction library, not a crawler framework or harvesting software. It is intended for technical exchange only; the code in this repository that requests target websites exists solely for functional testing.
227 |
228 | Please use this project in compliance with laws, regulations, and related rules; using it for any illegal, infringing, or otherwise improper activity is prohibited.
229 |
230 | All direct or indirect risks arising from the use of this project are borne by the user.
231 |
--------------------------------------------------------------------------------
/detect_test.go:
--------------------------------------------------------------------------------
1 | package spider
2 |
3 | import (
4 | "bytes"
5 | "fmt"
6 | "testing"
7 |
8 | "github.com/PuerkitoBio/goquery"
9 | "github.com/suosi-inc/go-pkg-spider/extract"
10 | "github.com/x-funs/go-fun"
11 | )
12 |
13 | func TestDomainDetect(t *testing.T) {
14 | domains := []string{
15 | // "china-nengyuan.com",
16 | // "suosi.com.cn",
17 | // "wanjiaxian.com",
18 | "thediplomat.com",
19 | }
20 |
21 | for _, domain := range domains {
22 | domainRes, err := DetectDomain(domain, 10000, 1)
23 | if err == nil {
24 | t.Log(domainRes.Title)
25 | t.Log(domainRes.TitleClean)
26 | t.Log(domainRes)
27 | } else {
28 | t.Log(err)
29 | t.Log(domainRes)
30 | }
31 | }
32 | }
33 |
34 | func BenchmarkLinkTitles(b *testing.B) {
35 | urlStr := "http://www.qq.com/"
36 |
37 | resp, _ := HttpGetResp(urlStr, nil, 30000)
38 |
39 | // Parse the HTML
40 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
41 | doc.Find(DefaultDocRemoveTags).Remove()
42 |
43 | // Language
44 |
45 | langRes := Lang(doc, resp.Charset.Charset, true)
46 |
47 | fmt.Println(langRes)
48 |
49 | var linkTitles map[string]string
50 |
51 | b.ResetTimer()
52 |
53 | for i := 0; i < b.N; i++ {
54 | // Link titles
55 | linkTitles, _ = extract.WebLinkTitles(doc, resp.RequestURL, true)
56 |
57 | // Links and subdomains
58 | _, _ = extract.LinkTypes(linkTitles, langRes.Lang, nil)
59 |
60 | // rules := map[string][]string{
61 | // "163.com": []string{
62 | // "`\\w{16}\\.html`",
63 | // },
64 | // }
65 | // _, _ = extract.LinkTypes(linkTitles, langRes.Lang, rules)
66 | }
67 |
68 | b.StopTimer()
69 |
70 | fmt.Println(langRes.Lang)
71 | fmt.Println(len(linkTitles))
72 |
73 | }
74 |
75 | func TestLinkTitles(t *testing.T) {
76 | var urlStrs = []string{
77 | "https://www.1905.com",
78 | // "https://www.people.com.cn",
79 | // "https://www.36kr.com",
80 | // "https://www.163.com",
81 | // "https://news.163.com/",
82 | // "http://jyj.suqian.gov.cn",
83 | // "https://www.huxiu.com/",
84 | // "http://www.news.cn/politicspro/",
85 | // "http://www.cankaoxiaoxi.com",
86 | // "http://www.bbc.com",
87 | // "https://www.ft.com",
88 | // "https://www.reuters.com/",
89 | // "https://nypost.com/",
90 | // "http://www.mengcheng.gov.cn/",
91 | // "https://www.chunichi.co.jp",
92 | // "https://www.donga.com/",
93 | // "https://people.com/",
94 | // "https://czql.gov.cn/",
95 | // "https://qiye.163.com/",
96 | // "https://www.washingtontimes.com/",
97 | // "https://www.gamersky.com/",
98 | // "https://www.cdns.com.tw/",
99 | // "http://www.163.com/",
100 | }
101 |
102 | for _, urlStr := range urlStrs {
103 |
104 | resp, err := HttpGetResp(urlStr, nil, 30000)
105 |
106 | t.Log(urlStr)
107 | t.Log(err)
108 |
109 | // Parse the HTML
110 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
111 | doc.Find(DefaultDocRemoveTags).Remove()
112 |
113 | // Language
114 | langRes := Lang(doc, resp.Charset.Charset, true)
115 |
116 | fmt.Println(resp.Charset)
117 | fmt.Println(langRes)
118 |
119 | // Link titles
120 | linkTitles, filters := extract.WebLinkTitles(doc, resp.RequestURL, true)
121 |
122 | // Classified links and subdomain list
123 | linkRes, domainRes := extract.LinkTypes(linkTitles, langRes.Lang, nil)
124 |
125 | // Classified links and subdomain list, with rules
126 | // rules := map[string][]string{
127 | // "cankaoxiaoxi.com": []string{
128 | // "\\d{7}\\.shtml$",
129 | // },
130 | // }
131 | // linkRes, domainRes := extract.LinkTypes(linkTitles, langRes.Lang, rules)
132 |
133 | fmt.Println("all:", len(linkTitles))
134 | fmt.Println("content:", len(linkRes.Content))
135 | fmt.Println("list:", len(linkRes.List))
136 | fmt.Println("unknown:", len(linkRes.Unknown))
137 | fmt.Println("none:", len(linkRes.None))
138 |
139 | i := 0
140 | for a, title := range filters {
141 | i = i + 1
142 | fmt.Println(i, "filter:"+a+"\t=>\t"+title)
143 | }
144 | i = 0
145 | for subdomain := range domainRes {
146 | i = i + 1
147 | fmt.Println(i, "domain:"+subdomain)
148 | }
149 | i = 0
150 | for a, title := range linkRes.Content {
151 | i = i + 1
152 | fmt.Println(i, "content:"+a+"\t=>\t"+title)
153 | }
154 | i = 0
155 | for a, title := range linkRes.Unknown {
156 | i = i + 1
157 | fmt.Println(i, "unknown:"+a+"\t=>\t"+title)
158 | }
159 | i = 0
160 | for a, title := range linkRes.List {
161 | i = i + 1
162 | fmt.Println(i, "list:"+a+"\t=>\t"+title)
163 | }
164 | i = 0
165 | for a, title := range linkRes.None {
166 | i = i + 1
167 | fmt.Println(i, "none:"+a+"\t=>\t"+title)
168 | }
169 |
170 | }
171 | }
172 |
173 | func TestDetectIcp(t *testing.T) {
174 | var urlStrs = []string{
175 | // "http://suosi.com.cn",
176 | "https://www.163.com",
177 | // "https://www.sohu.com",
178 | // "https://www.qq.com",
179 | // "https://www.hexun.com",
180 | // "https://www.wfmc.edu.cn/",
181 | // "https://www.cankaoxiaoxi.com/",
182 | }
183 |
184 | for _, urlStr := range urlStrs {
185 |
186 | resp, err := HttpGetResp(urlStr, nil, 30000)
187 |
188 | t.Log(err)
189 | t.Log(urlStr)
190 |
191 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
192 | doc.Find(DefaultDocRemoveTags).Remove()
193 | icp, loc := extract.Icp(doc)
194 | t.Log(icp, loc)
195 | }
196 | }
197 |
198 | func TestLangFromUtf8Body(t *testing.T) {
199 | var urlStrs = []string{
200 | // "https://www.163.com",
201 | // "https://english.news.cn",
202 | // "https://jp.news.cn",
203 | // "https://kr.news.cn",
204 | // "https://arabic.news.cn",
205 | // "https://www.bbc.com",
206 | // "http://government.ru",
207 | // "https://french.news.cn",
208 | // "https://www.gouvernement.fr",
209 | // "http://live.siammedia.org/",
210 | // "http://hanoimoi.com.vn",
211 | // "https://www.commerce.gov.mm",
212 | // "https://www.rrdmyanmar.gov.mm",
213 | "https://czql.gov.cn/",
214 | }
215 |
216 | for _, urlStr := range urlStrs {
217 | resp, _ := fun.HttpGetResp(urlStr, nil, 30000)
218 |
219 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
220 | doc.Find(DefaultDocRemoveTags).Remove()
221 |
222 | start := fun.Timestamp(true)
223 | lang, pos := LangFromUtf8Body(doc, false)
224 | t.Log(urlStr)
225 | t.Log(lang)
226 | t.Log(pos)
227 | t.Log(fun.Timestamp(true) - start)
228 |
229 | }
230 | }
231 |
232 | func TestDetectFriendDomainDo(t *testing.T) {
233 | var domains = []string{
234 | "northnews.cn",
235 | }
236 |
237 | for _, domain := range domains {
238 | friendDomains, err := DetectFriendDomainDo(domain, 10000)
239 |
240 | t.Log(err)
241 | t.Log(friendDomains)
242 | }
243 | }
244 |
--------------------------------------------------------------------------------
/spider.go:
--------------------------------------------------------------------------------
1 | package spider
2 |
3 | import (
4 | "bytes"
5 | "errors"
6 | "regexp"
7 | "strings"
8 |
9 | "github.com/PuerkitoBio/goquery"
10 | "github.com/suosi-inc/go-pkg-spider/extract"
11 | "github.com/x-funs/go-fun"
12 | )
13 |
14 | const (
15 | RegexHostnameIp = `\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}`
16 |
17 | RegexMetaRefresh = `(?i)url=(.+)`
18 | )
19 |
20 | var (
21 | DefaultDocRemoveTags = "script,noscript,style,iframe,br,link,svg"
22 |
23 | RegexHostnameIpPattern = regexp.MustCompile(RegexHostnameIp)
24 |
25 | regexMetaRefreshPattern = regexp.MustCompile(RegexMetaRefresh)
26 | )
27 |
28 | type LinkData struct {
29 | LinkRes *extract.LinkRes
30 | Filters map[string]string
31 | SubDomains map[string]bool
32 | }
33 |
34 | // GetLinkData 获取页面链接数据
35 | func GetLinkData(urlStr string, strictDomain bool, timeout int, retry int) (*LinkData, error) {
36 | if retry <= 0 {
37 | retry = 1
38 | }
39 |
40 | errs := make([]string, 0)
41 |
42 | for i := 0; i < retry; i++ {
43 | linkData, err := GetLinkDataDo(urlStr, strictDomain, nil, nil, timeout)
44 | if err == nil {
45 | return linkData, err
46 | } else {
47 | errs = append(errs, err.Error())
48 | }
49 | }
50 |
51 | return nil, errors.New("ErrorLinkRes" + fun.ToString(errs))
52 | }
53 |
54 | // GetLinkDataWithReq 使用自定义 HttpReq 获取页面链接数据
55 | func GetLinkDataWithReq(urlStr string, strictDomain bool, req *HttpReq, timeout int, retry int) (*LinkData, error) {
56 | if retry <= 0 {
57 | retry = 1
58 | }
59 |
60 | errs := make([]string, 0)
61 |
62 | for i := 0; i < retry; i++ {
63 | linkData, err := GetLinkDataDo(urlStr, strictDomain, nil, req, timeout)
64 | if err == nil {
65 | return linkData, err
66 | } else {
67 | errs = append(errs, err.Error())
68 | }
69 | }
70 |
71 | return nil, errors.New("ErrorLinkRes" + fun.ToString(errs))
72 | }
73 |
74 | // GetLinkDataWithReqAndRule 使用自定义 HttpReq 和链接分类规则获取页面链接数据
75 | func GetLinkDataWithReqAndRule(urlStr string, strictDomain bool, rules extract.LinkTypeRule, req *HttpReq, timeout int, retry int) (*LinkData, error) {
76 | if retry <= 0 {
77 | retry = 1
78 | }
79 |
80 | errs := make([]string, 0)
81 |
82 | for i := 0; i < retry; i++ {
83 | linkData, err := GetLinkDataDo(urlStr, strictDomain, rules, req, timeout)
84 | if err == nil {
85 | return linkData, err
86 | } else {
87 | errs = append(errs, err.Error())
88 | }
89 | }
90 |
91 | return nil, errors.New("ErrorLinkRes" + fun.ToString(errs))
92 | }
93 |
94 | // GetLinkDataWithRule 使用链接分类规则获取页面链接数据
95 | func GetLinkDataWithRule(urlStr string, strictDomain bool, rules extract.LinkTypeRule, timeout int, retry int) (*LinkData, error) {
96 | if retry <= 0 {
97 | retry = 1
98 | }
99 |
100 | errs := make([]string, 0)
101 |
102 | for i := 0; i < retry; i++ {
103 | linkData, err := GetLinkDataDo(urlStr, strictDomain, rules, nil, timeout)
104 | if err == nil {
105 | return linkData, err
106 | } else {
107 | errs = append(errs, err.Error())
108 | }
109 | }
110 |
111 | return nil, errors.New("ErrorLinkRes" + fun.ToString(errs))
112 | }
113 |
114 | // GetLinkDataDo 获取页面链接数据(单次请求, 不重试)
115 | func GetLinkDataDo(urlStr string, strictDomain bool, rules extract.LinkTypeRule, req *HttpReq, timeout int) (*LinkData, error) {
116 | if timeout == 0 {
117 | timeout = 10000
118 | }
119 |
120 | if req == nil {
121 | req = &HttpReq{
122 | HttpReq: &fun.HttpReq{
123 | MaxContentLength: HttpDefaultMaxContentLength,
124 | MaxRedirect: 3,
125 | },
126 | ForceTextContentType: true,
127 | }
128 | }
129 |
130 | resp, err := HttpGetResp(urlStr, req, timeout)
131 | if resp != nil && err == nil && resp.Success {
132 | // 解析 HTML
133 | doc, docErr := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
134 | if docErr == nil {
135 | linkData := &LinkData{}
136 |
137 | doc.Find(DefaultDocRemoveTags).Remove()
138 |
139 | // 语言
140 | langRes := Lang(doc, resp.Charset.Charset, true)
141 |
142 | // 站内链接
143 | linkTitles, filters := extract.WebLinkTitles(doc, resp.RequestURL, strictDomain)
144 |
145 | // 链接分类
146 | linkRes, subDomains := extract.LinkTypes(linkTitles, langRes.Lang, rules)
147 |
148 | linkData.LinkRes = linkRes
149 | linkData.Filters = filters
150 | linkData.SubDomains = subDomains
151 |
152 | return linkData, nil
153 | } else {
154 | return nil, errors.New("ErrorDocParse")
155 | }
156 | }
157 |
158 | return nil, errors.New("ErrorRequest")
159 | }
160 |
161 | // GetNews 获取链接新闻数据
162 | func GetNews(urlStr string, title string, timeout int, retry int) (*extract.News, *HttpResp, error) {
163 | if retry <= 0 {
164 | retry = 1
165 | }
166 |
167 | errs := make([]string, 0)
168 |
169 | for i := 0; i < retry; i++ {
170 | news, resp, err := GetNewsDo(urlStr, title, nil, timeout)
171 | if err == nil {
172 | return news, resp, nil
173 | } else {
174 | errs = append(errs, err.Error())
175 | }
176 | }
177 |
178 | return nil, nil, errors.New("ErrorRequest" + fun.ToString(errs))
179 | }
180 |
181 | // GetNewsWithReq 使用自定义 HttpReq 获取链接新闻数据
182 | func GetNewsWithReq(urlStr string, title string, req *HttpReq, timeout int, retry int) (*extract.News, *HttpResp, error) {
183 | if retry <= 0 {
184 | retry = 1
185 | }
186 |
187 | errs := make([]string, 0)
188 |
189 | for i := 0; i < retry; i++ {
190 | news, resp, err := GetNewsDo(urlStr, title, req, timeout)
191 | if err == nil {
192 | return news, resp, nil
193 | } else {
194 | errs = append(errs, err.Error())
195 | }
196 | }
197 |
198 | return nil, nil, errors.New("ErrorRequest" + fun.ToString(errs))
199 | }
200 |
201 | // GetNewsDo 获取链接新闻数据(单次请求, 不重试)
202 | func GetNewsDo(urlStr string, title string, req *HttpReq, timeout int) (*extract.News, *HttpResp, error) {
203 | return getNewsDoTop(urlStr, title, req, timeout, true)
204 | }
205 |
206 | // getNewsDoTop 获取链接新闻数据, top 为 true 时允许跟随一次本域名下的 meta refresh 跳转
207 | func getNewsDoTop(urlStr string, title string, req *HttpReq, timeout int, top bool) (*extract.News, *HttpResp, error) {
208 | if timeout == 0 {
209 | timeout = HttpDefaultTimeOut
210 | }
211 |
212 | if req == nil {
213 | req = &HttpReq{
214 | HttpReq: &fun.HttpReq{
215 | MaxContentLength: HttpDefaultMaxContentLength,
216 | MaxRedirect: 2,
217 | },
218 | ForceTextContentType: true,
219 | }
220 | }
221 |
222 | resp, err := HttpGetResp(urlStr, req, timeout)
223 |
224 | if resp != nil && err == nil && resp.Success {
225 | doc, docErr := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
226 | if docErr == nil {
227 | contentDoc := goquery.CloneDocument(doc)
228 | doc.Find(DefaultDocRemoveTags).Remove()
229 |
230 | // 具有 HTML 跳转属性, 如果为本域名下, 则跳转一次
231 | if top {
232 | if refresh, exists := doc.Find("meta[http-equiv='refresh' i]").Attr("content"); exists {
233 | refreshMatch := regexMetaRefreshPattern.FindStringSubmatch(refresh)
234 | if len(refreshMatch) > 1 {
235 | requestHostname := resp.RequestURL.Hostname()
236 | requestTopDomain := extract.DomainTop(requestHostname)
237 | refreshUrl := strings.TrimSpace(refreshMatch[1])
238 | if r, err := fun.UrlParse(refreshUrl); err == nil {
239 | refreshHostname := r.Hostname()
240 | refreshTopDomain := extract.DomainTop(refreshHostname)
241 | if refreshTopDomain != "" && refreshTopDomain == requestTopDomain {
242 | return getNewsDoTop(refreshUrl, title, req, timeout, false)
243 | }
244 | }
245 | }
246 | }
247 | }
248 |
249 | // 语言
250 | langRes := Lang(doc, resp.Charset.Charset, false)
251 |
252 | // 正文抽取
253 | content := extract.NewContent(contentDoc, langRes.Lang, title, urlStr)
254 | news := content.ExtractNews()
255 |
256 | return news, resp, nil
257 | } else {
258 | return nil, resp, errors.New("ErrorDocParse")
259 | }
260 | }
261 |
262 | return nil, nil, errors.New("ErrorRequest")
263 | }
264 |
--------------------------------------------------------------------------------
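A minimal usage sketch for the entry points in spider.go above (GetLinkData classifies a page's links, GetNews fetches and extracts one article). It assumes the module root imports as github.com/suosi-inc/go-pkg-spider, inferred from the extract import path; the URLs are placeholders:

package main

import (
	"fmt"

	spider "github.com/suosi-inc/go-pkg-spider"
)

func main() {
	// Classify the links found on a home page: 10s timeout, 1 attempt.
	if linkData, err := spider.GetLinkData("https://www.163.com", true, 10000, 1); err == nil {
		fmt.Println("content:", len(linkData.LinkRes.Content))
		fmt.Println("list:", len(linkData.LinkRes.List))
		fmt.Println("subdomains:", len(linkData.SubDomains))
	}

	// Fetch one content page and extract the news fields.
	if news, resp, err := spider.GetNews("https://www.163.com/news/article/HG3DE7AQ000189FH.html", "", 10000, 1); err == nil {
		fmt.Println(resp.Charset, news.Lang)
		fmt.Println(news.Title, news.TimeLocal)
		fmt.Println(news.Content)
	}
}
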
/spider_news.go:
--------------------------------------------------------------------------------
1 | package spider
2 |
3 | import (
4 | "strings"
5 | "sync"
6 | "time"
7 |
8 | "github.com/x-funs/go-fun"
9 | )
10 |
11 | // NewsSpider 新闻采集器结构体
12 | type NewsSpider struct {
13 | Url string // 根链接
14 | Depth uint8 // 采集页面深度
15 | seen map[string]bool // 是否已采集
16 | IsSub bool // 是否采集子域名
17 | linkChan chan *NewsData // NewsData 通道共享
18 | contentChan chan *NewsContent // NewsContent 通道共享
19 | ProcessFunc func(...any) // 处理函数
20 | RetryTime int // 请求重试次数
21 | 	TimeOut     int               // 请求超时时间(毫秒)
22 | wg *sync.WaitGroup // 同步等待组
23 | Req *HttpReq // 请求体
24 | Ctx any // 任务详情上下文,传入ProcessFunc函数中
25 | }
26 |
27 | // NewsContent 新闻内容结构体
28 | type NewsContent struct {
29 | Url string // 链接
30 | Title string // 标题
31 | Time string // 发布时间
32 | Content string // 正文纯文本
33 | Lang string // 语种
34 | }
35 |
36 | // NewsData 新闻 LinkData 总数据
37 | type NewsData struct {
38 | *LinkData
39 | Depth uint8 // 采集深度溯源
40 | ListUrl string // 列表页溯源
41 | Error error
42 | }
43 |
44 | // 自定义配置函数
45 | type Option func(*NewsSpider)
46 |
47 | // 原型链接口
48 | type Prototype interface {
49 | Clone() Prototype
50 | }
51 |
52 | // NewNewsSpider 初始化
53 | func NewNewsSpider(url string, depth uint8, pf func(...any), ctx any, options ...Option) *NewsSpider {
54 | n := &NewsSpider{
55 | Url: url,
56 | Depth: depth,
57 | seen: map[string]bool{},
58 | IsSub: false,
59 | linkChan: make(chan *NewsData),
60 | contentChan: make(chan *NewsContent),
61 | ProcessFunc: pf,
62 | RetryTime: 2,
63 | TimeOut: 20000,
64 | wg: &sync.WaitGroup{},
65 | Req: nil,
66 | Ctx: ctx,
67 | }
68 |
69 | // 函数式选项模式
70 | for _, option := range options {
71 | option(n)
72 | }
73 |
74 | return n
75 | }
76 |
77 | func WithRetryTime(retryTime int) Option {
78 | return func(n *NewsSpider) {
79 | n.RetryTime = retryTime
80 | }
81 | }
82 |
83 | func WithTimeOut(timeout int) Option {
84 | return func(n *NewsSpider) {
85 | n.TimeOut = timeout
86 | }
87 | }
88 |
89 | func WithReq(req *HttpReq) Option {
90 | return func(n *NewsSpider) {
91 | n.Req = req
92 | }
93 | }
94 |
95 | func WithIsSub(isSub bool) Option {
96 | return func(n *NewsSpider) {
97 | n.IsSub = isSub
98 | }
99 | }
100 |
101 | // Clone 原型链结构体拷贝
102 | func (n *NewsSpider) Clone() Prototype {
103 | nc := *n
104 |
105 | // 拷贝时需重置chan和wg等字段
106 | nc.seen = map[string]bool{}
107 | nc.linkChan = make(chan *NewsData)
108 | nc.contentChan = make(chan *NewsContent)
109 | nc.wg = &sync.WaitGroup{}
110 |
111 | return &nc
112 | }
113 |
114 | // GetNews 开始采集
115 | func (n *NewsSpider) GetNews(linksHandleFunc func(*NewsData)) {
116 | // 初始化列表页和内容页切片
117 | var (
118 | listSlice []string
119 | listSliceTemp []string
120 | subDomainSlice []string
121 | )
122 |
123 | // 获取首页url和协议
124 | scheme, indexUrl := GetIndexUrl(n.Url)
125 |
126 | // 首次添加当前页
127 | listSliceTemp = append(listSliceTemp, n.Url)
128 |
129 | if n.IsSub {
130 | // 先探测出首页url的所有子域名
131 | subDomains, _ := GetSubdomains(indexUrl, n.Req, n.TimeOut, n.RetryTime*100)
132 |
133 | for subDomain := range subDomains {
134 | subDomainSlice = append(subDomainSlice, subDomain)
135 | listSliceTemp = append(listSliceTemp, subDomain)
136 | }
137 | }
138 |
139 | 	// 按采集深度逐层循环遍历, 获取页面列表页和内容页
140 | for i := 0; i < int(n.Depth); i++ {
141 | listS, _ := n.GetNewsLinkRes(linksHandleFunc, scheme, listSliceTemp, uint8(i+1), n.TimeOut, n.RetryTime)
142 | listSlice = append(listSlice, listS...)
143 |
144 | // 重置循环列表页
145 | if len(listS) == 0 {
146 | break
147 | }
148 | listSliceTemp = listS
149 | }
150 | }
151 |
152 | // GetNewsLinkRes 获取 news 页面链接分组, 将 LinkData 交给回调处理, 仅返回新发现的列表页链接
153 | func (n *NewsSpider) GetNewsLinkRes(linksHandleFunc func(*NewsData), scheme string, urls []string, depth uint8, timeout int, retry int) ([]string, error) {
154 | listSlice := []string{}
155 |
156 | for _, url := range urls {
157 | if !strings.Contains(url, "http") {
158 | url = scheme + url
159 | }
160 |
161 | if linkData, err := GetLinkDataWithReq(url, true, n.Req, timeout, retry); err == nil {
162 | for l := range linkData.LinkRes.List {
163 | if !n.seen[l] {
164 | n.seen[l] = true
165 | listSlice = append(listSlice, l)
166 | }
167 | }
168 |
169 | newsData := &NewsData{linkData, depth, url, nil}
170 |
171 | n.wg.Add(1)
172 | go linksHandleFunc(newsData)
173 |
174 | } else {
175 | 			// 出错时空的 LinkData 也需要 push
176 | newsData := &NewsData{nil, depth, url, err}
177 |
178 | n.wg.Add(1)
179 | go linksHandleFunc(newsData)
180 |
181 | // return nil, errors.New("GetNewsLinkRes Err")
182 | }
183 | }
184 |
185 | return listSlice, nil
186 | }
187 |
188 | // CrawlLinkRes 直接推送列表页内容页
189 | func (n *NewsSpider) CrawlLinkRes(l *NewsData) {
190 | defer n.wg.Done()
191 | // defer n.sleep()
192 |
193 | n.PushLinks(l)
194 | }
195 |
196 | // CrawlContentNews 解析内容页详情数据
197 | func (n *NewsSpider) CrawlContentNews(l *NewsData) {
198 | defer n.wg.Done()
199 | // defer n.sleep()
200 |
201 | if l.Error == nil {
202 | for c, v := range l.LinkRes.Content {
203 | if !n.seen[c] {
204 | n.seen[c] = true
205 | cc := map[string]string{}
206 | cc[c] = v
207 |
208 | n.wg.Add(1)
209 | go n.ReqContentNews(cc)
210 | }
211 | }
212 | }
213 | }
214 |
215 | // ReqContentNews 获取内容页详情数据
216 | func (n *NewsSpider) ReqContentNews(content map[string]string) {
217 | defer n.wg.Done()
218 |
219 | time.Sleep(time.Duration(fun.RandomInt(10, 100)) * time.Millisecond)
220 |
221 | for url, title := range content {
222 | if news, _, err := GetNews(url, title, n.TimeOut, n.RetryTime); err == nil {
223 | newsData := &NewsContent{}
224 | newsData.Url = url
225 | newsData.Title = news.Title
226 | newsData.Content = news.Content
227 | newsData.Time = news.TimeLocal
228 | newsData.Lang = news.Lang
229 |
230 | n.PushContentNews(newsData)
231 | }
232 | }
233 | }
234 |
235 | // PushLinks 推送links数据
236 | func (n *NewsSpider) PushLinks(data *NewsData) {
237 | n.linkChan <- data
238 | }
239 |
240 | // PushContentNews 推送详情页数据
241 | func (n *NewsSpider) PushContentNews(data *NewsContent) {
242 | n.contentChan <- data
243 | }
244 |
245 | // Wait wg阻塞等待退出
246 | func (n *NewsSpider) Wait() {
247 | n.wg.Wait()
248 | }
249 |
250 | // Close 关闭Chan
251 | func (n *NewsSpider) Close() {
252 | close(n.linkChan)
253 | close(n.contentChan)
254 | }
255 |
256 | // process 处理chan data函数
257 | func (n *NewsSpider) process(processFunc func(...any)) {
258 | for {
259 | select {
260 | case data, ok := <-n.linkChan:
261 | if !ok {
262 | return
263 | }
264 | processFunc(data, n.Ctx)
265 | case data, ok := <-n.contentChan:
266 | if !ok {
267 | return
268 | }
269 | processFunc(data, n.Ctx)
270 | }
271 | }
272 | }
273 |
274 | // GetLinkRes 回调获取LinkRes数据
275 | func (n *NewsSpider) GetLinkRes() {
276 | n.GetNews(n.CrawlLinkRes)
277 |
278 | go n.process(n.ProcessFunc)
279 |
280 | n.Wait()
281 | defer n.Close()
282 | }
283 |
284 | // GetContentNews 回调获取内容页数据
285 | func (n *NewsSpider) GetContentNews() {
286 | n.GetNews(n.CrawlContentNews)
287 |
288 | go n.process(n.ProcessFunc)
289 |
290 | n.Wait()
291 | defer n.Close()
292 | }
293 |
294 | // GetSubdomains 获取subDomain
295 | func GetSubdomains(url string, req *HttpReq, timeout int, retry int) (map[string]bool, error) {
296 | if linkData, err := GetLinkDataWithReq(url, true, req, timeout, retry); err == nil {
297 | return linkData.SubDomains, nil
298 | } else {
299 | return nil, err
300 | }
301 | }
302 |
303 | // GetIndexUrl 获取首页url
304 | func GetIndexUrl(url string) (string, string) {
305 | urlSlice := strings.Split(url, "/")
306 | if len(urlSlice) == 1 {
307 | // domain
308 | return "https://", "https://www." + url
309 | }
310 | scheme := urlSlice[0] + "//"
311 | indexUrl := scheme + urlSlice[2]
312 | return scheme, indexUrl
313 | }
314 |
315 | // sleep 当 depth 只有一层时, 需要等待几秒, 避免 wg done 后直接退出, 导致 select 来不及取出数据
316 | func (n *NewsSpider) sleep() {
317 | if n.Depth == 1 {
318 | time.Sleep(2 * time.Second)
319 | }
320 | }
321 |
--------------------------------------------------------------------------------
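A hedged sketch of how the NewsSpider above is wired together: NewNewsSpider takes a root URL, a crawl depth, the channel-consumer callback and a task context, plus the functional options defined in this file; GetContentNews crawls list pages first and then pushes every extracted article into the callback. Module path and target URL are assumptions for illustration:

package main

import (
	"fmt"

	spider "github.com/suosi-inc/go-pkg-spider"
)

func main() {
	// The callback receives the channel item first and the Ctx value second.
	process := func(args ...any) {
		if content, ok := args[0].(*spider.NewsContent); ok {
			fmt.Println(content.Url, content.Lang, content.Time, content.Title)
		}
	}

	s := spider.NewNewsSpider(
		"https://www.163.com", // root URL
		1,                     // depth
		process,               // ProcessFunc
		"task-1",              // Ctx, passed back into process
		spider.WithIsSub(true),
		spider.WithTimeOut(10000),
		spider.WithRetryTime(1),
	)

	// Blocks until all list pages and content pages have been handled.
	s.GetContentNews()
}
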
/extract/web.go:
--------------------------------------------------------------------------------
1 | package extract
2 |
3 | import (
4 | "errors"
5 | "net/url"
6 | "path"
7 | "regexp"
8 | "strings"
9 |
10 | "github.com/PuerkitoBio/goquery"
11 | "github.com/x-funs/go-fun"
12 | )
13 |
14 | const (
15 | RegexHostnameIp = `\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}`
16 | )
17 |
18 | var (
19 | filterUrlSuffix = []string{
20 | ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".txt", ".xml",
21 | ".pdf", ".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx",
22 | ".zip", ".rar", ".7z", ".gz", ".apk", ".cgi", ".exe", ".bz2", ".play",
23 | ".rss", ".sig", ".sgf",
24 | ".mp3", ".mp4", ".rm", ".rmvb", ".mov", ".ogv", ".flv",
25 | }
26 |
27 | invalidUrlCharsets = []string{"{", "}", "[", "]", "@", "$", "<", ">", "\""}
28 |
29 | titleZhSplits = []string{"_", "|", "-", "-", "|", "—", "*", ":", ",", ",", ":", "·", ">>", "="}
30 |
31 | titleZhContentSplits = []string{"_", "|", "-", "-", "|", "—"}
32 |
33 | titleEnSplits = []string{" - ", " | ", ":"}
34 |
35 | RegexHostnameIpPattern = regexp.MustCompile(RegexHostnameIp)
36 | )
37 |
38 | // WebTitle 返回网页标题, 最大 128 个字符
39 | func WebTitle(doc *goquery.Document, maxLength int) string {
40 | var title string
41 | titleNode := doc.Find("title")
42 | if titleNode.Size() > 1 {
43 | // 竟然有多个 title, 只取第一个
44 | title = titleNode.First().Text()
45 | } else {
46 | title = titleNode.Text()
47 | }
48 |
49 | title = fun.RemoveLines(title)
50 | title = strings.TrimSpace(title)
51 |
52 | if maxLength > 0 && maxLength < 128 {
53 | return fun.SubString(title, 0, maxLength)
54 | } else {
55 | return fun.SubString(title, 0, 128)
56 | }
57 | }
58 |
59 | // WebTitleClean 返回尽量清洗后的网页标题
60 | func WebTitleClean(title string, lang string) string {
61 | // 中文网站, 查找中文网站的分割标记, 找到任意一个, 从尾部循环删除后返回
62 | if lang == "zh" {
63 |
64 | for _, split := range titleZhSplits {
65 | if fun.HasPrefixCase(title, split) {
66 | title = fun.RemovePrefix(title, split)
67 | }
68 | }
69 |
70 | // 去除首页开头
71 | if fun.HasPrefixCase(title, "首页") {
72 | title = regexp.MustCompile("首页([ |\\-_-—|])*").ReplaceAllString(title, "")
73 | }
74 |
75 | titleClean := title
76 | for _, split := range titleZhSplits {
77 | var exists bool
78 | end := strings.LastIndex(titleClean, split)
79 | if end != -1 {
80 | exists = true
81 | for {
82 | titleClean = strings.TrimSpace(titleClean[:end])
83 | end = strings.LastIndex(titleClean, split)
84 |
85 | if end == -1 {
86 | break
87 | }
88 | }
89 | if exists {
90 | break
91 | }
92 | }
93 | }
94 |
95 | // 去除尾巴
96 | if titleClean != "首页" {
97 | titleClean = fun.RemoveSuffix(titleClean, "首页")
98 | }
99 |
100 | titleClean = fun.RemoveSign(titleClean)
101 |
102 | return titleClean
103 |
104 | } else {
105 | // 其他, 查找英文分割标记, 如果找到, 从尾部删除一次返回
106 | for _, split := range titleEnSplits {
107 | end := strings.LastIndex(title, split)
108 | if end != -1 {
109 | titleClean := strings.TrimSpace(title[:end])
110 | return titleClean
111 | }
112 | }
113 | }
114 |
115 | return title
116 | }
117 |
118 | // WebContentTitleClean 返回内容页尽量清洗后的网页标题
119 | func WebContentTitleClean(title string, lang string) string {
120 | // 中文网站, 查找中文网站的分割标记, 找到任意一个, 从尾部循环删除后返回
121 | if lang == "zh" {
122 | for _, split := range titleZhContentSplits {
123 | if fun.HasPrefixCase(title, split) {
124 | title = fun.RemovePrefix(title, split)
125 | }
126 | }
127 |
128 | titleClean := title
129 | for _, split := range titleZhContentSplits {
130 | var exists bool
131 | end := strings.LastIndex(titleClean, split)
132 | if end != -1 {
133 | exists = true
134 | for {
135 | titleClean = strings.TrimSpace(titleClean[:end])
136 | end = strings.LastIndex(titleClean, split)
137 |
138 | if end == -1 {
139 | break
140 | }
141 | }
142 | if exists {
143 | break
144 | }
145 | }
146 | }
147 |
148 | return titleClean
149 |
150 | } else {
151 | // 其他, 查找英文分割标记, 如果找到, 从尾部删除一次返回
152 | for _, split := range titleEnSplits {
153 | end := strings.LastIndex(title, split)
154 | if end != -1 {
155 | titleClean := strings.TrimSpace(title[:end])
156 | return titleClean
157 | }
158 | }
159 | }
160 |
161 | return title
162 | }
163 |
164 | // WebKeywords 返回网页 Keyword
165 | func WebKeywords(doc *goquery.Document) string {
166 | keywords := doc.Find("meta[name='keywords' i]").AttrOr("content", "")
167 | keywords = fun.RemoveLines(keywords)
168 | keywords = strings.TrimSpace(keywords)
169 |
170 | return keywords
171 | }
172 |
173 | // WebDescription 返回网页描述, 最大 384 个字符
174 | func WebDescription(doc *goquery.Document, maxLength int) string {
175 | description := doc.Find("meta[name='description' i]").AttrOr("content", "")
176 | description = fun.RemoveLines(description)
177 | description = strings.TrimSpace(description)
178 |
179 | if maxLength > 0 && maxLength < 384 {
180 | return fun.SubString(description, 0, maxLength)
181 | } else {
182 | return fun.SubString(description, 0, 384)
183 | }
184 | }
185 |
186 | // WebLinkTitles 返回网页链接和锚文本
187 | func WebLinkTitles(doc *goquery.Document, baseUrl *url.URL, strictDomain bool) (map[string]string, map[string]string) {
188 | var linkTitles = make(map[string]string)
189 | var filters = make(map[string]string)
190 |
191 | 	// 必须提供当前请求的 baseUrl
192 | if baseUrl == nil {
193 | return linkTitles, filters
194 | }
195 |
196 | // 获取所有 a 链接
197 | aTags := doc.Find("a")
198 | if aTags.Size() > 0 {
199 | var tmpLinks = make(map[string]string)
200 |
201 | // 提取所有的 a 链接
202 | aTags.Each(func(i int, s *goquery.Selection) {
203 | tmpLink, exists := s.Attr("href")
204 | if exists {
205 | tmpLink = fun.RemoveLines(tmpLink)
206 | tmpLink = strings.TrimSpace(tmpLink)
207 |
208 | tmpTitle := s.Text()
209 | tmpTitle = fun.NormaliseSpace(tmpTitle)
210 | tmpTitle = strings.TrimSpace(tmpTitle)
211 | if tmpLink != "" && tmpTitle != "" {
212 | // 如果链接已存在, 保留长标题
213 | if _, exists := tmpLinks[tmpLink]; exists {
214 | oldTitle := tmpLinks[tmpLink]
215 | if len(oldTitle) < len(tmpTitle) {
216 | tmpLinks[tmpLink] = tmpTitle
217 | }
218 | } else {
219 | tmpLinks[tmpLink] = tmpTitle
220 | }
221 | }
222 | }
223 | })
224 |
225 | // 过滤链接
226 | tmpLinkLen := len(tmpLinks)
227 | if tmpLinkLen > 0 {
228 | for link, title := range tmpLinks {
229 | if a, err := filterUrl(link, baseUrl, strictDomain); err == nil {
230 | linkTitles[a] = title
231 | } else {
232 | filters[a] = err.Error()
233 | }
234 | }
235 | }
236 | }
237 |
238 | return linkTitles, filters
239 | }
240 |
241 | // filterUrl 过滤 url
242 | func filterUrl(link string, baseUrl *url.URL, strictDomain bool) (string, error) {
243 | var urlStr string
244 |
245 | // 过滤掉链接中包含特殊字符的
246 | if fun.ContainsAny(link, invalidUrlCharsets...) {
247 | return link, errors.New("invalid url with illegal characters")
248 | }
249 |
250 | // 转换为绝对路径
251 | if !fun.HasPrefixCase(link, "http") && !fun.HasPrefixCase(link, "https") {
252 | if l, err := baseUrl.Parse(link); err == nil {
253 | urlStr = l.String()
254 | } else {
255 | return link, errors.New("invalid url with baseUrl parse error")
256 | }
257 | } else {
258 | urlStr = link
259 | }
260 |
261 | // 解析验证
262 | u, err := fun.UrlParse(urlStr)
263 | if err != nil {
264 | return urlStr, errors.New("invalid url with parse error")
265 | }
266 |
267 | // 验证转换后是否是绝对路径
268 | if !u.IsAbs() {
269 | return urlStr, errors.New("invalid url with not absolute url")
270 | }
271 |
272 | // 验证非常规端口
273 | if u.Port() != "" {
274 | return urlStr, errors.New("invalid url with not 80 port")
275 | }
276 |
277 | // 验证主机名
278 | if RegexHostnameIpPattern.MatchString(u.Hostname()) {
279 | return urlStr, errors.New("invalid url with ip hostname")
280 | }
281 |
282 | // 过滤掉明显错误的后缀
283 | ext := path.Ext(u.Path)
284 | if strings.Contains(ext, ".") {
285 | ext = strings.ToLower(ext)
286 | if fun.SliceContains(filterUrlSuffix, ext) {
287 | return urlStr, errors.New("invalid url with suffix")
288 | }
289 | }
290 |
291 | // 过滤掉站外链接
292 | if strictDomain {
293 | hostname := u.Hostname()
294 | domainTop := DomainTop(hostname)
295 | baseDomainTop := DomainTop(baseUrl.Hostname())
296 | if domainTop != baseDomainTop {
297 | return urlStr, errors.New("invalid url with strict domain")
298 | }
299 | }
300 |
301 | return urlStr, nil
302 | }
303 |
--------------------------------------------------------------------------------
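A small sketch of the extract helpers defined above, fetching the page with the standard net/http client instead of the package's HttpGetResp (so it implicitly assumes a UTF-8 response); the URL is a placeholder:

package main

import (
	"fmt"
	"net/http"
	"net/url"

	"github.com/PuerkitoBio/goquery"
	"github.com/suosi-inc/go-pkg-spider/extract"
)

func main() {
	base, _ := url.Parse("https://www.163.com")

	resp, err := http.Get(base.String())
	if err != nil {
		return
	}
	defer resp.Body.Close()

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return
	}

	// Title, keywords and description, then the cleaned title for a Chinese site.
	title := extract.WebTitle(doc, 0)
	fmt.Println(title, "=>", extract.WebTitleClean(title, "zh"))
	fmt.Println(extract.WebKeywords(doc))
	fmt.Println(extract.WebDescription(doc, 0))

	// In-site links (strict domain) plus the reason each dropped link was filtered.
	linkTitles, filters := extract.WebLinkTitles(doc, base, true)
	fmt.Println(len(linkTitles), "kept,", len(filters), "filtered")
}
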
/detect.go:
--------------------------------------------------------------------------------
1 | package spider
2 |
3 | import (
4 | "bytes"
5 | "errors"
6 | "net/url"
7 | "strings"
8 |
9 | "github.com/PuerkitoBio/goquery"
10 | "github.com/suosi-inc/go-pkg-spider/extract"
11 | "github.com/x-funs/go-fun"
12 | )
13 |
14 | type DomainRes struct {
15 | // 域名
16 | Domain string
17 | // 主页域名
18 | HomeDomain string
19 | // 协议
20 | Scheme string
21 | // 字符集
22 | Charset CharsetRes
23 | // 语种
24 | Lang LangRes
25 | // 国家
26 | Country string
27 | // 省份
28 | Province string
29 | // 分类
30 | Category string
31 | // 标题
32 | Title string
33 | 	// 清洗后的标题
34 | TitleClean string
35 | // 描述
36 | Description string
37 | // ICP
38 | Icp string
39 | // 状态
40 | State bool
41 | // 状态码
42 | StatusCode int
43 | // 内容页链接数量
44 | ContentCount int
45 | // 列表页链接数量
46 | ListCount int
47 | // 子域名列表
48 | SubDomains map[string]bool
49 | }
50 |
51 | // DetectDomain 域名探测
52 | // DomainRes.State true 和 err nil 表示探测成功
53 | // 即使请求成功, 也可能返回 err 且 State 为 false, 如 doc 解析失败
54 | // DomainRes.State false 时根据 StatusCode 判断是请求失败, 还是请求成功但响应失败(如404)
55 | func DetectDomain(domain string, timeout int, retry int) (*DomainRes, error) {
56 | if retry == 0 {
57 | retry = 1
58 | }
59 |
60 | for i := 0; i < retry; i++ {
61 | domainRes, err := DetectDomainDo(domain, true, timeout)
62 | if domainRes.StatusCode != 0 || err == nil {
63 | return domainRes, err
64 | }
65 | }
66 |
67 | var charset CharsetRes
68 | var lang LangRes
69 | domainRes := &DomainRes{
70 | Charset: charset,
71 | Lang: lang,
72 | }
73 | return domainRes, errors.New("ErrorDomainDetect")
74 | }
75 |
76 | // DetectSubDomain 子域名探测
77 | // DomainRes.State true 和 err nil 表示探测成功
78 | // 即使请求成功, 也可能返回 err 且 State 为 false, 如 doc 解析失败
79 | // DomainRes.State false 时根据 StatusCode 判断是请求失败, 还是请求成功但响应失败(如404)
80 | func DetectSubDomain(domain string, timeout int, retry int) (*DomainRes, error) {
81 | if retry == 0 {
82 | retry = 1
83 | }
84 |
85 | for i := 0; i < retry; i++ {
86 | domainRes, err := DetectDomainDo(domain, false, timeout)
87 | if domainRes.StatusCode != 0 || err == nil {
88 | return domainRes, err
89 | }
90 | }
91 |
92 | var charset CharsetRes
93 | var lang LangRes
94 | domainRes := &DomainRes{
95 | Charset: charset,
96 | Lang: lang,
97 | }
98 | return domainRes, errors.New("ErrorDomainDetect")
99 | }
100 |
101 | func DetectDomainDo(domain string, isTop bool, timeout int) (*DomainRes, error) {
102 | if timeout == 0 {
103 | timeout = 10000
104 | }
105 |
106 | domainRes := &DomainRes{}
107 |
108 | req := &HttpReq{
109 | HttpReq: &fun.HttpReq{
110 | MaxContentLength: 10 * 1024 * 1024,
111 | MaxRedirect: 3,
112 | },
113 | ForceTextContentType: true,
114 | }
115 |
116 | scheme := "http"
117 |
118 | // 是否进行首页探测
119 | var homes []string
120 | if isTop {
121 | homes = []string{"www", ""}
122 | } else {
123 | homes = []string{""}
124 | }
125 |
126 | for _, home := range homes {
127 |
128 | var urlStr string
129 | var homeDomain string
130 | if home != "" {
131 | homeDomain = home + fun.DOT + domain
132 | urlStr = scheme + "://" + homeDomain
133 | } else {
134 | homeDomain = domain
135 | urlStr = scheme + "://" + homeDomain
136 | }
137 |
138 | resp, err := HttpGetResp(urlStr, req, timeout)
139 |
140 | if resp != nil && err == nil && resp.Success {
141 | domainRes.Domain = domain
142 | domainRes.StatusCode = resp.StatusCode
143 |
144 | // 如果发生 HTTP 跳转, 则重新设置 homeDomain, 判断跳转后是否是同一个主域名, 如果域名改变则记录并返回错误
145 | domainRes.HomeDomain = homeDomain
146 | requestHostname := resp.RequestURL.Hostname()
147 | if domainRes.HomeDomain != requestHostname {
148 | requestTopDomain := extract.DomainTop(requestHostname)
149 | if requestTopDomain != "" && requestTopDomain != domain {
150 | // 验证主机名
151 | if RegexHostnameIpPattern.MatchString(requestHostname) {
152 | return domainRes, errors.New("ErrorRedirectHost")
153 | }
154 | // 验证非常规端口
155 | if resp.RequestURL.Port() != "" {
156 | return domainRes, errors.New("ErrorRedirectHost")
157 | }
158 |
159 | return domainRes, errors.New("ErrorRedirect:" + requestTopDomain)
160 | }
161 |
162 | domainRes.HomeDomain = requestHostname
163 | }
164 |
165 | // 如果发生了协议跳转, 则重新设置 scheme
166 | domainRes.Scheme = scheme
167 | if domainRes.Scheme != resp.RequestURL.Scheme {
168 | domainRes.Scheme = resp.RequestURL.Scheme
169 | }
170 |
171 | // 字符集
172 | domainRes.Charset = resp.Charset
173 |
174 | // 解析 HTML
175 | u, _ := url.Parse(urlStr)
176 | doc, docErr := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
177 | if docErr == nil {
178 | doc.Find(DefaultDocRemoveTags).Remove()
179 |
180 | 				// 具有 HTML meta refresh 跳转属性, HTTP 客户端无法自动跟随, 这里始终返回错误; 判断跳转后是否为同一主域名并记录
181 | if refresh, exists := doc.Find("meta[http-equiv='refresh' i]").Attr("content"); exists {
182 | refreshMatch := regexMetaRefreshPattern.FindStringSubmatch(refresh)
183 | if len(refreshMatch) > 1 {
184 | refreshUrl := refreshMatch[1]
185 | if r, err := fun.UrlParse(refreshUrl); err == nil {
186 | refreshHostname := r.Hostname()
187 | refreshTopDomain := extract.DomainTop(refreshHostname)
188 | if refreshTopDomain != "" && refreshTopDomain != domain {
189 | // 验证主机名
190 | if RegexHostnameIpPattern.MatchString(refreshHostname) {
191 | return domainRes, errors.New("ErrorMetaJumpHost")
192 | }
193 | // 验证非常规端口
194 | if r.Port() != "" {
195 | return domainRes, errors.New("ErrorMetaJumpHost")
196 | }
197 |
198 | return domainRes, errors.New("ErrorMetaJump:" + refreshTopDomain)
199 | }
200 | }
201 | return domainRes, errors.New("ErrorMetaJump")
202 | }
203 | }
204 |
205 | // 中国 ICP 解析
206 | icp, province := extract.Icp(doc)
207 | if icp != "" && province != "" {
208 | domainRes.Country = "中国"
209 | domainRes.Icp = icp
210 | domainRes.Province = extract.ProvinceShortMap[province]
211 | }
212 |
213 | // 语言
214 | langRes := Lang(doc, resp.Charset.Charset, true)
215 | domainRes.Lang = langRes
216 |
217 | 				// 尽可能地探测一些信息: 国家/省份/类别
218 | if domainRes.Country == "" {
219 | country, province, category := extract.MetaFromHost(u.Hostname(), langRes.Lang)
220 | domainRes.Country = country
221 | domainRes.Province = province
222 | domainRes.Category = category
223 | }
224 |
225 | // 标题摘要
226 | domainRes.Title = extract.WebTitle(doc, 0)
227 | domainRes.TitleClean = extract.WebTitleClean(domainRes.Title, langRes.Lang)
228 | domainRes.Description = extract.WebDescription(doc, 0)
229 |
230 | // 站内链接
231 | linkTitles, _ := extract.WebLinkTitles(doc, resp.RequestURL, true)
232 |
233 | // 链接分类
234 | links, subDomains := extract.LinkTypes(linkTitles, langRes.Lang, nil)
235 |
236 | domainRes.ContentCount = len(links.Content)
237 | domainRes.ListCount = len(links.List)
238 | domainRes.SubDomains = subDomains
239 |
240 | domainRes.State = true
241 |
242 | return domainRes, nil
243 | } else {
244 | return domainRes, errors.New("ErrorDocParse")
245 | }
246 | } else {
247 | if resp != nil {
248 | domainRes.StatusCode = resp.StatusCode
249 | }
250 | }
251 | }
252 |
253 | return domainRes, errors.New("ErrorDomainDetect")
254 | }
255 |
256 | func DetectFriendDomain(domain string, timeout int, retry int) (map[string]string, error) {
257 | if retry == 0 {
258 | retry = 1
259 | }
260 |
261 | friendDomains := make(map[string]string, 0)
262 |
263 | for i := 0; i < retry; i++ {
264 | friendDomains, err := DetectFriendDomainDo(domain, timeout)
265 | if err == nil {
266 | return friendDomains, err
267 | }
268 | }
269 |
270 | return friendDomains, errors.New("ErrorDomainDetect")
271 | }
272 |
273 | func DetectFriendDomainDo(domain string, timeout int) (map[string]string, error) {
274 | if timeout == 0 {
275 | timeout = 10000
276 | }
277 |
278 | friendDomains := make(map[string]string, 0)
279 |
280 | req := &HttpReq{
281 | HttpReq: &fun.HttpReq{
282 | MaxContentLength: 10 * 1024 * 1024,
283 | MaxRedirect: 3,
284 | },
285 | ForceTextContentType: true,
286 | }
287 |
288 | scheme := "http"
289 | homes := []string{"www", ""}
290 |
291 | for _, home := range homes {
292 |
293 | var urlStr string
294 | var homeDomain string
295 | if home != "" {
296 | homeDomain = home + fun.DOT + domain
297 | urlStr = scheme + "://" + homeDomain
298 | } else {
299 | homeDomain = domain
300 | urlStr = scheme + "://" + homeDomain
301 | }
302 |
303 | resp, err := HttpGetResp(urlStr, req, timeout)
304 |
305 | if resp != nil && err == nil && resp.Success {
306 |
307 | doc, docErr := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
308 | if docErr == nil {
309 | doc.Find(DefaultDocRemoveTags).Remove()
310 |
311 | // 非限制域名所有链接
312 | linkTitles, _ := extract.WebLinkTitles(doc, resp.RequestURL, false)
313 |
314 | if len(linkTitles) > 0 {
315 | for link, title := range linkTitles {
316 | if link == "" || title == "" {
317 | continue
318 | }
319 |
320 | u, e := fun.UrlParse(link)
321 | if e != nil {
322 | continue
323 | }
324 |
325 | // 验证非常规端口
326 | if u.Port() != "" {
327 | continue
328 | }
329 |
330 | // 验证主机名
331 | if fun.Matches(`\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}`, u.Hostname()) {
332 | continue
333 | }
334 |
335 | pathDir := strings.TrimSpace(u.Path)
336 | if pathDir == "" || pathDir == fun.SLASH || pathDir == "/index.html" || pathDir == "/index.htm" || pathDir == "/index.shtml" {
337 | hostname := u.Hostname()
338 | domainTop := extract.DomainTop(hostname)
339 | baseDomainTop := domain
340 | if domainTop != baseDomainTop {
341 | friendDomains[domainTop] = title
342 | }
343 | }
344 | }
345 | }
346 |
347 | return friendDomains, nil
348 | } else {
349 | return friendDomains, errors.New("ErrorDocParse")
350 | }
351 | } else {
352 | return friendDomains, err
353 | }
354 | }
355 |
356 | return friendDomains, errors.New("ErrorDomainDetect")
357 | }
358 |
--------------------------------------------------------------------------------
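A usage sketch for the two detection entry points above; the domain is a placeholder and only fields of DomainRes defined in this file are used:

package main

import (
	"fmt"

	spider "github.com/suosi-inc/go-pkg-spider"
)

func main() {
	// Probe a top-level domain: 10s timeout, 2 attempts.
	res, err := spider.DetectDomain("163.com", 10000, 2)
	if err != nil {
		fmt.Println("detect failed:", err)
		return
	}

	fmt.Println(res.Scheme, res.HomeDomain, res.StatusCode)
	fmt.Println(res.Lang.Lang, res.Charset.Charset, res.Country, res.Province, res.Icp)
	fmt.Println("content:", res.ContentCount, "list:", res.ListCount, "subdomains:", len(res.SubDomains))

	// Friend-link domains found on the home page (home-page links to other sites).
	if friends, err := spider.DetectFriendDomain("163.com", 10000, 1); err == nil {
		for domain, title := range friends {
			fmt.Println(domain, "=>", title)
		}
	}
}
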
/spider_test.go:
--------------------------------------------------------------------------------
1 | package spider
2 |
3 | import (
4 | "bytes"
5 | "crypto/tls"
6 | "fmt"
7 | "net/http"
8 | "net/url"
9 | "regexp"
10 | "strconv"
11 | "testing"
12 | "unicode/utf8"
13 |
14 | "github.com/PuerkitoBio/goquery"
15 | "github.com/microcosm-cc/bluemonday"
16 | "github.com/suosi-inc/go-pkg-spider/extract"
17 | "github.com/x-funs/go-fun"
18 | )
19 |
20 | func BenchmarkHtmlParse(b *testing.B) {
21 |
22 | resp, _ := fun.HttpGetResp("https://www.163.com", nil, 30000)
23 |
24 | b.ResetTimer()
25 | for i := 0; i < b.N; i++ {
26 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
27 | doc.Find(DefaultDocRemoveTags).Remove()
28 | }
29 | }
30 |
31 | func TestGoquery(t *testing.T) {
32 | body, _ := HttpGet("https://jp.news.cn/index.htm")
33 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(body))
34 |
35 | // lang, exist := doc.Find("html").Attr("id")
36 |
37 | doc.Find("script,noscript,style,iframe,br,link,svg,textarea").Remove()
38 | text := doc.Find("body").Text()
39 | text = fun.RemoveSign(text)
40 |
41 | fmt.Println(text)
42 | }
43 |
44 | func TestRegex(t *testing.T) {
45 | str := ",.!,,D_NAME。!;‘’”“《》**dfs#%^&()-+我1431221 中国123漢字かどうかのjavaを<決定>$¥"
46 | r := regexp.MustCompile(`[\p{Hiragana}|\p{Katakana}]`)
47 | s := r.FindAllString(str, -1)
48 | t.Log(str)
49 | t.Log(s)
50 | }
51 |
52 | func TestUrlParse(t *testing.T) {
53 | var urlStrs = []string{
54 | "https://www.163.com",
55 | "https://www.163.com/",
56 | "https://www.163.com/a",
57 | "https://www.163.com/aa.html",
58 | "https://www.163.com/a/b",
59 | "https://www.163.com/a/bb.html",
60 | "https://www.163.com/a/b/",
61 | "https://www.163.com/a/b/c",
62 | "https://www.163.com/a/b/cc.html",
63 | }
64 |
65 | for _, urlStr := range urlStrs {
66 | u, _ := url.Parse(urlStr)
67 | link := "javascript:;"
68 | absolute, err := u.Parse(link)
69 | t.Log(err)
70 |
71 | _, err = url.Parse(absolute.String())
72 | if err != nil {
73 | t.Log(err)
74 | }
75 |
76 | t.Log(urlStr + " + " + link + " => " + absolute.String())
77 | }
78 |
79 | }
80 |
81 | func TestCount(t *testing.T) {
82 | fmt.Println(regexLangHtmlPattern.MatchString("zh"))
83 | fmt.Println(regexLangHtmlPattern.MatchString("en"))
84 | fmt.Println(regexLangHtmlPattern.MatchString("zh-cn"))
85 | fmt.Println(regexLangHtmlPattern.MatchString("utf-8"))
86 |
87 | fmt.Println(utf8.RuneCountInString("https://khmers.cn/2022/05/23/%e6%b4%aa%e6%a3%ae%e6%80%bb%e7%90%86%ef%bc%9a%e6%9f%ac%e5%9f%94%e5%af%a8%e7%b4%af%e8%ae%a1%e8%8e%b7%e5%be%97%e8%b6%85%e8%bf%875200%e4%b8%87%e5%89%82%e6%96%b0%e5%86%a0%e7%96%ab%e8%8b%97%ef%bc%8c/"))
88 | }
89 |
90 | func TestGetLinkData(t *testing.T) {
91 | var urlStrs = []string{
92 | // "https://www.1905.com",
93 | // "https://www.people.com.cn",
94 | // "https://www.36kr.com",
95 | // "https://www.163.com",
96 | // "https://news.163.com/",
97 | // "http://jyj.suqian.gov.cn",
98 | // "https://www.huxiu.com/",
99 | // "http://www.news.cn/politicspro/",
100 | // "http://www.cankaoxiaoxi.com",
101 | // "http://www.bbc.com",
102 | // "https://www.ft.com",
103 | // "https://www.reuters.com/",
104 | // "https://nypost.com/",
105 | // "http://www.mengcheng.gov.cn/",
106 | // "https://www.chunichi.co.jp",
107 | // "https://www.donga.com/",
108 | // "https://people.com/",
109 | // "https://czql.gov.cn/",
110 | // "https://qiye.163.com/",
111 | // "https://www.washingtontimes.com/",
112 | // "https://www.gamersky.com/",
113 | // "https://www.cdns.com.tw/",
114 | // "http://www.163.com/",
115 |
116 | // "https://data.163.com",
117 | // "https://www.sensetime.com/cn/news-index",
118 | // "",
119 | "https://www.sis.gov.eg/section/7413/7410?lang=zh-cn",
120 | }
121 |
122 | for _, urlStr := range urlStrs {
123 |
124 | if linkData, err := GetLinkData(urlStr, false, 10000, 1); err == nil {
125 | fmt.Println("subDomain:", len(linkData.SubDomains))
126 | fmt.Println("content:", len(linkData.LinkRes.Content))
127 | fmt.Println("list:", len(linkData.LinkRes.List))
128 | fmt.Println("unknown:", len(linkData.LinkRes.Unknown))
129 | fmt.Println("none:", len(linkData.LinkRes.None))
130 |
131 | i := 0
132 | for a, title := range linkData.Filters {
133 | i = i + 1
134 | fmt.Println(i, "filter:"+a+"\t=>\t"+title)
135 | }
136 | i = 0
137 | for a, title := range linkData.SubDomains {
138 | i = i + 1
139 | fmt.Println(i, "subDomain:"+a+"\t=>\t"+strconv.FormatBool(title))
140 | }
141 | i = 0
142 | for a, title := range linkData.LinkRes.Content {
143 | i = i + 1
144 | fmt.Println(i, "content:"+a+"\t=>\t"+title)
145 | }
146 | i = 0
147 | for a, title := range linkData.LinkRes.Unknown {
148 | i = i + 1
149 | fmt.Println(i, "unknown:"+a+"\t=>\t"+title)
150 | }
151 | i = 0
152 | for a, title := range linkData.LinkRes.List {
153 | i = i + 1
154 | fmt.Println(i, "list:"+a+"\t=>\t"+title)
155 | }
156 | i = 0
157 | for a, title := range linkData.LinkRes.None {
158 | i = i + 1
159 | fmt.Println(i, "none:"+a+"\t=>\t"+title)
160 | }
161 | }
162 | }
163 | }
164 |
165 | func TestGetNews(t *testing.T) {
166 |
167 | var urlStrs = []string{
168 | // "http://www.cankaoxiaoxi.com/finance/20220831/2489264.shtml",
169 | // "https://www.163.com/news/article/HG3DE7AQ000189FH.html",
170 | // "http://suosi.com.cn/",
171 | // "http://www.cankaoxiaoxi.com/world/20220831/2489267.shtml",
172 | // "http://www.cankaoxiaoxi.com/photo/20220901/2489404.shtml",
173 | // "http://column.cankaoxiaoxi.com/2022/0831/2489330.shtml",
174 | // "http://www.gov.cn/xinwen/2022-08/31/content_5707661.htm",
175 | // "http://suosi.com.cn/2019/14.shtml",
176 | // "https://www.wangan.com/p/7fy78317feb66b37",
177 | // "https://www.wangan.com/news/7fy78y38c7207bf0",
178 | // "http://env.people.com.cn/n1/2022/0901/c1010-32516651.html",
179 | "http://com.gd.gov.cn/go/article.php?typeid=40&contentId=23262",
180 | // "http://www.changzhou.gov.cn/ns_news/827166202029392",
181 | // "https://www.163.com/money/article/HG4TRBL1002580S6.html?clickfrom=w_yw_money",
182 | // "https://mp.weixin.qq.com/s?__biz=MzUxODkxNTYxMA==&mid=2247484842&idx=1&sn=d9822ee4662523609aee7441066c2a96&chksm=f980d6dfcef75fc93cb1e7942cb16ec82a7fb7ec3c2d857c307766daff667bd63ab1b4941abd&exportkey=AXWfguuAyJjlOJgCHf10io8%3D&acctmode=0&pass_ticket=8eXqj",
183 | // "https://www.bbc.com/news/world-asia-62744522",
184 | // "https://www.sohu.com/a/581634395_121284943",
185 | // "https://edition.cnn.com/2022/01/30/europe/lithuania-took-on-china-intl-cmd/index.html",
186 | // "https://www.36kr.com/p/1897541916043649",
187 | // "https://www.huxiu.com/article/651531.html",
188 | // "http://www.news.cn/politics/2022-09/02/c_1128969463.htm",
189 | // "https://www.ccdi.gov.cn/yaowenn/202209/t20220901_215343.html",
190 | // "https://new.qq.com/omn/20200701/20200701A04H7500",
191 | // "http://v.china.com.cn/2022-09/06/content_78407150.html",
192 | // "http://www.chinagwy.org.cn/content-cat-10/143162.html",
193 | // "https://news.52pk.com/xwlm/201912/7366710.shtml",
194 | // "https://www.business-standard.com/article/finance/govt-rbi-propose-action-plan-for-facilitating-special-rupee-accounts-122090701260_1.html",
195 | // "https://www.squirepattonboggs.com/en/news/2022/09/squire-patton-boggs-advises-new-wave-group-ab-on-uk-acquisition",
196 | // "https://www.thebulletin.be/number-road-deaths-belgium-rises-sharply",
197 | // "https://www.dailyexpress.com.my/read/4840/ma63-zero-without-equitable-economic-partnership/",
198 | // "https://news.cgtn.com/news/2022-08-20/CGTN-documentary-Remote-Killing-released-1cE7t7RD104/index.html",
199 | // "https://www.sensetime.com/en/news-detail/51164633?categoryId=1072",
200 | }
201 |
202 | for _, urlStr := range urlStrs {
203 | if news, resp, err := GetNews(urlStr, "", 10000, 1); err == nil {
204 | t.Log(resp.Charset)
205 | t.Log(news.Lang)
206 | t.Log(news.Spend)
207 | t.Log(news.Title)
208 | t.Log(news.TitlePos)
209 | t.Log(news.TimeLocal)
210 | t.Log(news.Time)
211 | t.Log(news.TimePos)
212 | t.Log(news.Content)
213 |
214 | if news.ContentNode != nil {
215 | // 内容 html 节点
216 | node := goquery.NewDocumentFromNode(news.ContentNode)
217 | contentHtml, _ := node.Html()
218 | t.Log(fun.NormaliseLine(contentHtml))
219 |
220 | // 内容 html 节点清理, 仅保留 p img 标签
221 | p := bluemonday.NewPolicy()
222 | p.AllowElements("p")
223 | p.AllowImages()
224 | html := p.Sanitize(contentHtml)
225 | t.Log(fun.NormaliseLine(html))
226 | }
227 | }
228 | }
229 | }
230 |
231 | func TestGetNewsWithReq(t *testing.T) {
232 | transport := &http.Transport{
233 | TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
234 | DisableKeepAlives: true,
235 | }
236 | proxyString := "http://username:password@host:port"
237 | proxy, _ := url.Parse(proxyString)
238 | transport.Proxy = http.ProxyURL(proxy)
239 |
240 | req := &HttpReq{
241 | HttpReq: &fun.HttpReq{
242 | MaxContentLength: HttpDefaultMaxContentLength,
243 | MaxRedirect: 2,
244 | Transport: transport,
245 | },
246 | ForceTextContentType: true,
247 | }
248 |
249 | var urlStrs = []string{
250 | "https://www.bbc.com/news/world-asia-62744522",
251 | }
252 |
253 | for _, urlStr := range urlStrs {
254 | if news, resp, err := GetNewsWithReq(urlStr, "", req, 10000, 1); err == nil {
255 | t.Log(resp.Charset)
256 | t.Log(news.Spend)
257 | t.Log(news.Title)
258 | t.Log(news.TitlePos)
259 | t.Log(news.TimeLocal)
260 | t.Log(news.Time)
261 | t.Log(news.TimePos)
262 | t.Log(news.Content)
263 |
264 | if news.ContentNode != nil {
265 | // 内容 html 节点
266 | node := goquery.NewDocumentFromNode(news.ContentNode)
267 | contentHtml, _ := node.Html()
268 | t.Log(fun.NormaliseLine(contentHtml))
269 |
270 | // 内容 html 节点清理, 仅保留 p img 标签
271 | p := bluemonday.NewPolicy()
272 | p.AllowElements("p")
273 | p.AllowImages()
274 | html := p.Sanitize(contentHtml)
275 | t.Log(fun.NormaliseLine(html))
276 | }
277 | }
278 | }
279 | }
280 |
281 | func TestDemo(t *testing.T) {
282 | a := "2022-05-26 17:00:57 UTC"
283 | findString := regexp.MustCompile(extract.RegexPublishDate).FindStringSubmatch(a)
284 | t.Log(findString)
285 | t.Log(fun.Date(fun.StrToTime("2022-04-10T18:24:00")))
286 | }
287 |
--------------------------------------------------------------------------------
/lang.go:
--------------------------------------------------------------------------------
1 | package spider
2 |
3 | import (
4 | "regexp"
5 | "strings"
6 | "unicode/utf8"
7 |
8 | "github.com/PuerkitoBio/goquery"
9 | "github.com/suosi-inc/go-pkg-spider/extract"
10 | "github.com/suosi-inc/lingua-go"
11 | "github.com/x-funs/go-fun"
12 | )
13 |
14 | const (
15 | LangPosCharset = "charset"
16 | LangPosHtmlTag = "html"
17 | LangPosBody = "body"
18 | LangPosLingua = "lingua"
19 | LangPosTitleZh = "title"
20 | BodyChunkSize = 2048
21 | BodyMinSize = 64
22 |
23 | RegexLangHtml = "^(?i)([a-z]{2}|[a-z]{2}\\-[a-z]+)$"
24 | )
25 |
26 | var (
27 | CharsetLangMap = map[string]string{
28 | "GBK": "zh",
29 | "Big5": "zh",
30 | "ISO-2022-CN": "zh",
31 | "SHIFT_JIS": "ja",
32 | "KOI8-R": "ru",
33 | "EUC-JP": "ja",
34 | "EUC-KR": "ko",
35 | "EUC-CN": "zh",
36 | "ISO-2022-JP": "ja",
37 | "ISO-2022-KR": "ko",
38 | }
39 |
40 | LangEnZhMap = map[string]string{
41 | "zh": "中文",
42 | "en": "英语",
43 | "ja": "日语",
44 | "ru": "俄语",
45 | "ko": "韩语",
46 | "ar": "阿拉伯语",
47 | "hi": "印地语",
48 | "de": "德语",
49 | "fr": "法语",
50 | "es": "西班牙语",
51 | "pt": "葡萄牙语",
52 | "it": "意大利语",
53 | "th": "泰语",
54 | "vi": "越南语",
55 | "my": "缅甸语",
56 | }
57 |
58 | LangZhEnMap = map[string]string{
59 | "中文": "zh",
60 | "英语": "en",
61 | "日语": "ja",
62 | "俄语": "ru",
63 | "韩语": "ko",
64 | "阿拉伯语": "ar",
65 | "印地语": "hi",
66 | "德语": "de",
67 | "法语": "fr",
68 | "西班牙语": "es",
69 | "葡萄牙语": "pt",
70 | "意大利语": "it",
71 | "泰语": "th",
72 | "越南语": "vi",
73 | "缅甸语": "my",
74 | }
75 |
76 | langMetaSelectors = []string{
77 | "meta[http-equiv='content-language' i]",
78 | "meta[name='lang' i]",
79 | }
80 |
81 | linguaLanguages = []lingua.Language{
82 | lingua.Arabic,
83 | lingua.Russian,
84 | lingua.Hindi,
85 | lingua.Korean,
86 | }
87 |
88 | linguaLatinLanguages = []lingua.Language{
89 | lingua.French,
90 | lingua.German,
91 | lingua.Spanish,
92 | lingua.Portuguese,
93 | lingua.English,
94 | }
95 |
96 | linguaMap = map[string]string{
97 | "arabic": "ar",
98 | "russian": "ru",
99 | "hindi": "hi",
100 | "korean": "ko",
101 | "french": "fr",
102 | "german": "de",
103 | "spanish": "es",
104 | "portuguese": "pt",
105 | "english": "en",
106 | }
107 |
108 | regexLangHtmlPattern = regexp.MustCompile(RegexLangHtml)
109 | regexPuncsPattern = regexp.MustCompile(`[\pP\pS]`)
110 | regexEnPattern = regexp.MustCompile(`[a-zA-Z]`)
111 | regexLatinPattern = regexp.MustCompile("[\u0080-\u00ff]")
112 | regexZhPattern = regexp.MustCompile(`\p{Han}`)
113 | 	regexJaPattern       = regexp.MustCompile(`[\p{Hiragana}\p{Katakana}]`)
114 | regexKoPattern = regexp.MustCompile(`\p{Hangul}`)
115 | )
116 |
117 | type LangRes struct {
118 | Lang string
119 | LangPos string
120 | }
121 |
122 | // LangText 探测纯文本语种
123 | func LangText(text string) (string, string) {
124 | return langFromText(text)
125 | }
126 |
127 | // Lang 探测 HTML 语种
128 | func Lang(doc *goquery.Document, charset string, listMode bool) LangRes {
129 | var res LangRes
130 | var lang string
131 |
132 | // 如果存在特定语言的 charset 对照表, 则直接返回
133 | if charset != "" {
134 | if _, exist := CharsetLangMap[charset]; exist {
135 | res.Lang = CharsetLangMap[charset]
136 | res.LangPos = LangPosCharset
137 | return res
138 | }
139 | }
140 |
141 | 	// 优先判断 Title 是否包含汉字, 再结合正文内容排除日语/韩语
142 | titleLang, pos := LangFromTitle(doc, listMode)
143 | if titleLang != "" {
144 | res.Lang = titleLang
145 | res.LangPos = pos
146 | return res
147 | }
148 |
149 | // 解析 Html 语言属性, 当不为空不为 en 时可信度比较高, 直接返回
150 | lang = LangFromHtml(doc)
151 | if lang != "" && lang != "en" {
152 | res.Lang = lang
153 | res.LangPos = LangPosHtmlTag
154 | return res
155 | }
156 |
157 | // 当 utf 编码时, lang 为空或 en 可信度比较低, 进行基于内容语种的检测
158 | if strings.HasPrefix(charset, "UTF") && (lang == "" || lang == "en") {
159 | bodyLang, pos := LangFromUtf8Body(doc, listMode)
160 | if bodyLang != "" {
161 | res.Lang = bodyLang
162 | res.LangPos = pos
163 | }
164 | }
165 |
166 | return res
167 | }
168 |
169 | func LangFromHtml(doc *goquery.Document) string {
170 | var lang string
171 |
172 | // html lang
173 | if lang, exists := doc.Find("html").Attr("lang"); exists {
174 | lang = strings.TrimSpace(lang)
175 | if regexLangHtmlPattern.MatchString(lang) {
176 | lang = fun.SubString(lang, 0, 2)
177 | return lang
178 | }
179 | }
180 | if lang, exists := doc.Find("html").Attr("xml:lang"); exists {
181 | lang = strings.TrimSpace(lang)
182 | if regexLangHtmlPattern.MatchString(lang) {
183 | lang = fun.SubString(lang, 0, 2)
184 | return lang
185 | }
186 |
187 | }
188 | for _, selector := range langMetaSelectors {
189 | if lang, exists := doc.Find(selector).Attr("content"); exists {
190 | lang = strings.TrimSpace(lang)
191 | if regexLangHtmlPattern.MatchString(lang) {
192 | lang = fun.SubString(lang, 0, 2)
193 | return lang
194 | }
195 | }
196 | }
197 |
198 | return lang
199 | }
200 | func LangFromTitle(doc *goquery.Document, listMode bool) (string, string) {
201 | var lang string
202 | var text string
203 |
204 | // 获取 Title
205 | title := extract.WebTitle(doc, 0)
206 | text = fun.RemoveSign(title)
207 | text = strings.TrimSpace(text)
208 |
209 | if text != "" {
210 | // 首先判断标题是否包含汉字
211 | han := regexZhPattern.FindAllString(text, -1)
212 | if han != nil {
213 | hanCount := len(han)
214 |
215 | // 汉字数量 >=2
216 | if hanCount >= 2 {
217 |
218 | // 需要抽取内容验证包含有日语韩语, 如(日本語_新華網)
219 | bodyText := bodyTextForLang(doc, listMode)
220 |
221 | // 去除所有符号
222 | bodyText = fun.RemoveSign(bodyText)
223 |
224 | // 最大截取 BodyChunkSize 个字符
225 | bodyText = fun.SubString(bodyText, 0, BodyChunkSize)
226 | bodyText = strings.TrimSpace(bodyText)
227 |
228 | bodyTextCount := utf8.RuneCountInString(bodyText)
229 |
230 | // 包含一定的日语
231 | ja := regexJaPattern.FindAllString(bodyText, -1)
232 | if ja != nil {
233 | jaCount := len(ja)
234 | jaRate := float64(jaCount) / float64(bodyTextCount)
235 |
236 | // 日语出现比例
237 | if jaRate > 0.2 {
238 | lang = "ja"
239 | return lang, LangPosTitleZh
240 | }
241 | }
242 |
243 | // 包含一定的韩语
244 | ko := regexKoPattern.FindAllString(bodyText, -1)
245 | if ko != nil {
246 | koCount := len(ko)
247 | koRate := float64(koCount) / float64(bodyTextCount)
248 |
249 | // 韩语出现比例
250 | if koRate > 0.2 {
251 | lang = "ko"
252 | return lang, LangPosTitleZh
253 | }
254 | }
255 |
256 | lang = "zh"
257 | return lang, LangPosTitleZh
258 | }
259 | }
260 | }
261 |
262 | return lang, ""
263 | }
264 |
265 | func LangFromUtf8Body(doc *goquery.Document, listMode bool) (string, string) {
266 | var text string
267 |
268 | // 抽取内容
269 | text = bodyTextForLang(doc, listMode)
270 |
271 | return langFromText(text)
272 | }
273 |
274 | func langFromText(text string) (string, string) {
275 | var lang string
276 |
277 | // 去除换行(为了保留语义只替换多余的空格)
278 | text = fun.RemoveLines(text)
279 | text = strings.ReplaceAll(text, fun.TAB, "")
280 | text = strings.ReplaceAll(text, " ", "")
281 |
282 | // 去除符号
283 | text = regexPuncsPattern.ReplaceAllString(text, "")
284 |
285 | // 最大截取 BodyChunkSize 个字符
286 | text = fun.SubString(text, 0, BodyChunkSize)
287 | text = strings.TrimSpace(text)
288 |
289 | // 截取后的字符长度
290 | textCount := utf8.RuneCountInString(text)
291 |
292 | // 内容太少不足以判断语言, 放弃
293 | if textCount < BodyMinSize {
294 | return "", ""
295 | }
296 |
297 | // 首先判断是否包含汉字, 中文和日语
298 | han := regexZhPattern.FindAllString(text, -1)
299 | if han != nil {
300 | hanCount := len(han)
301 | hanRate := float64(hanCount) / float64(textCount)
302 |
303 | // 汉字比例
304 | if hanRate >= 0.3 {
305 | ja := regexJaPattern.FindAllString(text, -1)
306 | if ja != nil {
307 | jaCount := len(ja)
308 | jaRate := float64(jaCount) / float64(hanCount)
309 |
310 | // 日语在汉字中的占比
311 | if jaRate > 0.1 {
312 | lang = "ja"
313 | return lang, LangPosBody
314 | }
315 | }
316 |
317 | lang = "zh"
318 | return lang, LangPosBody
319 | }
320 | }
321 |
322 | // 其次判断拉丁语系, 分析主要的一些语种
323 | english := regexEnPattern.FindAllString(text, -1)
324 | if english != nil {
325 | englishCount := len(english)
326 | englishRate := float64(englishCount) / float64(textCount)
327 | if englishRate > 0.618 {
328 |
329 | // 包含拉丁补充字符集, 使用 lingua 分析主要的非英语拉丁语种
330 | latin := regexLatinPattern.FindAllString(text, -1)
331 | if latin != nil {
332 | latinCount := len(latin)
333 |
334 | if latinCount > 5 {
335 | detector := lingua.NewLanguageDetectorBuilder().FromLanguages(linguaLatinLanguages...).Build()
336 | if language, exists := detector.DetectLanguageOf(text); exists {
337 | key := strings.ToLower(language.String())
338 | linguaLang := linguaMap[key]
339 | return linguaLang, LangPosLingua
340 | }
341 | }
342 | }
343 |
344 | return "en", LangPosBody
345 | }
346 | }
347 |
348 | // 最后, 使用 lingua 分析其他主要的非拉丁语种
349 | detector := lingua.NewLanguageDetectorBuilder().FromLanguages(linguaLanguages...).Build()
350 | if language, exists := detector.DetectLanguageOf(text); exists {
351 |
352 | key := strings.ToLower(language.String())
353 | linguaLang := linguaMap[key]
354 | return linguaLang, LangPosLingua
355 | }
356 |
357 | return lang, ""
358 | }
359 |
360 | func bodyTextForLang(doc *goquery.Document, listMode bool) string {
361 | var text string
362 |
363 | // 列表页模式
364 | if listMode {
365 | 		// 优先获取网页中最多 64 个 a 标签, 如果没有 a 标签或数量过少, 放弃
366 | aTag := doc.Find("a")
367 | aTagSize := aTag.Size()
368 | if aTagSize >= 16 {
369 | sliceMax := fun.Min(aTagSize, 64)
370 | text = aTag.Slice(0, sliceMax).Text()
371 |
372 | // 如果 a 标签中包含过多的 {} 可能是动态渲染, 放弃
373 | if strings.Count(text, "{") >= 5 && strings.Count(text, "}") >= 5 {
374 | return ""
375 | }
376 | }
377 | } else {
378 | // 内容页模式, 获取网页中最多 64 个 p 标签
379 | pTag := doc.Find("p")
380 | pTagSize := pTag.Size()
381 | sliceMax := fun.Min(pTagSize, 64)
382 | text = pTag.Slice(0, sliceMax).Text()
383 |
384 | // 如果内容太少, 获取全部 body 文本
385 | textCount := utf8.RuneCountInString(text)
386 | if textCount < BodyMinSize {
387 | text = doc.Find("body").Text()
388 | }
389 | }
390 |
391 | return text
392 | }
393 |
--------------------------------------------------------------------------------
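A short sketch of the language detection entry points above: LangText needs at least BodyMinSize (64) characters after cleaning, and Lang combines the charset table, the title, the html attributes and the body text. Module path and URL are assumptions for illustration:

package main

import (
	"bytes"
	"fmt"
	"strings"

	"github.com/PuerkitoBio/goquery"
	spider "github.com/suosi-inc/go-pkg-spider"
)

func main() {
	// Plain-text detection: repeat the sample so it clears the 64-character minimum.
	text := strings.Repeat("这是一段用于演示语种探测的中文示例文本。", 5)
	lang, pos := spider.LangText(text)
	fmt.Println(lang, pos)

	// HTML detection: charset comes from the HTTP layer, listMode=true for home/list pages.
	resp, err := spider.HttpGetResp("https://www.163.com", nil, 10000)
	if err != nil || !resp.Success {
		return
	}
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
	if err != nil {
		return
	}
	doc.Find(spider.DefaultDocRemoveTags).Remove()

	res := spider.Lang(doc, resp.Charset.Charset, true)
	fmt.Println(res.Lang, res.LangPos)
}
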
/extract/link.go:
--------------------------------------------------------------------------------
1 | package extract
2 |
3 | import (
4 | "net/url"
5 | "path"
6 | "regexp"
7 | "strings"
8 | "unicode/utf8"
9 |
10 | "github.com/x-funs/go-fun"
11 | )
12 |
13 | const (
14 | LinkTypeNone LinkType = 0
15 | LinkTypeContent LinkType = 1
16 | LinkTypeList LinkType = 2
17 | LinkTypeUnknown LinkType = 3
18 |
19 | RegexUrlPublishDate = `(20[2-3]\d{1}[/]?(0[1-9]|1[0-2]|[1-9])[/]?(0[1-9]|[1-2][0-9]|3[0-1]|[1-9])?)`
20 |
21 | RegexIndexSuffix = `^/index\.(html|shtml|htm|php|asp|aspx|jsp)$`
22 |
23 | RegexTitleZhBlack = "(经营|制作|信息服务|出版|出版服务|演出|视听节目|新闻|视听|新网)许可证"
24 | )
25 |
26 | var (
27 | zhPuncs = []string{",", "。", ";", ":", "?", "!", "(", ")", "“", "”"}
28 |
29 | wordLangs = []string{"en", "ru", "ar", "de", "fr", "es", "pt"}
30 |
31 | zhEnTitles = []string{"nba", "cba", "5g", "ai", "it", "ipo"}
32 |
33 | regexUrlPublishDatePattern = regexp.MustCompile(RegexUrlPublishDate)
34 |
35 | regexZhPattern = regexp.MustCompile(`\p{Han}`)
36 |
37 | regexEnPattern = regexp.MustCompile(`[a-zA-Z]`)
38 |
39 | regexPuncPattern = regexp.MustCompile(`\pP`)
40 |
41 | regexTitleZhBlackPattern = regexp.MustCompile(RegexTitleZhBlack)
42 |
43 | regexIndexSuffixPattern = regexp.MustCompile(RegexIndexSuffix)
44 | )
45 |
46 | type LinkType int
47 |
48 | type LinkTypeRule map[string][]string
49 |
50 | type LinkRes struct {
51 | // 内容页
52 | Content map[string]string
53 | // 列表页
54 | List map[string]string
55 | // 未知链接
56 | Unknown map[string]string
57 | // 过滤链接
58 | None map[string]string
59 | }
60 |
61 | // LinkTypes 返回链接分类结果
62 | func LinkTypes(linkTitles map[string]string, lang string, rules LinkTypeRule) (*LinkRes, map[string]bool) {
63 | linkRes := &LinkRes{
64 | Content: make(map[string]string),
65 | List: make(map[string]string),
66 | Unknown: make(map[string]string),
67 | None: make(map[string]string),
68 | }
69 |
70 | subDomains := make(map[string]bool)
71 |
72 | // 统计数据
73 | var contentPublishCount int
74 | contentTopPaths := make(map[string]int)
75 |
76 | for link, title := range linkTitles {
77 | if linkUrl, err := fun.UrlParse(link); err == nil {
78 | hostname := linkUrl.Hostname()
79 | domainTop := DomainTop(hostname)
80 | if hostname != domainTop {
81 | subDomains[hostname] = true
82 | }
83 |
84 | // Automatic mode without rules
85 | if rules == nil {
86 | linkType := LinkIsContentByTitle(linkUrl, title, lang)
87 | switch linkType {
88 | case LinkTypeContent:
89 | linkRes.Content[link] = title
90 |
91 | // Count content-page URL paths that carry a publish-date feature
92 | pathDir := path.Dir(strings.TrimSpace(linkUrl.Path))
93 | pathClean := pathDirClean(pathDir)
94 | if regexUrlPublishDatePattern.MatchString(pathClean) {
95 | contentPublishCount++
96 | }
97 |
98 | // Count content-page URL top-level path segments
99 | paths := fun.SplitTrim(pathDir, fun.SLASH)
100 | if len(paths) > 0 {
101 | pathIndex := paths[0]
102 | contentTopPaths[pathIndex]++
103 | }
104 | case LinkTypeList:
105 | linkRes.List[link] = title
106 | case LinkTypeNone:
107 | linkRes.None[link] = title
108 | case LinkTypeUnknown:
109 | linkRes.Unknown[link] = title
110 | }
111 | } else {
112 | // Rule-based matching mode
113 | if LinkIsContentByRegex(linkUrl, rules) {
114 | linkRes.Content[link] = title
115 | } else {
116 | // No path or the default path; should be handled at the domain level
117 | pathDir := strings.TrimSpace(linkUrl.Path)
118 | if pathDir == "" || pathDir == fun.SLASH || regexIndexSuffixPattern.MatchString(pathDir) {
119 | linkRes.None[link] = title
120 | } else {
121 | linkRes.List[link] = title
122 | }
123 | }
124 | }
125 | }
126 | }
127 |
128 | // Classify further based on content-page URL path feature statistics
129 | if rules == nil {
130 | linkRes = linkTypePathProcess(linkRes, contentTopPaths, contentPublishCount)
131 | }
132 |
133 | // Final cleanup
134 | linkRes = linkClean(linkRes, lang)
135 |
136 | return linkRes, subDomains
137 | }
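// linkTypesExample is a hypothetical usage sketch, not part of the original
// file; the URLs and titles are assumptions. Passing nil rules selects the
// automatic, title-based classification implemented above.
func linkTypesExample() (*LinkRes, map[string]bool) {
	linkTitles := map[string]string{
		"https://news.example.com/politics/2022/09/02/12345.html": "这是一条长度超过八个字的中文新闻标题",
		"https://news.example.com/sports/":                        "体育频道",
	}
	// With lang "zh", the first link ends up in Content and the second in
	// List; the subdomain hostname is collected separately.
	return LinkTypes(linkTitles, "zh", nil)
}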
138 |
139 | func linkClean(linkRes *LinkRes, lang string) *LinkRes {
140 | if lang == "zh" {
141 | contentCount := len(linkRes.Content)
142 | if contentCount > 0 {
143 | for link, title := range linkRes.Content {
144 | if regexTitleZhBlackPattern.MatchString(title) {
145 | linkRes.None[link] = title
146 | delete(linkRes.Content, link)
147 | }
148 | }
149 | }
150 | }
151 |
152 | return linkRes
153 | }
154 |
155 | func linkTypePathProcess(linkRes *LinkRes, contentTopPaths map[string]int, contentPublishCount int) *LinkRes {
156 | // Statistics
157 | contentCount := len(linkRes.Content)
158 | listCount := len(linkRes.List)
159 | unknownCount := len(linkRes.Unknown)
160 |
161 | // Ratio of content-page URL paths with a publish-date feature
162 | publishProb := float32(contentPublishCount) / float32(contentCount)
163 |
164 | // Content-page URL path prefixes with a large share, top 2 at most
165 | topPaths := make([]string, 0)
166 | if contentCount >= 8 {
167 | for topPath, stat := range contentTopPaths {
168 | if stat > 1 {
169 | prob := float32(stat) / float32(contentCount)
170 | if prob > 0.4 {
171 | topPaths = append(topPaths, topPath)
172 | }
173 | }
174 | }
175 | }
176 |
177 | // Content-page URL paths show a strong publish-date feature ratio; reprocess List and Unknown
178 | if publishProb > 0.7 {
179 | if listCount > 0 {
180 | for link, title := range linkRes.List {
181 | linkUrl, _ := fun.UrlParse(link)
182 | pathDir := path.Dir(strings.TrimSpace(linkUrl.Path))
183 | pathClean := pathDirClean(pathDir)
184 | if regexUrlPublishDatePattern.MatchString(pathClean) {
185 | // Only add it after checking the title length
186 | titleLen := utf8.RuneCountInString(title)
187 | if titleLen >= 2 {
188 | linkRes.Content[link] = title
189 | delete(linkRes.List, link)
190 | }
191 | }
192 | }
193 | }
194 | if unknownCount > 0 {
195 | for link, title := range linkRes.Unknown {
196 | linkUrl, _ := fun.UrlParse(link)
197 | pathDir := path.Dir(strings.TrimSpace(linkUrl.Path))
198 | pathClean := pathDirClean(pathDir)
199 | if regexUrlPublishDatePattern.MatchString(pathClean) {
200 | // Only add it after checking the title length
201 | titleLen := utf8.RuneCountInString(title)
202 | if titleLen >= 2 {
203 | linkRes.Content[link] = title
204 | } else {
205 | linkRes.List[link] = title
206 | }
207 | } else {
208 | linkRes.List[link] = title
209 | }
210 | delete(linkRes.Unknown, link)
211 | }
212 | }
213 | } else if len(topPaths) > 0 && unknownCount > 0 {
214 | // Content-page URL paths share a prefix feature; reprocess Unknown
215 | for link, title := range linkRes.Unknown {
216 | linkUrl, _ := fun.UrlParse(link)
217 |
218 | pathDir := path.Dir(strings.TrimSpace(linkUrl.Path))
219 | paths := fun.SplitTrim(pathDir, fun.SLASH)
220 | if len(paths) > 0 {
221 | pathIndex := paths[0]
222 | if fun.SliceContains(topPaths, pathIndex) {
223 | // Only add it after checking the title length
224 | titleLen := utf8.RuneCountInString(title)
225 | if titleLen >= 2 {
226 | linkRes.Content[link] = title
227 | } else {
228 | linkRes.List[link] = title
229 | }
230 | } else {
231 | linkRes.List[link] = title
232 | }
233 | delete(linkRes.Unknown, link)
234 | }
235 | }
236 | }
237 |
238 | // Paths show a clear feature; clean content pages that have no path
239 | if contentCount > 0 && (publishProb > 0.7 || len(topPaths) > 0) {
240 | for link, title := range linkRes.Content {
241 | linkUrl, _ := fun.UrlParse(link)
242 | pathStr := strings.TrimSpace(linkUrl.Path)
243 | pathDir := path.Dir(pathStr)
244 | paths := fun.SplitTrim(pathDir, fun.SLASH)
245 | if pathStr == "" || pathStr == "/" || len(paths) == 0 {
246 | linkRes.Unknown[link] = title
247 | delete(linkRes.Content, link)
248 | }
249 | }
250 | }
251 |
252 | return linkRes
253 | }
254 |
255 | func LinkIsContentByRegex(linkUrl *url.URL, rules LinkTypeRule) bool {
256 | hostname := linkUrl.Hostname()
257 | domainTop := DomainTop(hostname)
258 |
259 | if _, exist := rules[hostname]; exist {
260 | for _, regex := range rules[hostname] {
261 | if fun.Matches(linkUrl.String(), regex) {
262 | return true
263 | }
264 | }
265 | } else if _, exist := rules[domainTop]; exist {
266 | for _, regex := range rules[domainTop] {
267 | if fun.Matches(linkUrl.String(), regex) {
268 | return true
269 | }
270 | }
271 | }
272 |
273 | return false
274 | }
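// linkRuleExample is a hypothetical usage sketch, not part of the original
// file; the domain and pattern are assumptions. Rules are keyed by hostname or
// by top-level (registrable) domain and matched against the full URL.
func linkRuleExample() bool {
	rules := LinkTypeRule{
		"example.com": {`/news/\d+\.html$`},
	}
	u, _ := url.Parse("https://www.example.com/news/12345.html")
	// No rule exists for the exact hostname, so matching falls through to the
	// top-level domain entry.
	return LinkIsContentByRegex(u, rules)
}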
275 |
276 | func LinkIsContentByTitle(linkUrl *url.URL, title string, lang string) LinkType {
277 | link := linkUrl.String()
278 |
279 | if utf8.RuneCountInString(link) > 255 {
280 | return LinkTypeNone
281 | }
282 |
283 | // No path or the default path; should be handled at the domain level
284 | pathDir := strings.TrimSpace(linkUrl.Path)
285 | if pathDir == "" || pathDir == fun.SLASH || regexIndexSuffixPattern.MatchString(pathDir) {
286 | return LinkTypeNone
287 | }
288 |
289 | if lang == "zh" {
290 | // Chinese
291 | zhs := regexZhPattern.FindAllString(title, -1)
292 | hanCount := len(zhs)
293 |
294 | // Must contain Chinese characters to be a possible content page
295 | if hanCount > 0 {
296 | // Content-page titles have more than 5 Chinese characters
297 | if hanCount > 5 {
298 |
299 | // Remove spaces
300 | title = strings.ReplaceAll(title, fun.SPACE, "")
301 | titleLen := utf8.RuneCountInString(title)
302 |
303 | // A length >= 8 is judged to be a content-page URL
304 | if titleLen >= 8 {
305 | return LinkTypeContent
306 | } else if titleLen < 8 {
307 |
308 | // For Chinese, check whether it contains common punctuation
309 | if lang == "zh" {
310 | if fun.ContainsAny(title, zhPuncs...) {
311 | return LinkTypeContent
312 | }
313 | }
314 | return LinkTypeUnknown
315 | }
316 | } else {
317 | return LinkTypeList
318 | }
319 | } else {
320 | // No Chinese; do a simple match against the English title dictionary
321 | if fun.SliceContains(zhEnTitles, strings.ToLower(title)) {
322 | return LinkTypeList
323 | }
324 |
325 | return LinkTypeNone
326 | }
327 |
328 | } else if fun.SliceContains(wordLangs, lang) {
329 | // 英语等单词类的语种
330 | // 去掉所有标点
331 | title = regexPuncPattern.ReplaceAllString(title, "")
332 |
333 | ens := regexEnPattern.FindAllString(title, -1)
334 | enCount := len(ens)
335 |
336 | // Must contain Latin letters
337 | if enCount > 0 {
338 | // Split on spaces to count words
339 | words := fun.SplitTrim(title, fun.SPACE)
340 |
341 | // At least 5 words
342 | if len(words) >= 5 {
343 | return LinkTypeContent
344 | } else {
345 | return LinkTypeList
346 | }
347 | } else {
348 | return LinkTypeNone
349 | }
350 | } else {
351 | // Other languages: strip punctuation and measure the length
352 | title = regexPuncPattern.ReplaceAllString(title, "")
353 |
354 | titleLen := utf8.RuneCountInString(title)
355 | if titleLen >= 8 {
356 | return LinkTypeContent
357 | } else if titleLen < 8 {
358 | // TODO other rules
359 | return LinkTypeList
360 | }
361 | }
362 |
363 | return LinkTypeNone
364 | }
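// Hypothetical illustration, not part of the original file; the URL and the
// titles are assumptions. It sketches the title-length heuristics above:
//
//	u, _ := url.Parse("https://example.com/politics/12345.html")
//	LinkIsContentByTitle(u, "这是一条超过八个汉字的新闻标题", "zh") // LinkTypeContent
//	LinkIsContentByTitle(u, "体育频道", "zh")                     // LinkTypeList
//	LinkIsContentByTitle(u, "Short title here", "en")             // LinkTypeList (fewer than 5 words)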
365 |
366 | func pathDirClean(pathDir string) string {
367 | pathClean := strings.ReplaceAll(pathDir, fun.DOT, "")
368 | pathClean = strings.ReplaceAll(pathClean, fun.DASH, "")
369 | pathClean = strings.ReplaceAll(pathClean, fun.UNDERSCORE, "")
370 |
371 | return pathClean
372 | }
373 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/extract/content.go:
--------------------------------------------------------------------------------
1 | // Package extract extracts news elements (title, publish time, content), with extensive optimizations on top of the CEPF algorithm
2 | // Refer to: 基于标签路径特征融合新闻内容抽取的 CEPF 算法 (吴共庆等) http://www.jos.org.cn/jos/article/abstract/4868
3 | package extract
4 |
5 | import (
6 | "bytes"
7 | "log"
8 | "math"
9 | "path"
10 | "regexp"
11 | "strings"
12 | "unicode/utf8"
13 |
14 | "github.com/PuerkitoBio/goquery"
15 | "github.com/x-funs/go-fun"
16 | "golang.org/x/net/html"
17 | )
18 |
19 | const (
20 | ContentRemoveTags = "script,noscript,style,iframe,br,link,svg,textarea"
21 |
22 | // RegexPublishDate is the complete publish-datetime pattern
23 | RegexPublishDate = "(((20[1-3]\\d{1})[-/年.])(0[1-9]|1[0-2]|[1-9])[-/月.](0[1-9]|[1-2][0-9]|3[0-1]|[1-9])[日Tt]?[ ]{0,3}(([0-9]|[0-1][0-9]|2[0-3]|[1-9])[:点时]([0-5][0-9]|[0-9])[:分]?(([0-5][0-9]|[0-9])[秒]?)?((\\.\\d{3})?)(z|Z|[\\+-]\\d{2}[:]?\\d{2})?)?)"
24 |
25 | // RegexPublishShortDate is the publish-datetime pattern with an abbreviated year, e.g. 22-09-02 11:11:11
26 | RegexPublishShortDate = "(((20[1-3]\\d{1}|[1-3]\\d{1})[-/年.])(0[1-9]|1[0-2]|[1-9])[-/月.](0[1-9]|[1-2][0-9]|3[0-1]|[1-9])[日Tt]?[ ]{0,3}(([0-9]|[0-1][0-9]|2[0-3]|[1-9])[:点时]([0-5][0-9]|[0-9])[:分]?(([0-5][0-9]|[0-9])[秒]?)?((\\.\\d{3})?)(z|Z|[\\+-]\\d{2}[:]?\\d{2})?)?)"
27 |
28 | // RegexPublishDateNoYear is the publish-date pattern without a year (lower priority), e.g. 09-02
29 | RegexPublishDateNoYear = "((0[1-9]|1[0-2]|[1-9])[-/月.](0[1-9]|[1-2][0-9]|3[0-1]|[1-9])[日Tt]?[ ]{0,3}(([0-9]|[0-1][0-9]|2[0-3]|[1-9])[:点时]([0-5][0-9]|[0-9])[:分]?(([0-5][0-9]|[0-9])[秒]?)?)?)"
30 |
31 | // RegexEnPublishDate1 is English date format 1, e.g. 02 Sep 2022 11:40:53 pm
32 | RegexEnPublishDate1 = "(?i)((?:(0[1-9]|[1-2][0-9]|3[0-1]|[1-9])(?:st|nd|rd|th)?)[, ]{0,4}(january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|may|jun|jul|aug|sept?|oct|nov|dec)[, ]{0,4}(20[1-3]\\d{1})([, ]{0,4}([0-9]|[0-1][0-9]|2[0-3]|[1-9])[:]([0-5][0-9]|[0-9])([:]([0-5][0-9]|[0-9]))?([, ]{0,4}(am|pm))?)?)"
33 |
34 | // RegexEnPublishDate2 is English date format 2, e.g. Sep 02 2022 11:40:53 pm
35 | RegexEnPublishDate2 = "(?i)((january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|may|jun|jul|aug|sept?|oct|nov|dec)[, ]{0,4}(?:(0[1-9]|[1-2][0-9]|3[0-1]|[1-9])(?:st|nd|rd|th)?)[, ]{0,4}(20[1-3]\\d{1})([, ]{0,4}([0-9]|[0-1][0-9]|2[0-3]|[1-9])[:]([0-5][0-9]|[0-9])([:]([0-5][0-9]|[0-9]))?([, ]{0,4}(am|pm))?)?)"
36 |
37 | // RegexEnUsPublishDate is US-style English date format 3, e.g. 8/30/2022 11:11:11
38 | RegexEnUsPublishDate = "((0[1-9]|1[0-2]|[1-9])[-/.](0[1-9]|[1-2][0-9]|3[0-1]|[1-9])[-/.](20[1-3]\\d{1}|[1-3]\\d{1})[ ]{0,3}(([0-9]|[0-1][0-9]|2[0-3]|[1-9])[:]([0-5][0-9]|[0-9])[:]?(([0-5][0-9]|[0-9]))?)?)"
39 |
40 | // RegexTime is the time-of-day-only pattern
41 | RegexTime = "([0-9]|[0-1][0-9]|2[0-3]|[1-9])[:点时]([0-5][0-9]|[0-9])[:分]?(([0-5][0-9]|[0-9])[秒]?)?"
42 |
43 | // RegexZhPublishPrefix is the Chinese publish-time prefix pattern
44 | RegexZhPublishPrefix = "(?i)(发布|创建|出版|发表|编辑)?(时间|日期)"
45 |
46 | // RegexZhPublishDate is the fixed Chinese form, e.g. "发布时间: xxx" (publish time: xxx)
47 | RegexZhPublishDate = RegexZhPublishPrefix + "[\\pP ]{1,8}" + RegexPublishShortDate
48 |
49 | // RegexScriptTitle matches a title inside a script block
50 | RegexScriptTitle = `(?i)"title"[\t ]{0,4}:[\t ]{0,4}"(.*)"`
51 |
52 | // RegexScriptTime matches a publish time inside a script block
53 | RegexScriptTime = `(?i)"[\w_\-]*pub.*"[\t ]{0,4}:[\t ]{0,4}"(((20[1-3]\d{1})[-/年.])(0[1-9]|1[0-2]|[1-9])[-/月.](0[1-9]|[1-2][0-9]|3[0-1]|[1-9])[日Tt]?[ ]{0,3}(([0-9]|[0-1][0-9]|2[0-3]|[1-9])[:点时]([0-5][0-9]|[0-9])[:分]?(([0-5][0-9]|[0-9])[秒]?)?((\.\d{3})?)(z|Z|[\+-]\d{2}[:]?\d{2})?))"`
54 |
55 | // RegexWxScriptTime matches a publish time inside a WeChat script block
56 | RegexWxScriptTime = `(?i)ct[\t ]{0,4}=[\t ]{0,4}"(1[2-9]\d{8})"`
57 |
58 | // RegexContentUrlPublishDate matches a date hidden in a content-page URL; it must be a complete, standard date such as 20221003
59 | RegexContentUrlPublishDate = `(20[2-3]\d{1}[/]?(0[1-9]|1[0-2])[/]?(0[1-9]|[1-2][0-9]|3[0-1]))`
60 |
61 | // RegexFormatTime3 is a malformed time format, used for filtering
62 | RegexFormatTime3 = `[:分]\d{3}$`
63 |
64 | // RegexFormatTime4 is a malformed time format, used for filtering
65 | RegexFormatTime4 = `[:分]\d{4}$`
66 |
67 | // RegexZone matches a trailing time-zone offset, used for filtering
68 | RegexZone = `(([\+-]\d{2})[:]?\d{2})$`
69 |
70 | // TitleSimZh is the title-similarity threshold for Chinese
71 | TitleSimZh = 0.3
72 |
73 | // TitleSimWord is the title-similarity threshold for word-based languages
74 | TitleSimWord = 0.5
75 | )
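// Hypothetical illustration, not part of the original file; the sample strings
// are assumptions. It sketches what the main datetime patterns are expected to
// match:
//
//	regexp.MustCompile(RegexPublishDate).FindString("发布时间: 2022年9月2日 11:40:53")
//	// -> "2022年9月2日 11:40:53"
//	regexp.MustCompile(RegexEnPublishDate1).FindString("Posted on 02 Sep 2022 11:40 pm")
//	// -> "02 Sep 2022 11:40 pm"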
76 |
77 | var (
78 | contentMetaTitleSelectors = []string{
79 | "meta[property='og:title' i]",
80 | "meta[property='twitter:title' i]",
81 | "meta[name='twitter:title' i]",
82 | }
83 |
84 | contentMetaDatetimeDicts = []string{"publish", "pubdate", "pubtime", "release", "dctermsdate"}
85 |
86 | regexPublishDatePattern = regexp.MustCompile(RegexPublishDate)
87 |
88 | regexPublishShortDatePattern = regexp.MustCompile(RegexPublishShortDate)
89 |
90 | regexPublishDateNoYearPattern = regexp.MustCompile(RegexPublishDateNoYear)
91 |
92 | regexZhPublishDatePattern = regexp.MustCompile(RegexZhPublishDate)
93 |
94 | regexEnPublishDatePattern1 = regexp.MustCompile(RegexEnPublishDate1)
95 |
96 | regexEnPublishDatePattern2 = regexp.MustCompile(RegexEnPublishDate2)
97 |
98 | regexEnUsPublishDatePattern = regexp.MustCompile(RegexEnUsPublishDate)
99 |
100 | regexTimePattern = regexp.MustCompile(RegexTime)
101 |
102 | regexScriptTitlePattern = regexp.MustCompile(RegexScriptTitle)
103 |
104 | regexScriptTimePattern = regexp.MustCompile(RegexScriptTime)
105 |
106 | regexWxScriptTimePattern = regexp.MustCompile(RegexWxScriptTime)
107 |
108 | regexContentUrlPublishDatePattern = regexp.MustCompile(RegexContentUrlPublishDate)
109 |
110 | regexFormatTime3 = regexp.MustCompile(RegexFormatTime3)
111 |
112 | regexFormatTime4 = regexp.MustCompile(RegexFormatTime4)
113 |
114 | regexZonePattern = regexp.MustCompile(RegexZone)
115 | )
116 |
117 | type News struct {
118 | // Title
119 | Title string
120 | // How the title was extracted
121 | TitlePos string
122 | // Publish time (local)
123 | TimeLocal string
124 | // Raw time as found
125 | Time string
126 | // How the publish time was extracted
127 | TimePos string
128 | // Plain-text content
129 | Content string
130 | // Content node
131 | ContentNode *html.Node
132 | // Extraction time spent (milliseconds)
133 | Spend int64
134 | // Language
135 | Lang string
136 | }
137 |
138 | type Content struct {
139 | // Original Doc
140 | OriginDoc *goquery.Document
141 | // Doc
142 | Doc *goquery.Document
143 | // Original title, from the parent page
144 | OriginTitle string
145 | // Original URL, from the parent page
146 | OriginUrl string
147 | // Language
148 | Lang string
149 |
150 | infoMap map[*html.Node]countInfo
151 | bodyNode *html.Node
152 | title string
153 | titlePos string
154 | titleSim float64
155 | timePos string
156 | timeEnFormat bool
157 | }
158 |
159 | type countInfo struct {
160 | // Text length, e.g. the length of a tag's text
161 | TextCount int
162 | // Length of text inside links, e.g. the text of <a> tags
163 | LinkTextCount int
164 | // Tag count
165 | TagCount int
166 | // Count of tags containing links
167 | LinkTagCount int
168 | // Density
169 | Density float64
170 | // Density sum
171 | DensitySum float64
172 | // <p> tag count
173 | PCount int
174 | // Leaf list
175 | LeafList []int
176 | }
177 |
178 | func NewContent(docOrg *goquery.Document, lang string, originTitle string, originUrl string) *Content {
179 | originDoc := goquery.CloneDocument(docOrg)
180 | doc := goquery.CloneDocument(docOrg)
181 | doc.Find(ContentRemoveTags).Remove()
182 |
183 | // Determine the title-similarity threshold
184 | titleSim := TitleSimZh
185 | if fun.SliceContains(wordLangs, lang) {
186 | titleSim = TitleSimWord
187 | }
188 |
189 | infoMap := make(map[*html.Node]countInfo, 0)
190 |
191 | return &Content{OriginDoc: originDoc, Doc: doc, OriginTitle: originTitle, OriginUrl: originUrl, Lang: lang, infoMap: infoMap, titleSim: titleSim}
192 | }
193 |
194 | func (c *Content) ExtractNews() *News {
195 | news := &News{}
196 |
197 | // Start time
198 | begin := fun.Timestamp(true)
199 |
200 | // Extract the content node and content text
201 | contentNode := c.getContentNode()
202 | if contentNode != nil {
203 | news.ContentNode = contentNode
204 |
205 | content := c.formatContent(contentNode)
206 | news.Content = content
207 | }
208 |
209 | // Extract the title
210 | title := c.getTitle(contentNode)
211 | news.Title = title
212 | news.TitlePos = c.titlePos
213 | c.title = title
214 |
215 | // Extract the publish time
216 | time := c.getTime()
217 | if time != "" {
218 | // Format the time
219 | news.Time = time
220 | news.TimePos = c.timePos
221 | time = c.formatTime(time)
222 | ts := fun.StrToTime(time)
223 | if ts > 0 {
224 | news.TimeLocal = fun.Date(ts)
225 | }
226 | }
227 |
228 | news.Spend = fun.Timestamp(true) - begin
229 | news.Lang = c.Lang
230 |
231 | return news
232 | }
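// extractNewsExample is a hypothetical usage sketch, not part of the original
// file; the htmlStr argument and the "zh" language hint are assumptions.
func extractNewsExample(htmlStr string) (*News, error) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlStr))
	if err != nil {
		return nil, err
	}
	c := NewContent(doc, "zh", "", "")
	// The returned News carries the title, publish time, plain-text content
	// and the extraction positions recorded above.
	return c.ExtractNews(), nil
}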
233 |
234 | // formatTime cleans and normalizes the time string (as best it can)
235 | func (c *Content) formatTime(time string) string {
236 | if !c.timeEnFormat {
237 | // When time-zone info is present, strip spaces
238 | if fun.ContainsAny(time, "T", "t", "Z", "z") {
239 | time = strings.ReplaceAll(time, " ", "")
240 | }
241 | // When a T separator is present but there is no offset, treat it as local time
242 | if fun.Contains(time, "T") && !fun.ContainsCase(time, "z") {
243 | if !regexZonePattern.MatchString(time) {
244 | time = strings.ReplaceAll(time, "T", " ")
245 | }
246 | }
247 | }
248 |
249 | // Handle a malformed trailing character
250 | if fun.Contains(time, ":") && !fun.ContainsAny(time, "时", "点") {
251 | time = strings.TrimSuffix(time, "分")
252 | }
253 | return time
254 | }
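// Hypothetical illustration, not part of the original file; the inputs are
// assumptions and c is a Content whose time did not come from an English meta
// pattern (timeEnFormat is false):
//
//	c.formatTime("2022-09-02T11:40:53")       // "2022-09-02 11:40:53" (T without an offset is treated as local time)
//	c.formatTime("2022-09-02T11:40:53+08:00") // unchanged, the offset is kept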
255 |
256 | // formatContent formats the content text: it turns <br> tags into line breaks, then collapses repeated newlines and spaces into single ones
257 | func (c *Content) formatContent(contentNode *html.Node) string {
258 | // First extract the HTML
259 | node := goquery.NewDocumentFromNode(contentNode)
260 | contentHtml, _ := node.Html()
261 |
262 | // Append a newline \n after each <br/> tag
263 | contentHtml = strings.ReplaceAll(contentHtml, "<br/>", "<br/>\n")
264 | n, _ := goquery.NewDocumentFromReader(strings.NewReader(contentHtml))
265 | str := n.Text()
266 |
267 | // Finally, merge redundant newlines
268 | lines := fun.SplitTrim(str, fun.LF)
269 | if len(lines) > 0 {
270 | for i, line := range lines {
271 | lines[i] = fun.NormaliseSpace(line)
272 | }
273 | str = strings.Join(lines, fun.LF)
274 | } else {
275 | str = fun.NormaliseSpace(str)
276 | }
277 |
278 | return str
279 | }
280 |
281 | func (c *Content) getContentNode() *html.Node {
282 | var maxScore float64
283 | var contentNode *html.Node
284 |
285 | // Take the first body tag
286 | bodyNodes := c.Doc.Find("body").Nodes
287 | if len(bodyNodes) > 0 {
288 | bodyNode := bodyNodes[0]
289 | c.bodyNode = bodyNode
290 |
291 | // Recursively walk and compute statistics, then pick the node with the highest score
292 | c.computeInfo(c.bodyNode)
293 |
294 | for node := range c.infoMap {
295 | if node.Data == "a" || node == bodyNode {
296 | continue
297 | }
298 |
299 | score := c.computeScore(node)
300 | if score > maxScore {
301 | maxScore = score
302 | contentNode = node
303 | }
304 | }
305 | }
306 |
307 | return contentNode
308 | }
309 |
310 | func (c *Content) getTime() string {
311 | // meta
312 | regexZhPatterns := []*regexp.Regexp{
313 | regexPublishDatePattern,
314 | }
315 | metaZhTime := c.getTimeByMeta(regexZhPatterns)
316 | if metaZhTime != "" {
317 | c.timePos = "meta"
318 | return metaZhTime
319 | }
320 |
321 | // meta En
322 | if c.Lang != "zh" {
323 | regexEnPatterns := []*regexp.Regexp{
324 | regexEnPublishDatePattern1,
325 | regexEnPublishDatePattern2,
326 | }
327 | metaEnTime := c.getTimeByMetaEn(regexEnPatterns)
328 | if metaEnTime != "" {
329 | c.timePos = "meta"
330 | c.timeEnFormat = true
331 | return metaEnTime
332 | }
333 | }
334 |
335 | //