├── charset_test.go
├── extract
│   ├── content_test.go
│   ├── link_test.go
│   ├── meta_test.go
│   ├── icp_test.go
│   ├── domain_test.go
│   ├── domain.go
│   ├── web_test.go
│   ├── icp.go
│   ├── meta.go
│   ├── web.go
│   ├── link.go
│   └── content.go
├── .gitignore
├── banner.txt
├── go.mod
├── spider_news_test.go
├── http_test.go
├── http.go
├── lang_test.go
├── charset.go
├── go.sum
├── README.md
├── detect_test.go
├── spider.go
├── spider_news.go
├── detect.go
├── spider_test.go
├── lang.go
└── LICENSE
/charset_test.go:
--------------------------------------------------------------------------------
1 | package spider
2 |
--------------------------------------------------------------------------------
/extract/content_test.go:
--------------------------------------------------------------------------------
1 | package extract
2 |
--------------------------------------------------------------------------------
/extract/link_test.go:
--------------------------------------------------------------------------------
1 | package extract
2 |
3 | import (
4 | "fmt"
5 | "regexp"
6 | "testing"
7 | )
8 |
9 | func TestMatch(t *testing.T) {
10 | m := regexp.MustCompile(`\p{Han}`)
11 | allString := m.FindAllString("123你好,世界asdf", -1)
12 | fmt.Println(allString)
13 | }
14 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Binaries for programs and plugins
2 | *.exe
3 | *.exe~
4 | *.dll
5 | *.so
6 | *.dylib
7 |
8 | # Test binary, built with `go test -c`
9 | *.test
10 |
11 | # Output of the go coverage tool, specifically when used with LiteIDE
12 | *.out
13 |
14 | # Dependency directories (remove the comment below to include it)
15 | # vendor/
16 |
17 | .idea
18 | .vscode
19 | .setting
--------------------------------------------------------------------------------
/extract/meta_test.go:
--------------------------------------------------------------------------------
1 | package extract
2 |
3 | import "testing"
4 |
5 | func TestHostMeta(t *testing.T) {
6 | hosts := []string{
7 | "matichon.co.th",
8 | "wanbao.com.sg",
9 | "wanbao.com.sg",
10 | "waou.com.mo",
11 | "archives.gov.mo",
12 | "mfa.gov.sg",
13 | "nasa.gov",
14 | }
15 |
16 | for _, host := range hosts {
17 | t.Log(MetaFromHost(host, ""))
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/extract/icp_test.go:
--------------------------------------------------------------------------------
1 | package extract
2 |
3 | import "testing"
4 |
5 | func TestIcpFromText(t *testing.T) {
6 | texts := []string{
7 | "粤ICP备17055554号",
8 | "粤ICP备17055554-34号",
9 | "沪ICP备05018492",
10 | "粤B2-20090059",
11 | "京公网安备31010402001073号",
12 | "京公网安备-31010-4020010-73号",
13 | "鲁ICP备05002386鲁公网安备37070502000027号",
14 | }
15 |
16 | for _, text := range texts {
17 | icp, loc := IcpFromText(text)
18 | t.Log(icp, loc)
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/banner.txt:
--------------------------------------------------------------------------------
1 | __ _ __
2 | ____ _____ ____ / /______ _ _________ (_)___/ /__ _____
3 | / __ `/ __ \______/ __ \/ //_/ __ `/_____/ ___/ __ \/ / __ / _ \/ ___/
4 | / /_/ / /_/ /_____/ /_/ / ,< / /_/ /_____(__ ) /_/ / / /_/ / __/ /
5 | \__, /\____/ / .___/_/|_|\__, / /____/ .___/_/\__,_/\___/_/
6 | /____/ /_/ /____/ /_/
7 |
8 |
9 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/suosi-inc/go-pkg-spider
2 |
3 | go 1.18
4 |
5 | require (
6 | github.com/PuerkitoBio/goquery v1.8.1
7 | github.com/microcosm-cc/bluemonday v1.0.26
8 | github.com/suosi-inc/chardet v0.1.0
9 | github.com/suosi-inc/lingua-go v1.0.51
10 | github.com/x-funs/go-fun v0.94.0
11 | golang.org/x/net v0.19.0
12 | )
13 |
14 | require (
15 | github.com/andybalholm/cascadia v1.3.2 // indirect
16 | github.com/aymerick/douceur v0.2.0 // indirect
17 | github.com/gorilla/css v1.0.1 // indirect
18 | github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect
19 | golang.org/x/text v0.17.0 // indirect
20 | )
21 |
--------------------------------------------------------------------------------
/extract/domain_test.go:
--------------------------------------------------------------------------------
1 | package extract
2 |
3 | import (
4 | "fmt"
5 | "testing"
6 | )
7 |
8 | func TestDomainParse(t *testing.T) {
9 | domains := []string{
10 | "www.net.cn",
11 | "hi.chinanews.com",
12 | "a.wh.cn",
13 | "siat.ac.cn",
14 | "abc.spring.io",
15 | "abc.spring.ai",
16 | "www.china-embassy.or.jp",
17 | "whszdj.wh.cn",
18 | "gk.wh.cn",
19 | "xwxc.mwr.cn",
20 | "legismac.safp.gov.mo",
21 | "dezhou.rcsd.cn",
22 | "www.gov.cn",
23 | "scopsr.gov.cn",
24 | "usa.gov",
25 | "bbc.co.uk",
26 | "dealer.auto.sohu.com",
27 | "bbs.sohu.com",
28 | }
29 |
30 | for _, domain := range domains {
31 | t.Log(DomainParse(domain))
32 | }
33 | }
34 |
35 | func TestDomainTop(t *testing.T) {
36 | domains := []string{
37 | "www.net.cn",
38 | "hi.chinanews.com",
39 | "a.wh.cn",
40 | "siat.ac.cn",
41 | "abc.spring.io",
42 | "abc.spring.ai",
43 | "www.china-embassy.or.jp",
44 | "whszdj.wh.cn",
45 | "gk.wh.cn",
46 | "xwxc.mwr.cn",
47 | "legismac.safp.gov.mo",
48 | "dezhou.rcsd.cn",
49 | "www.gov.cn",
50 | "scopsr.gov.cn",
51 | "usa.gov",
52 | "bbc.co.uk",
53 | }
54 |
55 | for _, domain := range domains {
56 | t.Log(DomainTop(domain))
57 | }
58 | }
59 |
60 | func TestDomainTopFromUrl(t *testing.T) {
61 | fmt.Println(DomainTopFromUrl("https://www.google.com"))
62 | fmt.Println(DomainTopFromUrl("https://www.baidu.com/news"))
63 | fmt.Println(DomainTopFromUrl("http://szb.xnnews.com.cn/zhzx/202207/t20220722_2731400.htm"))
64 | }
65 |
--------------------------------------------------------------------------------
/extract/domain.go:
--------------------------------------------------------------------------------
1 | package extract
2 |
3 | import (
4 | "errors"
5 | "strings"
6 |
7 | "github.com/x-funs/go-fun"
8 | "golang.org/x/net/publicsuffix"
9 | )
10 |
11 | type Domain struct {
12 | Subdomain, Domain, TLD string
13 | ICANN bool
14 | }
15 |
16 | // DomainTop returns the registrable top-level domain (eTLD+1)
17 | func DomainTop(d string) string {
18 | if d, err := DomainParse(d); err == nil {
19 | return d.Domain + fun.DOT + d.TLD
20 | }
21 |
22 | return ""
23 | }
24 |
25 | // DomainTopFromUrl parses a URL and returns its registrable top-level domain
26 | func DomainTopFromUrl(urlStr string) string {
27 | if d, err := DomainParseFromUrl(urlStr); err == nil {
28 | return d.Domain + "." + d.TLD
29 | }
30 |
31 | return ""
32 | }
33 |
34 | // DomainParse parses a domain name and returns a Domain
35 | func DomainParse(domain string) (*Domain, error) {
36 | if fun.Blank(domain) {
37 | return nil, errors.New("domain is blank")
38 | }
39 |
40 | // etld+1
41 | etld1, err := publicsuffix.EffectiveTLDPlusOne(domain)
42 | _, icann := publicsuffix.PublicSuffix(strings.ToLower(domain))
43 | if err != nil {
44 | return nil, err
45 | }
46 |
47 | // convert to domain name, and tld
48 | i := strings.Index(etld1, fun.DOT)
49 | domName := etld1[0:i]
50 | tld := etld1[i+1:]
51 |
52 | // and subdomain
53 | sub := ""
54 | if rest := strings.TrimSuffix(domain, "."+etld1); rest != domain {
55 | sub = rest
56 | }
57 | return &Domain{
58 | Subdomain: sub,
59 | Domain: domName,
60 | TLD: tld,
61 | ICANN: icann,
62 | }, nil
63 | }
64 |
65 | // DomainParseFromUrl parses the host from a URL and returns a Domain
66 | func DomainParseFromUrl(urlStr string) (*Domain, error) {
67 | u, err := fun.UrlParse(urlStr)
68 | if err != nil {
69 | return nil, err
70 | }
71 |
72 | d := u.Hostname()
73 |
74 | return DomainParse(d)
75 | }
76 |
--------------------------------------------------------------------------------
/extract/web_test.go:
--------------------------------------------------------------------------------
1 | package extract
2 |
3 | import (
4 | "fmt"
5 | "net/url"
6 | "path"
7 | "testing"
8 | "unicode/utf8"
9 |
10 | "github.com/x-funs/go-fun"
11 | )
12 |
13 | func TestTitleClean(t *testing.T) {
14 | strs := map[string]string{
15 | "“暴徒试图杀死他!”阿拉木图市长在1月5日的暗杀企图中幸存_网易订阅": "zh",
16 | "“暴徒试图杀死他!”阿拉木图市长在1月5日的暗杀企图中幸存 - 网易订阅": "zh",
17 | "北极圈内最高温达到38℃ 北极熊还好吗?南极情况怎么样?_科技频道_中国青年网": "zh",
18 | "About the Project on Nuclear Issues | Center for Strategic and International Studies": "en",
19 | }
20 |
21 | for str, l := range strs {
22 | t.Log(WebTitleClean(str, l))
23 | }
24 | }
25 |
26 | func TestUrlQuery(t *testing.T) {
27 | // urlStr := "https://people.com/tag/stories-to-make-you-smile/a/b/abc.html?a=1&b=2&c=3#ddd"
28 | urlStr := "https://vipmail.163.com/index.html?abc=123"
29 | u, err := url.Parse(urlStr)
30 |
31 | fmt.Println(err)
32 | fmt.Println(u.Path)
33 | fmt.Println(u.RawQuery)
34 | fmt.Println(path.Dir(u.Path))
35 | // fmt.Println(path.Base(u.Path))
36 |
37 | fmt.Println(utf8.RuneCountInString("https://adx.36kr.com/api/ad/click?sign=2eda7665240cec93f902311eb10c195a&param.redirectUrl=aHR0cHM6Ly8zNmtyLmNvbS9wLzE4NTM5NTQ2NzgxMzIzNTI&param.adsdk=Phid2i9VOob6U23ybkDx8q7cr1KbBDM4oiu1d_-C6gY5qf5SKxqBPsptEVMy_wtzqB5Yr08U7ioREUL7HLxIrQ"))
38 | }
39 |
40 | func TestFilterUrl(t *testing.T) {
41 | urlStr := "http://www.163.com/a/b/"
42 | baseUrl, _ := fun.UrlParse(urlStr)
43 |
44 | t.Log(filterUrl("./c/123.html", baseUrl, true))
45 | t.Log(filterUrl("../c/123.html", baseUrl, true))
46 | t.Log(filterUrl("/c/123.html", baseUrl, true))
47 | t.Log(filterUrl("//www.163.com/c/123.html", baseUrl, true))
48 | t.Log(filterUrl("//www.163.com/c/123.pdf?abc=1123", baseUrl, true))
49 | }
50 |
51 | func BenchmarkFilterUrl(b *testing.B) {
52 | urlStr := "http://www.163.com/a/b/"
53 | baseUrl, _ := fun.UrlParse(urlStr)
54 |
55 | b.ResetTimer()
56 |
57 | for i := 0; i < b.N; i++ {
58 | filterUrl("https://www.163.com/news/article/HEAJM4F1000189FH.html", baseUrl, true)
59 |
60 | // url.Parse("https://www.163.com/news/article/HEAJM4F1000189FH.html")
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/spider_news_test.go:
--------------------------------------------------------------------------------
1 | package spider
2 |
3 | import (
4 | "crypto/tls"
5 | "fmt"
6 | "net/http"
7 | "net/url"
8 | "testing"
9 |
10 | "github.com/x-funs/go-fun"
11 | )
12 |
13 | var (
14 | newUrl = "http://www.cankaoxiaoxi.com/"
15 | newUrl_domain = "cankaoxiaoxi.com"
16 | overseaUrl = "https://www.bbc.com/news"
17 | )
18 |
19 | func TestNews_GetLinkRes_Noctx(t *testing.T) {
20 | n := NewNewsSpider(newUrl, 2, processLink, nil, WithRetryTime(1), WithTimeOut(10000))
21 | n.GetLinkRes()
22 | }
23 |
24 | func TestNews_GetLinkRes(t *testing.T) {
25 | ctx := "getLinkRes"
26 | n := NewNewsSpider(newUrl, 2, processLink, ctx, WithRetryTime(1), WithTimeOut(10000))
27 | n.RetryTime = 1
28 | n.Depth = 2
29 | n.GetLinkRes()
30 | }
31 |
32 | func TestNews_GetLinkRes_Clone(t *testing.T) {
33 | ctx := "getLinkRes"
34 | n := NewNewsSpider(newUrl, 2, processLink, ctx)
35 |
36 | nc := n.Clone().(*NewsSpider)
37 | nc.Ctx = "getLinkRes_Clone"
38 | nc.GetLinkRes()
39 | }
40 |
41 | func processLink(data ...any) {
42 | newsData := data[0].(*NewsData)
43 |
44 | if newsData.Error == nil {
45 | fmt.Println(newsData.ListUrl)
46 | fmt.Println(newsData.Depth)
47 | for i := range newsData.LinkRes.List {
48 | fmt.Println(data[1], i)
49 | }
50 | }
51 | }
52 |
53 | func TestNews_GetContentNews(t *testing.T) {
54 | ctx := "getContentNews"
55 | n := NewNewsSpider(newUrl, 1, processContent, ctx)
56 | n.GetContentNews()
57 | }
58 |
59 | func processContent(data ...any) {
60 | dd := data[0].(*NewsContent)
61 | fmt.Println(data[1], dd.Title, dd.Lang)
62 | }
63 |
64 | func TestNews_GetNewsWithProxy(t *testing.T) {
65 | transport := &http.Transport{
66 | TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
67 | DisableKeepAlives: true,
68 | }
69 | proxyString := "http://username:password@host:port"
70 | proxy, _ := url.Parse(proxyString)
71 | transport.Proxy = http.ProxyURL(proxy)
72 |
73 | req := &HttpReq{
74 | HttpReq: &fun.HttpReq{
75 | MaxContentLength: HttpDefaultMaxContentLength,
76 | MaxRedirect: 2,
77 | Transport: transport,
78 | Headers: map[string]string{
79 | "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
80 | },
81 | },
82 | ForceTextContentType: true,
83 | }
84 |
85 | ctx := "getNewsWithProxy"
86 | n := NewNewsSpider(overseaUrl, 1, processContent, ctx, WithReq(req))
87 | n.GetContentNews()
88 | }
89 |
--------------------------------------------------------------------------------
/extract/icp.go:
--------------------------------------------------------------------------------
1 | package extract
2 |
3 | import (
4 | "regexp"
5 | "strings"
6 |
7 | "github.com/PuerkitoBio/goquery"
8 | "github.com/x-funs/go-fun"
9 | )
10 |
11 | var (
12 | ProvinceShortMap = map[string]string{
13 | "京": "北京",
14 | "津": "天津",
15 | "沪": "上海",
16 | "渝": "重庆",
17 | "黑": "黑龙江",
18 | "吉": "吉林",
19 | "辽": "辽宁",
20 | "冀": "河北",
21 | "豫": "河南",
22 | "鲁": "山东",
23 | "晋": "山西",
24 | "陕": "陕西",
25 | "秦": "陕西",
26 | "蒙": "内蒙古",
27 | "宁": "宁夏",
28 | "陇": "甘肃",
29 | "甘": "甘肃",
30 | "新": "新疆",
31 | "青": "青海",
32 | "藏": "西藏",
33 | "鄂": "湖北",
34 | "皖": "安徽",
35 | "苏": "江苏",
36 | "浙": "浙江",
37 | "闽": "福建",
38 | "湘": "湖南",
39 | "赣": "江西",
40 | "川": "四川",
41 | "蜀": "四川",
42 | "黔": "贵州",
43 | "贵": "贵州",
44 | "滇": "云南",
45 | "云": "云南",
46 | "粤": "广东",
47 | "桂": "广西",
48 | "琼": "海南",
49 | "港": "中国香港",
50 | "澳": "中国澳门",
51 | "台": "中国台湾",
52 | }
53 | )
54 |
55 | const (
56 | RegexIcp = `(?i)(京|津|冀|晋|蒙|辽|吉|黑|沪|苏|浙|皖|闽|赣|鲁|豫|鄂|湘|粤|桂|琼|川|蜀|贵|黔|云|滇|渝|藏|陇|甘|陕|秦|青|宁|新)ICP(备|证|备案)?[0-9]+`
57 | RegexIcpGa = `(?i)(京|津|冀|晋|蒙|辽|吉|黑|沪|苏|浙|皖|闽|赣|鲁|豫|鄂|湘|粤|桂|琼|川|蜀|贵|黔|云|滇|渝|藏|陇|甘|陕|秦|青|宁|新)公网安备[0-9]+`
58 | RegexIcpDx = `(?i)(京|津|冀|晋|蒙|辽|吉|黑|沪|苏|浙|皖|闽|赣|鲁|豫|鄂|湘|粤|桂|琼|川|蜀|贵|黔|云|滇|渝|藏|陇|甘|陕|秦|青|宁|新)B2-[0-9]+`
59 | )
60 |
61 | var (
62 | RegexIcpPattern = regexp.MustCompile(RegexIcp)
63 | RegexIcpGaPattern = regexp.MustCompile(RegexIcpGa)
64 | RegexIcpDxPattern = regexp.MustCompile(RegexIcpDx)
65 | )
66 |
67 | // Icp returns the site's ICP filing information from the document body
68 | func Icp(doc *goquery.Document) (string, string) {
69 | text := doc.Find("body").Text()
70 |
71 | text = fun.RemoveLines(text)
72 |
73 | text = strings.ReplaceAll(text, fun.TAB, "")
74 | text = strings.ReplaceAll(text, fun.SPACE, "")
75 |
76 | return IcpFromText(text)
77 |
78 | }
79 |
80 | // IcpFromText extracts ICP filing information from text
81 | func IcpFromText(text string) (string, string) {
82 | var icp, loc string
83 |
84 | // Match ICP filing numbers first
85 | matches := RegexIcpPattern.FindStringSubmatch(text)
86 | if len(matches) > 1 {
87 | icp = matches[0]
88 | loc = matches[1]
89 | }
90 |
91 | // Then match public security (公网安备) filing numbers
92 | if icp == "" {
93 | matches = RegexIcpGaPattern.FindStringSubmatch(text)
94 | if len(matches) > 1 {
95 | icp = matches[0]
96 | loc = matches[1]
97 | }
98 | }
99 |
100 | // Then match telecom value-added business (B2) license numbers
101 | if icp == "" {
102 | matches = RegexIcpDxPattern.FindStringSubmatch(text)
103 | if len(matches) > 1 {
104 | icp = matches[0]
105 | loc = matches[1]
106 | }
107 | }
108 |
109 | return icp, loc
110 | }
111 |
--------------------------------------------------------------------------------
/http_test.go:
--------------------------------------------------------------------------------
1 | package spider
2 |
3 | import (
4 | "bytes"
5 | "testing"
6 |
7 | "github.com/PuerkitoBio/goquery"
8 | "github.com/x-funs/go-fun"
9 | )
10 |
11 | func TestHttpGetCharsetLang(t *testing.T) {
12 | var urlStrs = []string{
13 | // "http://suosi.com.cn",
14 | // "https://www.163.com",
15 | // "https://english.news.cn",
16 | // "https://jp.news.cn",
17 | // "https://kr.news.cn",
18 | // "https://www.donga.com/",
19 | // "http://www.koreatimes.com/",
20 | // "https://arabic.news.cn",
21 | // "https://www.bbc.com",
22 | // "http://government.ru",
23 | "https://french.news.cn",
24 | // "https://www.gouvernement.fr",
25 | // "http://live.siammedia.org/",
26 | // "http://hanoimoi.com.vn",
27 | // "https://www.commerce.gov.mm",
28 | // "https://sanmarg.in/",
29 | // "https://www.rrdmyanmar.gov.mm",
30 | // "http://english.eastday.com/",
31 | // "http://jp.eastday.com/",
32 | // "https://mn.cctv.com/",
33 | }
34 |
35 | for _, urlStr := range urlStrs {
36 |
37 | resp, err := HttpGetResp(urlStr, nil, 30000)
38 |
39 | t.Log(urlStr)
40 | t.Log(err)
41 | t.Log(resp.Success)
42 | t.Log(resp.ContentLength)
43 | t.Log(resp.Headers)
44 | t.Log(resp.Charset)
45 |
46 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
47 | doc.Find(DefaultDocRemoveTags).Remove()
48 |
49 | start := fun.Timestamp(true)
50 | lang := Lang(doc, resp.Charset.Charset, true)
51 | t.Log(lang)
52 |
53 | t.Log(fun.Timestamp(true) - start)
54 | }
55 | }
56 |
57 | func TestHttpGetCharsetLangURL(t *testing.T) {
58 | var urlStrs = []string{
59 | "https://marriott.co.kr",
60 | }
61 |
62 | for _, urlStr := range urlStrs {
63 |
64 | resp, err := HttpGetResp(urlStr, nil, 30000)
65 |
66 | t.Log(urlStr)
67 | t.Log(err)
68 | t.Log(resp.Success)
69 | t.Log(resp.ContentLength)
70 | t.Log(resp.Headers)
71 | t.Log(resp.Charset)
72 |
73 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
74 | doc.Find(DefaultDocRemoveTags).Remove()
75 |
76 | start := fun.Timestamp(true)
77 | lang := Lang(doc, resp.Charset.Charset, true)
78 | t.Log(lang)
79 |
80 | t.Log(fun.Timestamp(true) - start)
81 | }
82 | }
83 |
84 | func TestHttpGet(t *testing.T) {
85 | var urlStr string
86 |
87 | urlStr = "http://www.niuchaoqun.com"
88 | // urlStr = "http://www.qq.com"
89 |
90 | resp, err := HttpGetResp(urlStr, nil, 10000)
91 |
92 | t.Log(urlStr)
93 | t.Log(err)
94 | t.Log(resp.Success)
95 | t.Log(resp.ContentLength)
96 | t.Log(resp.Headers)
97 | t.Log(resp.Charset)
98 |
99 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
100 | doc.Find(DefaultDocRemoveTags).Remove()
101 | lang := Lang(doc, resp.Charset.Charset, true)
102 | t.Log(lang)
103 |
104 | t.Log(fun.String(resp.Body))
105 | }
106 |
107 | func TestHttpGetContentType(t *testing.T) {
108 | var urlStr string
109 |
110 | urlStr = "https://mirrors.163.com/mysql/Downloads/MySQL-8.0/libmysqlclient-dev_8.0.27-1debian10_amd64.deb"
111 |
112 | req := &HttpReq{
113 | ForceTextContentType: true,
114 | }
115 | resp, err := HttpGetResp(urlStr, req, 10000)
116 |
117 | t.Log(urlStr)
118 | t.Log(err)
119 | t.Log(resp.Success)
120 | t.Log(resp.ContentLength)
121 | t.Log(resp.Headers)
122 | t.Log(resp.Charset)
123 |
124 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
125 | doc.Find(DefaultDocRemoveTags).Remove()
126 | lang := Lang(doc, resp.Charset.Charset, true)
127 | t.Log(lang)
128 |
129 | t.Log(fun.String(resp.Body))
130 | }
131 |
132 | func TestHttpGetContentLength(t *testing.T) {
133 | var urlStr string
134 |
135 | urlStr = "http://suosi.com.cn"
136 |
137 | req := &HttpReq{
138 | HttpReq: &fun.HttpReq{
139 | MaxContentLength: 1000,
140 | },
141 | }
142 | resp, err := HttpGetResp(urlStr, req, 10000)
143 |
144 | t.Log(urlStr)
145 | t.Log(err)
146 | t.Log(resp.Success)
147 | t.Log(resp.ContentLength)
148 | t.Log(resp.Headers)
149 | t.Log(resp.Charset)
150 |
151 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
152 | doc.Find(DefaultDocRemoveTags).Remove()
153 | lang := Lang(doc, resp.Charset.Charset, true)
154 | t.Log(lang)
155 |
156 | t.Log(fun.String(resp.Body))
157 | }
158 |
--------------------------------------------------------------------------------
/extract/meta.go:
--------------------------------------------------------------------------------
1 | package extract
2 |
3 | import (
4 | "strings"
5 | )
6 |
7 | var HostGovCountryMap = map[string]string{
8 | "hk": "中国",
9 | "tw": "中国",
10 | "mo": "中国",
11 | "jp": "日本",
12 | "kr": "韩国",
13 | "in": "印度",
14 | "uk": "英国",
15 | "us": "美国",
16 | "it": "意大利",
17 | "es": "西班牙",
18 | "ru": "俄罗斯",
19 | "de": "德国",
20 | "fr": "法国",
21 | "th": "泰国",
22 | "vn": "越南",
23 | "sg": "新加坡",
24 | "au": "澳大利亚",
25 | "ca": "加拿大",
26 | "il": "以色列",
27 | "mm": "缅甸",
28 | "dz": "阿尔及利亚",
29 | "pl": "波兰",
30 | "az": "阿塞拜疆",
31 | "ng": "尼日利亚",
32 | "kp": "朝鲜",
33 | "lb": "黎巴嫩",
34 | "ua": "乌克兰",
35 | "tr": "土耳其",
36 | "se": "瑞典",
37 | "lk": "斯里兰卡",
38 | "si": "斯洛文尼亚",
39 | "sk": "斯洛伐克",
40 | "ro": "罗马尼亚",
41 | "pt": "葡萄牙",
42 | "ph": "菲律宾",
43 | "pk": "巴基斯坦",
44 | "py": "巴拉圭",
45 | "np": "尼泊尔",
46 | "ma": "摩洛哥",
47 | "my": "马来西亚",
48 | "lt": "立陶宛",
49 | "ie": "爱尔兰",
50 | "iq": "伊拉克",
51 | "ir": "伊朗",
52 | "id": "印度尼西亚",
53 | "hu": "匈牙利",
54 | "gr": "希腊",
55 | "eg": "埃及",
56 | "cz": "捷克",
57 | "hr": "克罗地亚",
58 | "co": "哥伦比亚",
59 | "cl": "智利",
60 | "br": "巴西",
61 | "bg": "保加利亚",
62 | "be": "比利时",
63 | "bd": "孟加拉国",
64 | "aw": "阿鲁巴",
65 | "am": "亚美尼亚",
66 | "ai": "安圭拉",
67 | "ao": "安哥拉",
68 | "al": "阿尔巴尼亚",
69 | "af": "阿富汗",
70 | "sa": "沙特阿拉伯",
71 | "nl": "荷兰",
72 | }
73 |
74 | // MetaFromHost returns as much fixed information as possible based on the host name
75 | func MetaFromHost(host string, lang string) (string, string, string) {
76 | var tld string
77 | var country string
78 | var province string
79 | var category string
80 |
81 | host = strings.ToLower(host)
82 |
83 | if domain, err := DomainParse(host); err == nil {
84 | tld = domain.TLD
85 | } else {
86 | return country, province, category
87 | }
88 |
89 | // US government top-level domain
90 | if tld == "gov" {
91 | country = "美国"
92 | category = "政务"
93 | return country, province, category
94 | }
95 |
96 | // Check whether this is a government domain
97 | for c, zh := range HostGovCountryMap {
98 | gov := "gov." + c
99 | if tld == gov {
100 | country = zh
101 | category = "政务"
102 |
103 | if strings.HasSuffix(host, ".hk") && lang == "zh" {
104 | province = "中国香港"
105 | }
106 | if strings.HasSuffix(host, ".tw") && lang == "zh" {
107 | province = "中国台湾"
108 | }
109 | if strings.HasSuffix(host, ".mo") && lang == "zh" {
110 | province = "中国澳门"
111 | }
112 | return country, province, category
113 | }
114 | }
115 |
116 | if strings.HasSuffix(host, ".hk") && lang == "zh" {
117 | country = "中国"
118 | province = "中国香港"
119 | return country, province, category
120 | }
121 |
122 | if strings.HasSuffix(host, ".tw") && lang == "zh" {
123 | country = "中国"
124 | province = "中国台湾"
125 | return country, province, category
126 | }
127 |
128 | if strings.HasSuffix(host, ".mo") && lang == "zh" {
129 | country = "中国"
130 | province = "中国澳门"
131 | return country, province, category
132 | }
133 |
134 | if strings.HasSuffix(host, ".cn") && lang == "zh" {
135 | country = "中国"
136 | return country, province, category
137 | }
138 |
139 | if strings.HasSuffix(host, ".jp") && lang == "ja" {
140 | country = "日本"
141 | return country, province, category
142 | }
143 |
144 | if strings.HasSuffix(host, ".kr") && lang == "ko" {
145 | country = "韩国"
146 | return country, province, category
147 | }
148 |
149 | if strings.HasSuffix(host, ".uk") && lang == "en" {
150 | country = "英国"
151 | return country, province, category
152 | }
153 |
154 | if strings.HasSuffix(host, ".us") && lang == "en" {
155 | country = "美国"
156 | return country, province, category
157 | }
158 |
159 | if strings.HasSuffix(host, ".in") && lang == "hi" {
160 | country = "印度"
161 | return country, province, category
162 | }
163 |
164 | if strings.HasSuffix(host, ".es") && lang == "es" {
165 | country = "西班牙"
166 | return country, province, category
167 | }
168 |
169 | if strings.HasSuffix(host, ".ru") && lang == "ru" {
170 | country = "俄罗斯"
171 | return country, province, category
172 | }
173 |
174 | if strings.HasSuffix(host, ".de") && lang == "de" {
175 | country = "德国"
176 | return country, province, category
177 | }
178 |
179 | if strings.HasSuffix(host, ".fr") && lang == "fr" {
180 | country = "法国"
181 | return country, province, category
182 | }
183 |
184 | return country, province, category
185 | }
186 |
--------------------------------------------------------------------------------
/http.go:
--------------------------------------------------------------------------------
1 | package spider
2 |
3 | import (
4 | "crypto/tls"
5 | "errors"
6 | "net"
7 | "net/http"
8 | "time"
9 |
10 | "github.com/x-funs/go-fun"
11 | )
12 |
13 | const (
14 | HttpDefaultTimeOut = 10000
15 | HttpDefaultMaxContentLength = 10 * 1024 * 1024
16 | HttpDefaultUserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
17 | HttpDefaultAcceptEncoding = "gzip, deflate"
18 | )
19 |
20 | var (
21 | textContentTypes = []string{
22 | "text/plain",
23 | "text/html",
24 | "text/xml",
25 | "application/xml",
26 | "application/xhtml+xml",
27 | "application/json",
28 | }
29 | )
30 |
31 | type HttpReq struct {
32 | // Embedded fun.HttpReq
33 | *fun.HttpReq
34 |
35 | // Disable automatic charset detection and conversion
36 | DisableCharset bool
37 |
38 | // Force the ContentType to be treated as a text type
39 | ForceTextContentType bool
40 | }
41 |
42 | type HttpResp struct {
43 | *fun.HttpResp
44 |
45 | // Charset
46 | Charset CharsetRes
47 | }
48 |
49 | // HttpDefaultTransport is the default http.Transport used globally
50 | var HttpDefaultTransport = &http.Transport{
51 | DialContext: (&net.Dialer{Timeout: time.Second}).DialContext,
52 | DisableKeepAlives: true,
53 | IdleConnTimeout: 60 * time.Second,
54 | TLSHandshakeTimeout: 10 * time.Second,
55 | ExpectContinueTimeout: 1 * time.Second,
56 | TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
57 | }
58 |
59 | // HttpGet performs an HTTP GET request; the arguments are the request URL plus an optional HttpReq and timeout:
60 | // HttpGet(url), HttpGet(url, HttpReq), HttpGet(url, timeout), HttpGet(url, HttpReq, timeout)
61 | // It returns the body and an error
62 | func HttpGet(urlStr string, args ...any) ([]byte, error) {
63 | l := len(args)
64 |
65 | switch l {
66 | case 0:
67 | return HttpGetDo(urlStr, nil, 0)
68 | case 1:
69 | switch v := args[0].(type) {
70 | case int:
71 | timeout := fun.ToInt(args[0])
72 | return HttpGetDo(urlStr, nil, timeout)
73 | case *HttpReq:
74 | return HttpGetDo(urlStr, v, 0)
75 |
76 | }
77 | case 2:
78 | timeout := fun.ToInt(args[1])
79 | switch v := args[0].(type) {
80 | case *HttpReq:
81 | return HttpGetDo(urlStr, v, timeout)
82 | }
83 |
84 | }
85 |
86 | return nil, errors.New("http get params error")
87 | }
88 |
89 | // HttpGetDo performs an HTTP GET request; the arguments are the request URL, an HttpReq, and a timeout in milliseconds
90 | // It returns the body and an error
91 | func HttpGetDo(urlStr string, r *HttpReq, timeout int) ([]byte, error) {
92 | resp, err := HttpGetResp(urlStr, r, timeout)
93 | if err != nil {
94 | return nil, err
95 | } else {
96 | return resp.Body, nil
97 | }
98 | }
99 |
100 | // HttpGetResp performs an HTTP GET request; the arguments are the request URL, an HttpReq, and a timeout in milliseconds
101 | // It returns an HttpResp and an error
102 | func HttpGetResp(urlStr string, r *HttpReq, timeout int) (*HttpResp, error) {
103 | req, err := http.NewRequest(http.MethodGet, urlStr, nil)
104 | if err != nil {
105 | return nil, err
106 | }
107 |
108 | return HttpDoResp(req, r, timeout)
109 | }
110 |
111 | // HttpDo performs an HTTP request; the arguments are an http.Request, an HttpReq, and a timeout in milliseconds
112 | // It returns the body and an error
113 | func HttpDo(req *http.Request, r *HttpReq, timeout int) ([]byte, error) {
114 | resp, err := HttpDoResp(req, r, timeout)
115 | if err != nil {
116 | return nil, err
117 | } else {
118 | return resp.Body, nil
119 | }
120 | }
121 |
122 | // HttpDoResp performs an HTTP request; the arguments are an http.Request, an HttpReq, and a timeout in milliseconds
123 | // It returns an HttpResp and an error
124 | func HttpDoResp(req *http.Request, r *HttpReq, timeout int) (*HttpResp, error) {
125 | // Set up the Transport
126 | if r == nil {
127 | r = &HttpReq{
128 | HttpReq: &fun.HttpReq{
129 | Transport: HttpDefaultTransport,
130 | },
131 | }
132 | } else if r.HttpReq == nil {
133 | r.HttpReq = &fun.HttpReq{
134 | Transport: HttpDefaultTransport,
135 | }
136 | } else if r.Transport == nil {
137 | r.Transport = HttpDefaultTransport
138 | }
139 |
140 | // Force text content types
141 | if r != nil && r.ForceTextContentType {
142 | r.AllowedContentTypes = textContentTypes
143 | }
144 |
145 | // HttpResp
146 | var charset CharsetRes
147 | httpResp := &HttpResp{
148 | Charset: charset,
149 | }
150 |
151 | resp, err := fun.HttpDoResp(req, r.HttpReq, timeout)
152 | httpResp.HttpResp = resp
153 | if err != nil {
154 | return httpResp, err
155 | }
156 |
157 | // By default the charset is detected and the body transcoded automatically, unless explicitly disabled
158 | if r == nil || !r.DisableCharset {
159 | charsetRes := Charset(httpResp.Body, httpResp.Headers)
160 | httpResp.Charset = charsetRes
161 |
162 | if charsetRes.Charset != "" && charsetRes.Charset != "UTF-8" {
163 | utf8Body, e := fun.ToUtf8(httpResp.Body, charsetRes.Charset)
164 | if e != nil {
165 | return httpResp, errors.New("ErrorCharset")
166 | } else {
167 | httpResp.Body = utf8Body
168 | }
169 | }
170 | }
171 |
172 | return httpResp, nil
173 | }
174 |
--------------------------------------------------------------------------------
/lang_test.go:
--------------------------------------------------------------------------------
1 | package spider
2 |
3 | import (
4 | "bytes"
5 | "fmt"
6 | "regexp"
7 | "testing"
8 |
9 | "github.com/PuerkitoBio/goquery"
10 | "github.com/suosi-inc/lingua-go"
11 | "github.com/x-funs/go-fun"
12 | )
13 |
14 | func TestLinguaText(t *testing.T) {
15 | text := "BEIJING, 10 août (Xinhua) -- Un porte-parole du Bureau du Travail du Comité central du Parti communiste chinois pour les affaires de Taiwan a fait mercredi des remarques sur un livre blanc nouvellement publié intitulé \"La question de Taiwan et la réunification de la Chine dans la nouvelle ère\"."
16 |
17 | start := fun.Timestamp(true)
18 | languages := []lingua.Language{
19 | lingua.French,
20 | lingua.Spanish,
21 | lingua.Portuguese,
22 | lingua.German,
23 | }
24 | detector := lingua.NewLanguageDetectorBuilder().
25 | FromLanguages(languages...).
26 | Build()
27 |
28 | if language, exists := detector.DetectLanguageOf(text); exists {
29 | t.Log(text)
30 | t.Log(language.IsoCode639_1())
31 | fmt.Println(fun.Timestamp(true) - start)
32 | }
33 | }
34 |
35 | func BenchmarkLinguaTest(b *testing.B) {
36 |
37 | text := "BEIJING"
38 |
39 | languages := []lingua.Language{
40 | lingua.French,
41 | lingua.Spanish,
42 | lingua.Portuguese,
43 | lingua.German,
44 | lingua.English,
45 | }
46 | detector := lingua.NewLanguageDetectorBuilder().
47 | FromLanguages(languages...).
48 | Build()
49 |
50 | b.ResetTimer()
51 |
52 | for i := 0; i < b.N; i++ {
53 | _, _ = detector.DetectLanguageOf(text)
54 | }
55 | }
56 |
57 | func TestLang(t *testing.T) {
58 |
59 | var urlStrs = []string{
60 |
61 | "https://www.bbc.com",
62 | "https://www.ft.com/",
63 |
64 | "https://www.163.com/news/article/HEJGEVFT000189FH.html",
65 | "https://www.163.com",
66 |
67 | "https://english.news.cn",
68 | "https://jp.news.cn",
69 | "https://kr.news.cn",
70 | "https://german.news.cn/",
71 | "https://portuguese.news.cn/",
72 | "https://arabic.news.cn",
73 | "https://french.news.cn",
74 |
75 | "https://mn.cctv.com/",
76 |
77 | "http://government.ru",
78 |
79 | "https://www.gouvernement.fr",
80 |
81 | "http://live.siammedia.org/",
82 | "https://www.manchestereveningnews.co.uk/",
83 |
84 | "https://www.chinadaily.com.cn",
85 | "http://cn.chinadaily.com.cn/",
86 | "http://www.chinadaily.com.cn/chinawatch_fr/index.html",
87 | "https://d1ev.com/",
88 | "https://www.cngold.com.cn/",
89 | "https://china.guidechem.com/",
90 | "https://xdkb.net/",
91 | "https://www.lifeweek.com.cn/",
92 | "http://gxbsrd.gov.cn/",
93 | "https://defence24.com/",
94 | "http://www.gmp.or.kr/",
95 | "http://rdfmj.com/",
96 | "https://news.xmnn.cn/xmnn/2022/08/09/101067908.shtml",
97 | }
98 |
99 | for _, urlStr := range urlStrs {
100 | resp, _ := HttpGetResp(urlStr, nil, 10000)
101 |
102 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
103 |
104 | doc.Find(DefaultDocRemoveTags).Remove()
105 |
106 | // Language
107 | start := fun.Timestamp(true)
108 | langRes := Lang(doc, resp.Charset.Charset, true)
109 |
110 | t.Log(urlStr)
111 | t.Log(resp.Charset)
112 | t.Log(langRes)
113 | t.Log(fun.Timestamp(true) - start)
114 | }
115 |
116 | }
117 |
118 | func TestLangText(t *testing.T) {
119 | start := fun.Timestamp(true)
120 | text := "中文"
121 | t.Log(fun.Timestamp(true) - start)
122 | t.Log(LangText(text))
123 | }
124 |
125 | func TestUnicode(t *testing.T) {
126 | text := "BEIJING, 9. August 2022 (Xinhuanet) -- In einem am Dienstag veröffentlichten Bericht über die Menschenrechtsverletzungen der USA wird darauf hingewiesen, dass die Vereinigten Staaten einen \"Konflikt der Zivilisationen\" geschaffen, Haft und Folter missbraucht sowie die Religionsfreiheit und Menschenwürde verletzt hätten.\n\nDer Bericht mit dem Titel ''Die USA begehen schwerwiegende Verbrechen der Menschenrechtsverletzungen im Nahen Osten und darüber hinaus'' wurde von der Chinesischen Gesellschaft für Menschenrechtsstudien veröffentlicht.\n\nIn dem Bericht heißt es, dass die Vereinigten Staaten keinen Respekt vor der Diversität der Zivilisationen zeigten, der islamischen Zivilisation feindlich gegenüberständen, das historische und kulturelle Erbe des Nahen Ostens zerstörten, Muslime rücksichtslos inhaftierten und folterten und die grundlegenden Menschenrechte der Bevölkerung im Nahen Osten und in anderen Gebieten schwer verletzten.\n\n\"Die Vereinigten Staaten haben die 'islamische Bedrohungstheorie' in der ganzen Welt verbreitet. Sie haben die Überlegenheit der westlichen und christlichen Zivilisation befürwortet, die nicht-westliche Zivilisation verachtet und die islamische Zivilisation stigmatisiert, indem sie sie als 'rückständig', 'terroristisch' und 'gewalttätig' bezeichneten\", heißt es in dem Bericht."
127 | // latinRex := regexp.MustCompile(`\p{Lo}`)
128 | latinRex := regexp.MustCompile("[\u0080-\u00ff]")
129 | latin := latinRex.FindAllString(text, -1)
130 |
131 | t.Log(latin)
132 | }
133 |
--------------------------------------------------------------------------------
/charset.go:
--------------------------------------------------------------------------------
1 | package spider
2 |
3 | import (
4 | "net/http"
5 | "regexp"
6 | "strings"
7 | "unicode/utf8"
8 |
9 | "github.com/suosi-inc/chardet"
10 | "github.com/x-funs/go-fun"
11 | )
12 |
13 | const (
14 | CharsetPosHeader = "header"
15 | CharsetPosHtml = "html"
16 | CharsetPosGuess = "guess"
17 | CharsetPosValid = "valid"
18 | )
19 |
20 | const (
21 | RegexCharset = "(?i)charset=\\s*([a-z][_\\-0-9a-z]*)"
22 | RegexCharsetHtml4 = "(?i)<meta\\s+([^>]*http-equiv=(\"|')?content-type(\"|')?[^>]*)>"
23 | RegexCharsetHtml5 = "(?i)<meta\\s+charset\\s*=\\s*[\"']?([a-z][_\\-0-9a-z]*)[^>]*>"
24 | )
25 |
26 | var (
27 | regexCharsetPattern = regexp.MustCompile(RegexCharset)
28 | regexCharsetHtml4Pattern = regexp.MustCompile(RegexCharsetHtml4)
29 | regexCharsetHtml5Pattern = regexp.MustCompile(RegexCharsetHtml5)
30 | )
31 |
32 | type CharsetRes struct {
33 | Charset string
34 | CharsetPos string
35 | }
36 |
37 | // Charset detects the charset from the HTTP body and http.Header, and falls back to guessing when detection fails
38 | func Charset(body []byte, headers *http.Header) CharsetRes {
39 | var charsetRes CharsetRes
40 | var guessCharset string
41 |
42 | // First check whether the body is already valid UTF-8
43 | valid := utf8.Valid(body)
44 | if valid {
45 | charsetRes.Charset = "UTF-8"
46 | charsetRes.CharsetPos = CharsetPosValid
47 | return charsetRes
48 | }
49 |
50 | // Detect the charset from the Content-Type header and the HTML meta tags in the body
51 | charsetRes = CharsetFromHeaderHtml(body, headers)
52 |
53 | // Fall back to guessing when no charset was identified
54 | if charsetRes.Charset == "" {
55 | guessCharset = CharsetGuess(body)
56 |
57 | if guessCharset != "" {
58 | charsetRes.Charset = guessCharset
59 | charsetRes.CharsetPos = CharsetPosGuess
60 | }
61 | }
62 |
63 | return charsetRes
64 | }
65 |
66 | // CharsetFromHeaderHtml parses the charset from the HTTP body and http.Header, with high accuracy
67 | func CharsetFromHeaderHtml(body []byte, headers *http.Header) CharsetRes {
68 | var res CharsetRes
69 |
70 | cHeader := CharsetFromHeader(headers)
71 |
72 | cHtml := CharsetFromHtml(body)
73 |
74 | // Only the header has a charset: use the header
75 | if cHeader != "" && cHtml == "" {
76 | res.Charset = cHeader
77 | res.CharsetPos = CharsetPosHeader
78 | return res
79 | }
80 |
81 | // Only the HTML has a charset: use the HTML
82 | if cHeader == "" && cHtml != "" {
83 | res.Charset = cHtml
84 | res.CharsetPos = CharsetPosHtml
85 | return res
86 | }
87 |
88 | // Both the header and the HTML have a charset: pick one depending on the situation
89 | if cHeader != "" && cHtml != "" {
90 | if cHeader == cHtml {
91 | res.Charset = cHeader
92 | res.CharsetPos = CharsetPosHeader
93 | return res
94 | }
95 |
96 | // The header and the HTML disagree; in the following cases the HTML wins
97 | if strings.HasPrefix(cHeader, "ISO") || strings.HasPrefix(cHeader, "WINDOWS") {
98 | res.Charset = cHtml
99 | res.CharsetPos = CharsetPosHtml
100 | return res
101 | }
102 |
103 | res.Charset = cHeader
104 | res.CharsetPos = CharsetPosHeader
105 | return res
106 | }
107 |
108 | return res
109 | }
110 |
111 | // CharsetFromHeader parses the charset from the HTTP header
112 | func CharsetFromHeader(headers *http.Header) string {
113 | var charset string
114 | if headers != nil {
115 | contentType := headers.Get("Content-Type")
116 | if !fun.Blank(contentType) {
117 | matches := regexCharsetPattern.FindStringSubmatch(contentType)
118 | if len(matches) > 1 {
119 | charset = matches[1]
120 | }
121 | }
122 | }
123 |
124 | return convertCharset(charset)
125 | }
126 |
127 | // CharsetFromHtml parses the charset from the HTML
128 | func CharsetFromHtml(body []byte) string {
129 | var charset string
130 |
131 | if len(body) > 0 {
132 | // Check the HTML meta tags first
133 | html := fun.String(body)
134 |
135 | // Match the HTML4-style meta tag
136 | var charset4 string
137 | matches := regexCharsetHtml4Pattern.FindStringSubmatch(html)
138 | if len(matches) > 1 {
139 | matches = regexCharsetPattern.FindStringSubmatch(matches[1])
140 | if len(matches) > 1 {
141 | charset4 = matches[1]
142 | }
143 | }
144 |
145 | // Match the HTML5-style meta tag
146 | var charset5 string
147 | matches = regexCharsetHtml5Pattern.FindStringSubmatch(html)
148 | if len(matches) > 1 {
149 | charset5 = matches[1]
150 | }
151 |
152 | // Only one of the two is present
153 | if charset4 != "" && charset5 == "" {
154 | charset = charset4
155 | }
156 |
157 | if charset4 == "" && charset5 != "" {
158 | charset = charset5
159 | }
160 |
161 | if charset4 != "" && charset5 != "" {
162 | // Both are present: the one that appears first wins
163 | if charset4 == charset5 {
164 | charset = charset5
165 | } else {
166 | charset4Index := strings.Index(html, charset4)
167 | charset5Index := strings.Index(html, charset5)
168 |
169 | if charset4Index < charset5Index {
170 | charset = charset4
171 | } else {
172 | charset = charset5
173 | }
174 | }
175 |
176 | }
177 | }
178 |
179 | return convertCharset(charset)
180 | }
181 |
182 | // CharsetGuess guesses the charset from the HTTP body
183 | func CharsetGuess(body []byte) string {
184 | var guessCharset string
185 |
186 | detector := chardet.NewHtmlDetector()
187 | guess, err := detector.DetectBest(body)
188 | if err == nil {
189 | guessCharset = strings.ToUpper(guess.Charset)
190 | }
191 |
192 | return guessCharset
193 | }
194 |
195 | // convertCharset normalizes the charset name
196 | func convertCharset(charset string) string {
197 | c := strings.ToUpper(strings.TrimSpace(charset))
198 |
199 | if c != "" {
200 | // alias utf8
201 | if c == "UTF8" || c == "UTF_8" {
202 | return "UTF-8"
203 | }
204 |
205 | // alias gb2312, gb18030
206 | if strings.HasPrefix(c, "GB") {
207 | return "GBK"
208 | }
209 |
210 | // alias big5-hkscs..
211 | if strings.HasPrefix(c, "BIG5") {
212 | return "Big5"
213 | }
214 |
215 | // alias shift-jis
216 | if strings.HasPrefix(c, "SHIFT") {
217 | return "SHIFT_JIS"
218 | }
219 | }
220 |
221 | return c
222 | }
223 |
--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAcwmWM=
2 | github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ=
3 | github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
4 | github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
5 | github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
6 | github.com/aymerick/douceur v0.2.0 h1:Mv+mAeH1Q+n9Fr+oyamOlAkUNPWPlA8PPGR0QAaYuPk=
7 | github.com/aymerick/douceur v0.2.0/go.mod h1:wlT5vV2O3h55X9m7iVYN0TBM0NH/MmbLnd30/FjWUq4=
8 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
9 | github.com/gorilla/css v1.0.1 h1:ntNaBIghp6JmvWnxbZKANoLyuXTPZ4cAMlo6RyhlbO8=
10 | github.com/gorilla/css v1.0.1/go.mod h1:BvnYkspnSzMmwRK+b8/xgNPLiIuNZr6vbZBTPQ2A3b0=
11 | github.com/microcosm-cc/bluemonday v1.0.26 h1:xbqSvqzQMeEHCqMi64VAs4d8uy6Mequs3rQ0k/Khz58=
12 | github.com/microcosm-cc/bluemonday v1.0.26/go.mod h1:JyzOCs9gkyQyjs+6h10UEVSe02CGwkhd72Xdqh78TWs=
13 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
14 | github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca h1:NugYot0LIVPxTvN8n+Kvkn6TrbMyxQiuvKdEwFdR9vI=
15 | github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
16 | github.com/stretchr/objx v0.4.0 h1:M2gUjqZET1qApGOWNSnZ49BAIMX4F/1plDv3+l31EJ4=
17 | github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk=
18 | github.com/suosi-inc/chardet v0.1.0 h1:AmAXYaZKPAXCpwthMeQG/ABwYreonxjP/BCbhOa7jfw=
19 | github.com/suosi-inc/chardet v0.1.0/go.mod h1:dhKdJO4yQeuLYMyu1QFjoNITgMJ/zyLhs4zwIUnQTKI=
20 | github.com/suosi-inc/lingua-go v1.0.51 h1:+IhIKGPwLWVTxayQSEnMdTaSCUs2GWS0qVwafGSR0wQ=
21 | github.com/suosi-inc/lingua-go v1.0.51/go.mod h1:XDS0K21fYH99TkkUs71HxmJH03SEhPoc+RPi531aaX0=
22 | github.com/x-funs/go-fun v0.94.0 h1:claEwnVz4ybQYcdHLjm6DeDuVRntavqjOHh5dcHJG2g=
23 | github.com/x-funs/go-fun v0.94.0/go.mod h1:fYbm5aJU4EbzJkUQlodJUphsmjWgJ70iGvZNMakMSw4=
24 | github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
25 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
26 | golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
27 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
28 | golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
29 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
30 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
31 | golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
32 | golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
33 | golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
34 | golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
35 | golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
36 | golang.org/x/net v0.19.0 h1:zTwKpTd2XuCqf8huc7Fo2iSy+4RHPd10s4KzeTnVr1c=
37 | golang.org/x/net v0.19.0/go.mod h1:CfAk/cbD4CthTvqiEl8NpboMuiuOYsAr/7NOjZJtv1U=
38 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
39 | golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
40 | golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
41 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
42 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
43 | golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
44 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
45 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
46 | golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
47 | golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
48 | golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
49 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
50 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
51 | golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
52 | golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY=
53 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
54 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
55 | golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
56 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
57 | golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
58 | golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
59 | golang.org/x/text v0.17.0 h1:XtiM5bkSOt+ewxlOE/aE/AKEHibwj/6gvWMl9Rsh0Qc=
60 | golang.org/x/text v0.17.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY=
61 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
62 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
63 | golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
64 | golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
65 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
66 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
67 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ```
2 | __ _ __
3 | ____ _____ ____ / /______ _ _________ (_)___/ /__ _____
4 | / __ `/ __ \______/ __ \/ //_/ __ `/_____/ ___/ __ \/ / __ / _ \/ ___/
5 | / /_/ / /_/ /_____/ /_/ / ,< / /_/ /_____(__ ) /_/ / / /_/ / __/ /
6 | \__, /\____/ / .___/_/|_|\__, / /____/ .___/_/\__,_/\___/_/
7 | /____/ /_/ /____/ /_/
8 |
9 | ```
10 |
11 | A Golang library for relatively intelligent, rule-maintenance-free data extraction from general news websites. It includes components for domain detection, web page charset and language detection, link classification and extraction, news element extraction, and news body extraction.
12 |
13 | # Preview
14 |
15 | To try it out, download the Windows / macOS GUI client from [go-pkg-spider-gui Releases](https://github.com/suosi-inc/go-pkg-spider-gui/releases).
16 |
17 |
18 |
19 |
20 |
21 | # Usage
22 |
23 | ```shell
24 | go get -u github.com/suosi-inc/go-pkg-spider
25 | ```
26 |
27 | # Introduction
28 |
29 | ## HTTP Client
30 |
31 | The HTTP client extends the `fun.HttpGet` family of functions from go-fun with the following additions (see the sketch below the function list):
32 |
33 | * Automatic charset detection and conversion, normalizing everything to UTF-8
34 | * Restriction of responses to text content types
35 |
36 | - **`HttpGet(urlStr string, args ...any) ([]byte, error)`** HTTP GET request
37 | - **`HttpGetResp(urlStr string, r *HttpReq, timeout int) (*HttpResp, error)`** HTTP GET request, returns an HttpResp
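
A minimal sketch (assuming `urlStr` holds a full URL; timeouts are in milliseconds):

```go
// Plain GET; the body charset is detected and converted to UTF-8 automatically
body, err := spider.HttpGet(urlStr, 10000)
fmt.Println(len(body), err)

// Full response, including the detected charset and where it was found
resp, err := spider.HttpGetResp(urlStr, nil, 10000)
if err == nil {
	fmt.Println(resp.Charset.Charset, resp.Charset.CharsetPos)
}
```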
38 |
39 | ## Automatic Web Page Language Detection
40 |
41 | The following major languages are currently supported: **Chinese, English, Japanese, Korean, Russian, Arabic, Hindi, German, French, Spanish, Portuguese, Italian, Thai, Vietnamese, and Burmese**.
42 |
43 | Language detection first identifies Chinese, English, Japanese, and Korean through HTML attributes, text features, and charset-based statistical rules.
44 |
45 | It is further backed by the [lingua-go](https://github.com/pemistahl/lingua-go) n-gram language detection model, via a fork that drops many languages and corpora (the full package is quite large).
46 |
47 | - **`LangText(text string) (string, string)`** Detect the language of plain text
48 | - **`Lang(doc *goquery.Document, charset string, listMode bool) LangRes`** Detect the language of an HTML document
49 |
50 | ### Example
51 |
52 | Detect the language of plain text:
53 |
54 | ```go
55 | // Detect the language of plain text
56 | lang, langPos := spider.LangText(text)
57 | ```
58 |
59 | Detect the language of an HTML document:
60 |
61 | ```go
62 | // Make the HTTP request and get the response
63 | resp, err := spider.HttpGetResp(urlStr, req, timeout)
64 |
65 | // Convert to a *goquery.Document
66 | doc, docErr := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
67 |
68 | // Detect based on the charset and page type
69 | langRes := spider.Lang(doc, resp.Charset.Charset, false)
70 | ```
71 |
72 | ## Automatic Domain Detection
73 |
74 | - **`DetectDomain(domain string, timeout int, retry int) (*DomainRes, error)`** Probe basic information for a primary domain
75 | - **`DetectSubDomain(domain string, timeout int, retry int) (*DomainRes, error)`** Probe basic information for a subdomain
76 |
77 | Given a site's domain name, these probe as much basic information as possible; a usage sketch follows the struct definition below. The information includes:
78 |
79 | ```go
80 | type DomainRes struct {
81 | // Domain name
82 | Domain string
83 | // Home page domain
84 | HomeDomain string
85 | // Scheme
86 | Scheme string
87 | // Charset
88 | Charset CharsetRes
89 | // Language
90 | Lang LangRes
91 | // Country
92 | Country string
93 | // Province
94 | Province string
95 | // Category
96 | Category string
97 | // Title
98 | Title string
99 | // Description
100 | Description string
101 | // ICP
102 | Icp string
103 | // State
104 | State bool
105 | // Status code
106 | StatusCode int
107 | // Number of content-page links
108 | ContentCount int
109 | // Number of list-page links
110 | ListCount int
111 | // Subdomain set
112 | SubDomains map[string]bool
113 | }
114 | ```
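
A short usage sketch (the domain and parameters below are placeholders; the timeout is in milliseconds, followed by the retry count):

```go
domainRes, err := spider.DetectDomain("example.com", 10000, 1)
if err == nil {
	fmt.Println(domainRes.Scheme, domainRes.Country, domainRes.Title)
	fmt.Println(domainRes.ContentCount, domainRes.ListCount)
}
```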
115 |
116 | ## Web Page Link Classification and Extraction
117 |
118 | Based on the page content, links on a page are automatically analyzed and classified into content pages, list pages, and other links; custom rules can be passed in to influence the final result.
119 |
120 | Classification relies on link titles, URL features, and statistical induction.
121 |
122 | - **`GetLinkData(urlStr string, strictDomain bool, timeout int, retry int) (*LinkData, error)`** Get the classified link data for a page (a usage sketch follows the type definitions below)
123 |
124 | ### Link Classification Result Types
125 |
126 | ```go
127 | type LinkData struct {
128 | LinkRes *extract.LinkRes
129 | // Filter results
130 | Filters map[string]string
131 | // Subdomains
132 | SubDomains map[string]bool
133 | }
134 |
135 | type LinkRes struct {
136 | // Content pages
137 | Content map[string]string
138 | // List pages
139 | List map[string]string
140 | // Unknown links
141 | Unknown map[string]string
142 | // Filtered links
143 | None map[string]string
144 | }
145 | ```
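
A short usage sketch (assuming `strictDomain` restricts the result to links under the site's own domain):

```go
linkData, err := spider.GetLinkData("https://example.com", true, 10000, 1)
if err == nil {
	for link, title := range linkData.LinkRes.Content {
		fmt.Println(link, title)
	}
	fmt.Println(len(linkData.LinkRes.List), len(linkData.SubDomains))
}
```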
146 |
147 | ## Web Page News Extraction
148 |
149 | The three most important elements of a news article are the title, the publish time, and the body text. The publish time demands high precision, while the title and body aim for completeness.
150 |
151 | In our experience, the strongest product in this space is [diffbot](https://www.diffbot.com/); it presumably works from page vision combined with deep learning.
152 |
153 | There are quite a few open-source options for news body extraction, mostly based on rules or statistics, for example:
154 |
155 | * Python: [GeneralNewsExtractor](https://github.com/GeneralNewsExtractor/GeneralNewsExtractor)
156 | * Java: [WebCollector/ContentExtractor](https://github.com/CrawlScript/WebCollector)
157 |
158 | Older projects include [python-goose](https://github.com/grangier/python-goose) and [newspaper](https://github.com/codelucas/newspaper), and even Readability, Html2Article, and the like.
159 |
160 | Among them, `WebCollector/ContentExtractor` is the Java implementation of the [CEPF algorithm for news content extraction based on fused tag-path features](http://www.jos.org.cn/jos/article/abstract/4868).
161 |
162 | go-pkg-spider implements a Golang version of the CEPF algorithm and builds on it with extensive optimizations: built-in general-purpose rules, finer-grained control over title and publish-time extraction and conversion, and support for extracting elements from multilingual news sites.
163 |
164 |
165 | ### News Extraction Result Type
166 |
167 | ```go
168 | type News struct {
169 | // Title
170 | Title string
171 | // Where the title was extracted from
172 | TitlePos string
173 | // Publish time (normalized, local)
174 | TimeLocal string
175 | // Raw time string as found on the page
176 | Time string
177 | // Where the publish time was extracted from
178 | TimePos string
179 | // Body as plain text
180 | Content string
181 | // Body content node
182 | ContentNode *html.Node
183 | // Extraction time spent (milliseconds)
184 | Spend int64
185 | // Language
186 | Lang string
187 | }
188 | ```
189 |
190 | You can work from `ContentNode *html.Node` to redefine which tags are kept or cleaned; a minimal sketch follows.
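
For example, a minimal sketch that re-renders the extracted node to an HTML string with `golang.org/x/net/html` (assuming `news` is the result of `ExtractNews()`):

```go
// Render the extracted content node back to HTML for custom cleaning
var buf bytes.Buffer
if news.ContentNode != nil {
	if err := html.Render(&buf, news.ContentNode); err == nil {
		fmt.Println(buf.String())
	}
}
```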
191 |
192 | ### Results
193 |
194 |
195 |
196 |
197 |
198 | ### Example
199 |
200 | ```go
201 | // Make the HTTP request and get the response
202 | resp, err := spider.HttpGetResp(urlStr, req, timeout)
203 |
204 | // Convert to a *goquery.Document
205 | doc, docErr := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
206 |
207 | // Basic cleanup
208 | doc.Find(spider.DefaultDocRemoveTags).Remove()
209 |
210 | // Language
211 | langRes := spider.Lang(doc, resp.Charset.Charset, false)
212 |
213 | // News extraction
214 | content := extract.NewContent(contentDoc, langRes.Lang, listTitle, urlStr)
215 |
216 | // News extraction result
217 | news := content.ExtractNews()
218 | ```
219 |
220 | The steps above can also be completed with the following pre-packaged helper (a short sketch follows):
221 |
222 | - **`GetNews(urlStr string, title string, timeout int, retry int) (*extract.News, *HttpResp, error)`** Fetch a URL and extract its news data
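
A short sketch (assuming the second argument is an optional title hint taken from the list page; it may be left empty):

```go
news, resp, err := spider.GetNews(urlStr, "", 10000, 1)
if err == nil && resp.Success {
	fmt.Println(news.Title, news.TimeLocal, news.Lang)
	fmt.Println(news.Content)
}
```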
223 |
224 | # Disclaimer
225 |
226 | This project is a data extraction library, not a crawler framework or harvesting software. It is intended for technical exchange only; the code in this repository that requests target websites exists solely for functional testing.
227 |
228 | Please use this project in compliance with laws, regulations, and related rules; using it for any illegal, infringing, or otherwise improper activity is prohibited.
229 |
230 | All direct or indirect risks arising from the use of this project are borne by the user.
231 |
--------------------------------------------------------------------------------
/detect_test.go:
--------------------------------------------------------------------------------
1 | package spider
2 |
3 | import (
4 | "bytes"
5 | "fmt"
6 | "testing"
7 |
8 | "github.com/PuerkitoBio/goquery"
9 | "github.com/suosi-inc/go-pkg-spider/extract"
10 | "github.com/x-funs/go-fun"
11 | )
12 |
13 | func TestDomainDetect(t *testing.T) {
14 | domains := []string{
15 | // "china-nengyuan.com",
16 | // "suosi.com.cn",
17 | // "wanjiaxian.com",
18 | "thediplomat.com",
19 | }
20 |
21 | for _, domain := range domains {
22 | domainRes, err := DetectDomain(domain, 10000, 1)
23 | if err == nil {
24 | t.Log(domainRes.Title)
25 | t.Log(domainRes.TitleClean)
26 | t.Log(domainRes)
27 | } else {
28 | t.Log(err)
29 | t.Log(domainRes)
30 | }
31 | }
32 | }
33 |
34 | func BenchmarkLinkTitles(b *testing.B) {
35 | urlStr := "http://www.qq.com/"
36 |
37 | resp, _ := HttpGetResp(urlStr, nil, 30000)
38 |
39 | // Parse the HTML
40 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
41 | doc.Find(DefaultDocRemoveTags).Remove()
42 |
43 | // Language
44 |
45 | langRes := Lang(doc, resp.Charset.Charset, true)
46 |
47 | fmt.Println(langRes)
48 |
49 | var linkTitles map[string]string
50 |
51 | b.ResetTimer()
52 |
53 | for i := 0; i < b.N; i++ {
54 | // Link titles
55 | linkTitles, _ = extract.WebLinkTitles(doc, resp.RequestURL, true)
56 |
57 | // Links and subdomains
58 | _, _ = extract.LinkTypes(linkTitles, langRes.Lang, nil)
59 |
60 | // rules := map[string][]string{
61 | // "163.com": []string{
62 | // "`\\w{16}\\.html`",
63 | // },
64 | // }
65 | // _, _ = extract.LinkTypes(linkTitles, langRes.Lang, rules)
66 | }
67 |
68 | b.StopTimer()
69 |
70 | fmt.Println(langRes.Lang)
71 | fmt.Println(len(linkTitles))
72 |
73 | }
74 |
75 | func TestLinkTitles(t *testing.T) {
76 | var urlStrs = []string{
77 | "https://www.1905.com",
78 | // "https://www.people.com.cn",
79 | // "https://www.36kr.com",
80 | // "https://www.163.com",
81 | // "https://news.163.com/",
82 | // "http://jyj.suqian.gov.cn",
83 | // "https://www.huxiu.com/",
84 | // "http://www.news.cn/politicspro/",
85 | // "http://www.cankaoxiaoxi.com",
86 | // "http://www.bbc.com",
87 | // "https://www.ft.com",
88 | // "https://www.reuters.com/",
89 | // "https://nypost.com/",
90 | // "http://www.mengcheng.gov.cn/",
91 | // "https://www.chunichi.co.jp",
92 | // "https://www.donga.com/",
93 | // "https://people.com/",
94 | // "https://czql.gov.cn/",
95 | // "https://qiye.163.com/",
96 | // "https://www.washingtontimes.com/",
97 | // "https://www.gamersky.com/",
98 | // "https://www.cdns.com.tw/",
99 | // "http://www.163.com/",
100 | }
101 |
102 | for _, urlStr := range urlStrs {
103 |
104 | resp, err := HttpGetResp(urlStr, nil, 30000)
105 |
106 | t.Log(urlStr)
107 | t.Log(err)
108 |
109 | // Parse the HTML
110 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
111 | doc.Find(DefaultDocRemoveTags).Remove()
112 |
113 | // Language
114 | langRes := Lang(doc, resp.Charset.Charset, true)
115 |
116 | fmt.Println(resp.Charset)
117 | fmt.Println(langRes)
118 |
119 | // Link titles
120 | linkTitles, filters := extract.WebLinkTitles(doc, resp.RequestURL, true)
121 |
122 | // Classified links and subdomain list
123 | linkRes, domainRes := extract.LinkTypes(linkTitles, langRes.Lang, nil)
124 |
125 | // Classified links and subdomain list, with rules
126 | // rules := map[string][]string{
127 | // "cankaoxiaoxi.com": []string{
128 | // "\\d{7}\\.shtml$",
129 | // },
130 | // }
131 | // linkRes, domainRes := extract.LinkTypes(linkTitles, langRes.Lang, rules)
132 |
133 | fmt.Println("all:", len(linkTitles))
134 | fmt.Println("content:", len(linkRes.Content))
135 | fmt.Println("list:", len(linkRes.List))
136 | fmt.Println("unknown:", len(linkRes.Unknown))
137 | fmt.Println("none:", len(linkRes.None))
138 |
139 | i := 0
140 | for a, title := range filters {
141 | i = i + 1
142 | fmt.Println(i, "filter:"+a+"\t=>\t"+title)
143 | }
144 | i = 0
145 | for subdomain := range domainRes {
146 | i = i + 1
147 | fmt.Println(i, "domain:"+subdomain)
148 | }
149 | i = 0
150 | for a, title := range linkRes.Content {
151 | i = i + 1
152 | fmt.Println(i, "content:"+a+"\t=>\t"+title)
153 | }
154 | i = 0
155 | for a, title := range linkRes.Unknown {
156 | i = i + 1
157 | fmt.Println(i, "unknown:"+a+"\t=>\t"+title)
158 | }
159 | i = 0
160 | for a, title := range linkRes.List {
161 | i = i + 1
162 | fmt.Println(i, "list:"+a+"\t=>\t"+title)
163 | }
164 | i = 0
165 | for a, title := range linkRes.None {
166 | i = i + 1
167 | fmt.Println(i, "none:"+a+"\t=>\t"+title)
168 | }
169 |
170 | }
171 | }
172 |
173 | func TestDetectIcp(t *testing.T) {
174 | var urlStrs = []string{
175 | // "http://suosi.com.cn",
176 | "https://www.163.com",
177 | // "https://www.sohu.com",
178 | // "https://www.qq.com",
179 | // "https://www.hexun.com",
180 | // "https://www.wfmc.edu.cn/",
181 | // "https://www.cankaoxiaoxi.com/",
182 | }
183 |
184 | for _, urlStr := range urlStrs {
185 |
186 | resp, err := HttpGetResp(urlStr, nil, 30000)
187 |
188 | t.Log(err)
189 | t.Log(urlStr)
190 |
191 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
192 | doc.Find(DefaultDocRemoveTags).Remove()
193 | icp, loc := extract.Icp(doc)
194 | t.Log(icp, loc)
195 | }
196 | }
197 |
198 | func TestLangFromUtf8Body(t *testing.T) {
199 | var urlStrs = []string{
200 | // "https://www.163.com",
201 | // "https://english.news.cn",
202 | // "https://jp.news.cn",
203 | // "https://kr.news.cn",
204 | // "https://arabic.news.cn",
205 | // "https://www.bbc.com",
206 | // "http://government.ru",
207 | // "https://french.news.cn",
208 | // "https://www.gouvernement.fr",
209 | // "http://live.siammedia.org/",
210 | // "http://hanoimoi.com.vn",
211 | // "https://www.commerce.gov.mm",
212 | // "https://www.rrdmyanmar.gov.mm",
213 | "https://czql.gov.cn/",
214 | }
215 |
216 | for _, urlStr := range urlStrs {
217 | resp, _ := fun.HttpGetResp(urlStr, nil, 30000)
218 |
219 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
220 | doc.Find(DefaultDocRemoveTags).Remove()
221 |
222 | start := fun.Timestamp(true)
223 | lang, pos := LangFromUtf8Body(doc, false)
224 | t.Log(urlStr)
225 | t.Log(lang)
226 | t.Log(pos)
227 | t.Log(fun.Timestamp(true) - start)
228 |
229 | }
230 | }
231 |
232 | func TestDetectFriendDomainDo(t *testing.T) {
233 | var domains = []string{
234 | "northnews.cn",
235 | }
236 |
237 | for _, domain := range domains {
238 | friendDomains, err := DetectFriendDomainDo(domain, 10000)
239 |
240 | t.Log(err)
241 | t.Log(friendDomains)
242 | }
243 | }
244 |
--------------------------------------------------------------------------------
/spider.go:
--------------------------------------------------------------------------------
1 | package spider
2 |
3 | import (
4 | "bytes"
5 | "errors"
6 | "regexp"
7 | "strings"
8 |
9 | "github.com/PuerkitoBio/goquery"
10 | "github.com/suosi-inc/go-pkg-spider/extract"
11 | "github.com/x-funs/go-fun"
12 | )
13 |
14 | const (
15 | RegexHostnameIp = `\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}`
16 |
17 | RegexMetaRefresh = `(?i)url=(.+)`
18 | )
19 |
20 | var (
21 | DefaultDocRemoveTags = "script,noscript,style,iframe,br,link,svg"
22 |
23 | RegexHostnameIpPattern = regexp.MustCompile(RegexHostnameIp)
24 |
25 | regexMetaRefreshPattern = regexp.MustCompile(RegexMetaRefresh)
26 | )
27 |
28 | type LinkData struct {
29 | LinkRes *extract.LinkRes
30 | Filters map[string]string
31 | SubDomains map[string]bool
32 | }
33 |
34 | // GetLinkData 获取页面链接数据
35 | func GetLinkData(urlStr string, strictDomain bool, timeout int, retry int) (*LinkData, error) {
36 | if retry <= 0 {
37 | retry = 1
38 | }
39 |
40 | errs := make([]string, 0)
41 |
42 | for i := 0; i < retry; i++ {
43 | linkData, err := GetLinkDataDo(urlStr, strictDomain, nil, nil, timeout)
44 | if err == nil {
45 | return linkData, err
46 | } else {
47 | errs = append(errs, err.Error())
48 | }
49 | }
50 |
51 | return nil, errors.New("ErrorLinkRes" + fun.ToString(errs))
52 | }
53 |
54 | // GetLinkDataWithReq 使用自定义 HttpReq 获取页面链接数据
55 | func GetLinkDataWithReq(urlStr string, strictDomain bool, req *HttpReq, timeout int, retry int) (*LinkData, error) {
56 | if retry <= 0 {
57 | retry = 1
58 | }
59 |
60 | errs := make([]string, 0)
61 |
62 | for i := 0; i < retry; i++ {
63 | linkData, err := GetLinkDataDo(urlStr, strictDomain, nil, req, timeout)
64 | if err == nil {
65 | return linkData, err
66 | } else {
67 | errs = append(errs, err.Error())
68 | }
69 | }
70 |
71 | return nil, errors.New("ErrorLinkRes" + fun.ToString(errs))
72 | }
73 |
74 | // GetLinkDataWithReqAndRule 使用自定义 HttpReq 和链接分类规则获取页面链接数据
75 | func GetLinkDataWithReqAndRule(urlStr string, strictDomain bool, rules extract.LinkTypeRule, req *HttpReq, timeout int, retry int) (*LinkData, error) {
76 | if retry <= 0 {
77 | retry = 1
78 | }
79 |
80 | errs := make([]string, 0)
81 |
82 | for i := 0; i < retry; i++ {
83 | linkData, err := GetLinkDataDo(urlStr, strictDomain, rules, req, timeout)
84 | if err == nil {
85 | return linkData, err
86 | } else {
87 | errs = append(errs, err.Error())
88 | }
89 | }
90 |
91 | return nil, errors.New("ErrorLinkRes" + fun.ToString(errs))
92 | }
93 |
94 | // GetLinkDataWithRule 使用链接分类规则获取页面链接数据
95 | func GetLinkDataWithRule(urlStr string, strictDomain bool, rules extract.LinkTypeRule, timeout int, retry int) (*LinkData, error) {
96 | if retry <= 0 {
97 | retry = 1
98 | }
99 |
100 | errs := make([]string, 0)
101 |
102 | for i := 0; i < retry; i++ {
103 | linkData, err := GetLinkDataDo(urlStr, strictDomain, rules, nil, timeout)
104 | if err == nil {
105 | return linkData, err
106 | } else {
107 | errs = append(errs, err.Error())
108 | }
109 | }
110 |
111 | return nil, errors.New("ErrorLinkRes" + fun.ToString(errs))
112 | }
113 |
114 | // GetLinkDataDo 获取页面链接数据(单次请求, 不重试)
115 | func GetLinkDataDo(urlStr string, strictDomain bool, rules extract.LinkTypeRule, req *HttpReq, timeout int) (*LinkData, error) {
116 | if timeout == 0 {
117 | timeout = 10000
118 | }
119 |
120 | if req == nil {
121 | req = &HttpReq{
122 | HttpReq: &fun.HttpReq{
123 | MaxContentLength: HttpDefaultMaxContentLength,
124 | MaxRedirect: 3,
125 | },
126 | ForceTextContentType: true,
127 | }
128 | }
129 |
130 | resp, err := HttpGetResp(urlStr, req, timeout)
131 | if resp != nil && err == nil && resp.Success {
132 | // 解析 HTML
133 | doc, docErr := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
134 | if docErr == nil {
135 | linkData := &LinkData{}
136 |
137 | doc.Find(DefaultDocRemoveTags).Remove()
138 |
139 | // 语言
140 | langRes := Lang(doc, resp.Charset.Charset, true)
141 |
142 | // 站内链接
143 | linkTitles, filters := extract.WebLinkTitles(doc, resp.RequestURL, strictDomain)
144 |
145 | // 链接分类
146 | linkRes, subDomains := extract.LinkTypes(linkTitles, langRes.Lang, rules)
147 |
148 | linkData.LinkRes = linkRes
149 | linkData.Filters = filters
150 | linkData.SubDomains = subDomains
151 |
152 | return linkData, nil
153 | } else {
154 | return nil, errors.New("ErrorDocParse")
155 | }
156 | }
157 |
158 | return nil, errors.New("ErrorRequest")
159 | }
160 |
161 | // GetNews 获取链接新闻数据
162 | func GetNews(urlStr string, title string, timeout int, retry int) (*extract.News, *HttpResp, error) {
163 | if retry <= 0 {
164 | retry = 1
165 | }
166 |
167 | errs := make([]string, 0)
168 |
169 | for i := 0; i < retry; i++ {
170 | news, resp, err := GetNewsDo(urlStr, title, nil, timeout)
171 | if err == nil {
172 | return news, resp, nil
173 | } else {
174 | errs = append(errs, err.Error())
175 | }
176 | }
177 |
178 | return nil, nil, errors.New("ErrorRequest" + fun.ToString(errs))
179 | }
180 |
181 | // GetNewsWithReq 使用自定义 HttpReq 获取链接新闻数据
182 | func GetNewsWithReq(urlStr string, title string, req *HttpReq, timeout int, retry int) (*extract.News, *HttpResp, error) {
183 | if retry <= 0 {
184 | retry = 1
185 | }
186 |
187 | errs := make([]string, 0)
188 |
189 | for i := 0; i < retry; i++ {
190 | news, resp, err := GetNewsDo(urlStr, title, req, timeout)
191 | if err == nil {
192 | return news, resp, nil
193 | } else {
194 | errs = append(errs, err.Error())
195 | }
196 | }
197 |
198 | return nil, nil, errors.New("ErrorRequest" + fun.ToString(errs))
199 | }
200 |
201 | // GetNewsDo 获取链接新闻数据(单次请求, 不重试)
202 | func GetNewsDo(urlStr string, title string, req *HttpReq, timeout int) (*extract.News, *HttpResp, error) {
203 | return getNewsDoTop(urlStr, title, req, timeout, true)
204 | }
205 |
206 | // getNewsDoTop 获取链接新闻数据, top 为 true 时允许跟随一次本域名下的 meta refresh 跳转
207 | func getNewsDoTop(urlStr string, title string, req *HttpReq, timeout int, top bool) (*extract.News, *HttpResp, error) {
208 | if timeout == 0 {
209 | timeout = HttpDefaultTimeOut
210 | }
211 |
212 | if req == nil {
213 | req = &HttpReq{
214 | HttpReq: &fun.HttpReq{
215 | MaxContentLength: HttpDefaultMaxContentLength,
216 | MaxRedirect: 2,
217 | },
218 | ForceTextContentType: true,
219 | }
220 | }
221 |
222 | resp, err := HttpGetResp(urlStr, req, timeout)
223 |
224 | if resp != nil && err == nil && resp.Success {
225 | doc, docErr := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
226 | if docErr == nil {
227 | contentDoc := goquery.CloneDocument(doc)
228 | doc.Find(DefaultDocRemoveTags).Remove()
229 |
230 | // 具有 HTML 跳转属性, 如果为本域名下, 则跳转一次
231 | if top {
232 | if refresh, exists := doc.Find("meta[http-equiv='refresh' i]").Attr("content"); exists {
233 | refreshMatch := regexMetaRefreshPattern.FindStringSubmatch(refresh)
234 | if len(refreshMatch) > 1 {
235 | requestHostname := resp.RequestURL.Hostname()
236 | requestTopDomain := extract.DomainTop(requestHostname)
237 | refreshUrl := strings.TrimSpace(refreshMatch[1])
238 | if r, err := fun.UrlParse(refreshUrl); err == nil {
239 | refreshHostname := r.Hostname()
240 | refreshTopDomain := extract.DomainTop(refreshHostname)
241 | if refreshTopDomain != "" && refreshTopDomain == requestTopDomain {
242 | return getNewsDoTop(refreshUrl, title, req, timeout, false)
243 | }
244 | }
245 | }
246 | }
247 | }
248 |
249 | // 语言
250 | langRes := Lang(doc, resp.Charset.Charset, false)
251 |
252 | // 正文抽取
253 | content := extract.NewContent(contentDoc, langRes.Lang, title, urlStr)
254 | news := content.ExtractNews()
255 |
256 | return news, resp, nil
257 | } else {
258 | return nil, resp, errors.New("ErrorDocParse")
259 | }
260 | }
261 |
262 | return nil, nil, errors.New("ErrorRequest")
263 | }
264 |
--------------------------------------------------------------------------------
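A minimal usage sketch for the entry points in spider.go above (GetLinkData classifies a page's links, GetNews fetches and extracts one article). It assumes the module root imports as github.com/suosi-inc/go-pkg-spider, inferred from the extract import path; the URLs are placeholders:

package main

import (
	"fmt"

	spider "github.com/suosi-inc/go-pkg-spider"
)

func main() {
	// Classify the links found on a home page: 10s timeout, 1 attempt.
	if linkData, err := spider.GetLinkData("https://www.163.com", true, 10000, 1); err == nil {
		fmt.Println("content:", len(linkData.LinkRes.Content))
		fmt.Println("list:", len(linkData.LinkRes.List))
		fmt.Println("subdomains:", len(linkData.SubDomains))
	}

	// Fetch one content page and extract the news fields.
	if news, resp, err := spider.GetNews("https://www.163.com/news/article/HG3DE7AQ000189FH.html", "", 10000, 1); err == nil {
		fmt.Println(resp.Charset, news.Lang)
		fmt.Println(news.Title, news.TimeLocal)
		fmt.Println(news.Content)
	}
}
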
/spider_news.go:
--------------------------------------------------------------------------------
1 | package spider
2 |
3 | import (
4 | "strings"
5 | "sync"
6 | "time"
7 |
8 | "github.com/x-funs/go-fun"
9 | )
10 |
11 | // NewsSpider 新闻采集器结构体
12 | type NewsSpider struct {
13 | Url string // 根链接
14 | Depth uint8 // 采集页面深度
15 | seen map[string]bool // 是否已采集
16 | IsSub bool // 是否采集子域名
17 | linkChan chan *NewsData // NewsData 通道共享
18 | contentChan chan *NewsContent // NewsContent 通道共享
19 | ProcessFunc func(...any) // 处理函数
20 | RetryTime int // 请求重试次数
21 | 	TimeOut     int               // 请求超时时间(毫秒)
22 | wg *sync.WaitGroup // 同步等待组
23 | Req *HttpReq // 请求体
24 | Ctx any // 任务详情上下文,传入ProcessFunc函数中
25 | }
26 |
27 | // NewsContent 新闻内容结构体
28 | type NewsContent struct {
29 | Url string // 链接
30 | Title string // 标题
31 | Time string // 发布时间
32 | Content string // 正文纯文本
33 | Lang string // 语种
34 | }
35 |
36 | // NewsData 新闻 LinkData 总数据
37 | type NewsData struct {
38 | *LinkData
39 | Depth uint8 // 采集深度溯源
40 | ListUrl string // 列表页溯源
41 | Error error
42 | }
43 |
44 | // 自定义配置函数
45 | type Option func(*NewsSpider)
46 |
47 | // 原型链接口
48 | type Prototype interface {
49 | Clone() Prototype
50 | }
51 |
52 | // NewNewsSpider 初始化
53 | func NewNewsSpider(url string, depth uint8, pf func(...any), ctx any, options ...Option) *NewsSpider {
54 | n := &NewsSpider{
55 | Url: url,
56 | Depth: depth,
57 | seen: map[string]bool{},
58 | IsSub: false,
59 | linkChan: make(chan *NewsData),
60 | contentChan: make(chan *NewsContent),
61 | ProcessFunc: pf,
62 | RetryTime: 2,
63 | TimeOut: 20000,
64 | wg: &sync.WaitGroup{},
65 | Req: nil,
66 | Ctx: ctx,
67 | }
68 |
69 | // 函数式选项模式
70 | for _, option := range options {
71 | option(n)
72 | }
73 |
74 | return n
75 | }
76 |
77 | func WithRetryTime(retryTime int) Option {
78 | return func(n *NewsSpider) {
79 | n.RetryTime = retryTime
80 | }
81 | }
82 |
83 | func WithTimeOut(timeout int) Option {
84 | return func(n *NewsSpider) {
85 | n.TimeOut = timeout
86 | }
87 | }
88 |
89 | func WithReq(req *HttpReq) Option {
90 | return func(n *NewsSpider) {
91 | n.Req = req
92 | }
93 | }
94 |
95 | func WithIsSub(isSub bool) Option {
96 | return func(n *NewsSpider) {
97 | n.IsSub = isSub
98 | }
99 | }
100 |
101 | // Clone 原型链结构体拷贝
102 | func (n *NewsSpider) Clone() Prototype {
103 | nc := *n
104 |
105 | // 拷贝时需重置chan和wg等字段
106 | nc.seen = map[string]bool{}
107 | nc.linkChan = make(chan *NewsData)
108 | nc.contentChan = make(chan *NewsContent)
109 | nc.wg = &sync.WaitGroup{}
110 |
111 | return &nc
112 | }
113 |
114 | // GetNews 开始采集
115 | func (n *NewsSpider) GetNews(linksHandleFunc func(*NewsData)) {
116 | // 初始化列表页和内容页切片
117 | var (
118 | listSlice []string
119 | listSliceTemp []string
120 | subDomainSlice []string
121 | )
122 |
123 | // 获取首页url和协议
124 | scheme, indexUrl := GetIndexUrl(n.Url)
125 |
126 | // 首次添加当前页
127 | listSliceTemp = append(listSliceTemp, n.Url)
128 |
129 | if n.IsSub {
130 | // 先探测出首页url的所有子域名
131 | subDomains, _ := GetSubdomains(indexUrl, n.Req, n.TimeOut, n.RetryTime*100)
132 |
133 | for subDomain := range subDomains {
134 | subDomainSlice = append(subDomainSlice, subDomain)
135 | listSliceTemp = append(listSliceTemp, subDomain)
136 | }
137 | }
138 |
139 | 	// 按采集深度逐层循环遍历, 获取页面列表页和内容页
140 | for i := 0; i < int(n.Depth); i++ {
141 | listS, _ := n.GetNewsLinkRes(linksHandleFunc, scheme, listSliceTemp, uint8(i+1), n.TimeOut, n.RetryTime)
142 | listSlice = append(listSlice, listS...)
143 |
144 | // 重置循环列表页
145 | if len(listS) == 0 {
146 | break
147 | }
148 | listSliceTemp = listS
149 | }
150 | }
151 |
152 | // GetNewsLinkRes 获取 news 页面链接分组, 将 LinkData 交给回调处理, 仅返回新发现的列表页链接
153 | func (n *NewsSpider) GetNewsLinkRes(linksHandleFunc func(*NewsData), scheme string, urls []string, depth uint8, timeout int, retry int) ([]string, error) {
154 | listSlice := []string{}
155 |
156 | for _, url := range urls {
157 | if !strings.Contains(url, "http") {
158 | url = scheme + url
159 | }
160 |
161 | if linkData, err := GetLinkDataWithReq(url, true, n.Req, timeout, retry); err == nil {
162 | for l := range linkData.LinkRes.List {
163 | if !n.seen[l] {
164 | n.seen[l] = true
165 | listSlice = append(listSlice, l)
166 | }
167 | }
168 |
169 | newsData := &NewsData{linkData, depth, url, nil}
170 |
171 | n.wg.Add(1)
172 | go linksHandleFunc(newsData)
173 |
174 | } else {
175 | 			// 出错时空的 LinkData 也需要 push
176 | newsData := &NewsData{nil, depth, url, err}
177 |
178 | n.wg.Add(1)
179 | go linksHandleFunc(newsData)
180 |
181 | // return nil, errors.New("GetNewsLinkRes Err")
182 | }
183 | }
184 |
185 | return listSlice, nil
186 | }
187 |
188 | // CrawlLinkRes 直接推送列表页内容页
189 | func (n *NewsSpider) CrawlLinkRes(l *NewsData) {
190 | defer n.wg.Done()
191 | // defer n.sleep()
192 |
193 | n.PushLinks(l)
194 | }
195 |
196 | // CrawlContentNews 解析内容页详情数据
197 | func (n *NewsSpider) CrawlContentNews(l *NewsData) {
198 | defer n.wg.Done()
199 | // defer n.sleep()
200 |
201 | if l.Error == nil {
202 | for c, v := range l.LinkRes.Content {
203 | if !n.seen[c] {
204 | n.seen[c] = true
205 | cc := map[string]string{}
206 | cc[c] = v
207 |
208 | n.wg.Add(1)
209 | go n.ReqContentNews(cc)
210 | }
211 | }
212 | }
213 | }
214 |
215 | // ReqContentNews 获取内容页详情数据
216 | func (n *NewsSpider) ReqContentNews(content map[string]string) {
217 | defer n.wg.Done()
218 |
219 | time.Sleep(time.Duration(fun.RandomInt(10, 100)) * time.Millisecond)
220 |
221 | for url, title := range content {
222 | if news, _, err := GetNews(url, title, n.TimeOut, n.RetryTime); err == nil {
223 | newsData := &NewsContent{}
224 | newsData.Url = url
225 | newsData.Title = news.Title
226 | newsData.Content = news.Content
227 | newsData.Time = news.TimeLocal
228 | newsData.Lang = news.Lang
229 |
230 | n.PushContentNews(newsData)
231 | }
232 | }
233 | }
234 |
235 | // PushLinks 推送links数据
236 | func (n *NewsSpider) PushLinks(data *NewsData) {
237 | n.linkChan <- data
238 | }
239 |
240 | // PushContentNews 推送详情页数据
241 | func (n *NewsSpider) PushContentNews(data *NewsContent) {
242 | n.contentChan <- data
243 | }
244 |
245 | // Wait wg阻塞等待退出
246 | func (n *NewsSpider) Wait() {
247 | n.wg.Wait()
248 | }
249 |
250 | // Close 关闭Chan
251 | func (n *NewsSpider) Close() {
252 | close(n.linkChan)
253 | close(n.contentChan)
254 | }
255 |
256 | // process 处理chan data函数
257 | func (n *NewsSpider) process(processFunc func(...any)) {
258 | for {
259 | select {
260 | case data, ok := <-n.linkChan:
261 | if !ok {
262 | return
263 | }
264 | processFunc(data, n.Ctx)
265 | case data, ok := <-n.contentChan:
266 | if !ok {
267 | return
268 | }
269 | processFunc(data, n.Ctx)
270 | }
271 | }
272 | }
273 |
274 | // GetLinkRes 回调获取LinkRes数据
275 | func (n *NewsSpider) GetLinkRes() {
276 | n.GetNews(n.CrawlLinkRes)
277 |
278 | go n.process(n.ProcessFunc)
279 |
280 | n.Wait()
281 | defer n.Close()
282 | }
283 |
284 | // GetContentNews 回调获取内容页数据
285 | func (n *NewsSpider) GetContentNews() {
286 | n.GetNews(n.CrawlContentNews)
287 |
288 | go n.process(n.ProcessFunc)
289 |
290 | n.Wait()
291 | defer n.Close()
292 | }
293 |
294 | // GetSubdomains 获取subDomain
295 | func GetSubdomains(url string, req *HttpReq, timeout int, retry int) (map[string]bool, error) {
296 | if linkData, err := GetLinkDataWithReq(url, true, req, timeout, retry); err == nil {
297 | return linkData.SubDomains, nil
298 | } else {
299 | return nil, err
300 | }
301 | }
302 |
303 | // GetIndexUrl 获取首页url
304 | func GetIndexUrl(url string) (string, string) {
305 | urlSlice := strings.Split(url, "/")
306 | if len(urlSlice) == 1 {
307 | // domain
308 | return "https://", "https://www." + url
309 | }
310 | scheme := urlSlice[0] + "//"
311 | indexUrl := scheme + urlSlice[2]
312 | return scheme, indexUrl
313 | }
314 |
315 | // sleep 当 depth 只有一层时, 需要等待几秒, 避免 wg done 后直接退出, 导致 select 来不及取出数据
316 | func (n *NewsSpider) sleep() {
317 | if n.Depth == 1 {
318 | time.Sleep(2 * time.Second)
319 | }
320 | }
321 |
--------------------------------------------------------------------------------
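A hedged sketch of how the NewsSpider above is wired together: NewNewsSpider takes a root URL, a crawl depth, the channel-consumer callback and a task context, plus the functional options defined in this file; GetContentNews crawls list pages first and then pushes every extracted article into the callback. Module path and target URL are assumptions for illustration:

package main

import (
	"fmt"

	spider "github.com/suosi-inc/go-pkg-spider"
)

func main() {
	// The callback receives the channel item first and the Ctx value second.
	process := func(args ...any) {
		if content, ok := args[0].(*spider.NewsContent); ok {
			fmt.Println(content.Url, content.Lang, content.Time, content.Title)
		}
	}

	s := spider.NewNewsSpider(
		"https://www.163.com", // root URL
		1,                     // depth
		process,               // ProcessFunc
		"task-1",              // Ctx, passed back into process
		spider.WithIsSub(true),
		spider.WithTimeOut(10000),
		spider.WithRetryTime(1),
	)

	// Blocks until all list pages and content pages have been handled.
	s.GetContentNews()
}
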
/extract/web.go:
--------------------------------------------------------------------------------
1 | package extract
2 |
3 | import (
4 | "errors"
5 | "net/url"
6 | "path"
7 | "regexp"
8 | "strings"
9 |
10 | "github.com/PuerkitoBio/goquery"
11 | "github.com/x-funs/go-fun"
12 | )
13 |
14 | const (
15 | RegexHostnameIp = `\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}`
16 | )
17 |
18 | var (
19 | filterUrlSuffix = []string{
20 | ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".txt", ".xml",
21 | ".pdf", ".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx",
22 | ".zip", ".rar", ".7z", ".gz", ".apk", ".cgi", ".exe", ".bz2", ".play",
23 | ".rss", ".sig", ".sgf",
24 | ".mp3", ".mp4", ".rm", ".rmvb", ".mov", ".ogv", ".flv",
25 | }
26 |
27 | invalidUrlCharsets = []string{"{", "}", "[", "]", "@", "$", "<", ">", "\""}
28 |
29 | titleZhSplits = []string{"_", "|", "-", "-", "|", "—", "*", ":", ",", ",", ":", "·", ">>", "="}
30 |
31 | titleZhContentSplits = []string{"_", "|", "-", "-", "|", "—"}
32 |
33 | titleEnSplits = []string{" - ", " | ", ":"}
34 |
35 | RegexHostnameIpPattern = regexp.MustCompile(RegexHostnameIp)
36 | )
37 |
38 | // WebTitle 返回网页标题, 最大 128 个字符
39 | func WebTitle(doc *goquery.Document, maxLength int) string {
40 | var title string
41 | titleNode := doc.Find("title")
42 | if titleNode.Size() > 1 {
43 | // 竟然有多个 title, 只取第一个
44 | title = titleNode.First().Text()
45 | } else {
46 | title = titleNode.Text()
47 | }
48 |
49 | title = fun.RemoveLines(title)
50 | title = strings.TrimSpace(title)
51 |
52 | if maxLength > 0 && maxLength < 128 {
53 | return fun.SubString(title, 0, maxLength)
54 | } else {
55 | return fun.SubString(title, 0, 128)
56 | }
57 | }
58 |
59 | // WebTitleClean 返回尽量清洗后的网页标题
60 | func WebTitleClean(title string, lang string) string {
61 | // 中文网站, 查找中文网站的分割标记, 找到任意一个, 从尾部循环删除后返回
62 | if lang == "zh" {
63 |
64 | for _, split := range titleZhSplits {
65 | if fun.HasPrefixCase(title, split) {
66 | title = fun.RemovePrefix(title, split)
67 | }
68 | }
69 |
70 | // 去除首页开头
71 | if fun.HasPrefixCase(title, "首页") {
72 | title = regexp.MustCompile("首页([ |\\-_-—|])*").ReplaceAllString(title, "")
73 | }
74 |
75 | titleClean := title
76 | for _, split := range titleZhSplits {
77 | var exists bool
78 | end := strings.LastIndex(titleClean, split)
79 | if end != -1 {
80 | exists = true
81 | for {
82 | titleClean = strings.TrimSpace(titleClean[:end])
83 | end = strings.LastIndex(titleClean, split)
84 |
85 | if end == -1 {
86 | break
87 | }
88 | }
89 | if exists {
90 | break
91 | }
92 | }
93 | }
94 |
95 | // 去除尾巴
96 | if titleClean != "首页" {
97 | titleClean = fun.RemoveSuffix(titleClean, "首页")
98 | }
99 |
100 | titleClean = fun.RemoveSign(titleClean)
101 |
102 | return titleClean
103 |
104 | } else {
105 | // 其他, 查找英文分割标记, 如果找到, 从尾部删除一次返回
106 | for _, split := range titleEnSplits {
107 | end := strings.LastIndex(title, split)
108 | if end != -1 {
109 | titleClean := strings.TrimSpace(title[:end])
110 | return titleClean
111 | }
112 | }
113 | }
114 |
115 | return title
116 | }
117 |
118 | // WebContentTitleClean 返回内容页尽量清洗后的网页标题
119 | func WebContentTitleClean(title string, lang string) string {
120 | // 中文网站, 查找中文网站的分割标记, 找到任意一个, 从尾部循环删除后返回
121 | if lang == "zh" {
122 | for _, split := range titleZhContentSplits {
123 | if fun.HasPrefixCase(title, split) {
124 | title = fun.RemovePrefix(title, split)
125 | }
126 | }
127 |
128 | titleClean := title
129 | for _, split := range titleZhContentSplits {
130 | var exists bool
131 | end := strings.LastIndex(titleClean, split)
132 | if end != -1 {
133 | exists = true
134 | for {
135 | titleClean = strings.TrimSpace(titleClean[:end])
136 | end = strings.LastIndex(titleClean, split)
137 |
138 | if end == -1 {
139 | break
140 | }
141 | }
142 | if exists {
143 | break
144 | }
145 | }
146 | }
147 |
148 | return titleClean
149 |
150 | } else {
151 | // 其他, 查找英文分割标记, 如果找到, 从尾部删除一次返回
152 | for _, split := range titleEnSplits {
153 | end := strings.LastIndex(title, split)
154 | if end != -1 {
155 | titleClean := strings.TrimSpace(title[:end])
156 | return titleClean
157 | }
158 | }
159 | }
160 |
161 | return title
162 | }
163 |
164 | // WebKeywords 返回网页 Keyword
165 | func WebKeywords(doc *goquery.Document) string {
166 | keywords := doc.Find("meta[name='keywords' i]").AttrOr("content", "")
167 | keywords = fun.RemoveLines(keywords)
168 | keywords = strings.TrimSpace(keywords)
169 |
170 | return keywords
171 | }
172 |
173 | // WebDescription 返回网页描述, 最大 384 个字符
174 | func WebDescription(doc *goquery.Document, maxLength int) string {
175 | description := doc.Find("meta[name='description' i]").AttrOr("content", "")
176 | description = fun.RemoveLines(description)
177 | description = strings.TrimSpace(description)
178 |
179 | if maxLength > 0 && maxLength < 384 {
180 | return fun.SubString(description, 0, maxLength)
181 | } else {
182 | return fun.SubString(description, 0, 384)
183 | }
184 | }
185 |
186 | // WebLinkTitles 返回网页链接和锚文本
187 | func WebLinkTitles(doc *goquery.Document, baseUrl *url.URL, strictDomain bool) (map[string]string, map[string]string) {
188 | var linkTitles = make(map[string]string)
189 | var filters = make(map[string]string)
190 |
191 | 	// 必须提供当前请求的 baseUrl
192 | if baseUrl == nil {
193 | return linkTitles, filters
194 | }
195 |
196 | // 获取所有 a 链接
197 | aTags := doc.Find("a")
198 | if aTags.Size() > 0 {
199 | var tmpLinks = make(map[string]string)
200 |
201 | // 提取所有的 a 链接
202 | aTags.Each(func(i int, s *goquery.Selection) {
203 | tmpLink, exists := s.Attr("href")
204 | if exists {
205 | tmpLink = fun.RemoveLines(tmpLink)
206 | tmpLink = strings.TrimSpace(tmpLink)
207 |
208 | tmpTitle := s.Text()
209 | tmpTitle = fun.NormaliseSpace(tmpTitle)
210 | tmpTitle = strings.TrimSpace(tmpTitle)
211 | if tmpLink != "" && tmpTitle != "" {
212 | // 如果链接已存在, 保留长标题
213 | if _, exists := tmpLinks[tmpLink]; exists {
214 | oldTitle := tmpLinks[tmpLink]
215 | if len(oldTitle) < len(tmpTitle) {
216 | tmpLinks[tmpLink] = tmpTitle
217 | }
218 | } else {
219 | tmpLinks[tmpLink] = tmpTitle
220 | }
221 | }
222 | }
223 | })
224 |
225 | // 过滤链接
226 | tmpLinkLen := len(tmpLinks)
227 | if tmpLinkLen > 0 {
228 | for link, title := range tmpLinks {
229 | if a, err := filterUrl(link, baseUrl, strictDomain); err == nil {
230 | linkTitles[a] = title
231 | } else {
232 | filters[a] = err.Error()
233 | }
234 | }
235 | }
236 | }
237 |
238 | return linkTitles, filters
239 | }
240 |
241 | // filterUrl 过滤 url
242 | func filterUrl(link string, baseUrl *url.URL, strictDomain bool) (string, error) {
243 | var urlStr string
244 |
245 | // 过滤掉链接中包含特殊字符的
246 | if fun.ContainsAny(link, invalidUrlCharsets...) {
247 | return link, errors.New("invalid url with illegal characters")
248 | }
249 |
250 | // 转换为绝对路径
251 | if !fun.HasPrefixCase(link, "http") && !fun.HasPrefixCase(link, "https") {
252 | if l, err := baseUrl.Parse(link); err == nil {
253 | urlStr = l.String()
254 | } else {
255 | return link, errors.New("invalid url with baseUrl parse error")
256 | }
257 | } else {
258 | urlStr = link
259 | }
260 |
261 | // 解析验证
262 | u, err := fun.UrlParse(urlStr)
263 | if err != nil {
264 | return urlStr, errors.New("invalid url with parse error")
265 | }
266 |
267 | // 验证转换后是否是绝对路径
268 | if !u.IsAbs() {
269 | return urlStr, errors.New("invalid url with not absolute url")
270 | }
271 |
272 | // 验证非常规端口
273 | if u.Port() != "" {
274 | return urlStr, errors.New("invalid url with not 80 port")
275 | }
276 |
277 | // 验证主机名
278 | if RegexHostnameIpPattern.MatchString(u.Hostname()) {
279 | return urlStr, errors.New("invalid url with ip hostname")
280 | }
281 |
282 | // 过滤掉明显错误的后缀
283 | ext := path.Ext(u.Path)
284 | if strings.Contains(ext, ".") {
285 | ext = strings.ToLower(ext)
286 | if fun.SliceContains(filterUrlSuffix, ext) {
287 | return urlStr, errors.New("invalid url with suffix")
288 | }
289 | }
290 |
291 | // 过滤掉站外链接
292 | if strictDomain {
293 | hostname := u.Hostname()
294 | domainTop := DomainTop(hostname)
295 | baseDomainTop := DomainTop(baseUrl.Hostname())
296 | if domainTop != baseDomainTop {
297 | return urlStr, errors.New("invalid url with strict domain")
298 | }
299 | }
300 |
301 | return urlStr, nil
302 | }
303 |
--------------------------------------------------------------------------------
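A small sketch of the extract helpers defined above, fetching the page with the standard net/http client instead of the package's HttpGetResp (so it implicitly assumes a UTF-8 response); the URL is a placeholder:

package main

import (
	"fmt"
	"net/http"
	"net/url"

	"github.com/PuerkitoBio/goquery"
	"github.com/suosi-inc/go-pkg-spider/extract"
)

func main() {
	base, _ := url.Parse("https://www.163.com")

	resp, err := http.Get(base.String())
	if err != nil {
		return
	}
	defer resp.Body.Close()

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return
	}

	// Title, keywords and description, then the cleaned title for a Chinese site.
	title := extract.WebTitle(doc, 0)
	fmt.Println(title, "=>", extract.WebTitleClean(title, "zh"))
	fmt.Println(extract.WebKeywords(doc))
	fmt.Println(extract.WebDescription(doc, 0))

	// In-site links (strict domain) plus the reason each dropped link was filtered.
	linkTitles, filters := extract.WebLinkTitles(doc, base, true)
	fmt.Println(len(linkTitles), "kept,", len(filters), "filtered")
}
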
/detect.go:
--------------------------------------------------------------------------------
1 | package spider
2 |
3 | import (
4 | "bytes"
5 | "errors"
6 | "net/url"
7 | "strings"
8 |
9 | "github.com/PuerkitoBio/goquery"
10 | "github.com/suosi-inc/go-pkg-spider/extract"
11 | "github.com/x-funs/go-fun"
12 | )
13 |
14 | type DomainRes struct {
15 | // 域名
16 | Domain string
17 | // 主页域名
18 | HomeDomain string
19 | // 协议
20 | Scheme string
21 | // 字符集
22 | Charset CharsetRes
23 | // 语种
24 | Lang LangRes
25 | // 国家
26 | Country string
27 | // 省份
28 | Province string
29 | // 分类
30 | Category string
31 | // 标题
32 | Title string
33 | 	// 清洗后的标题
34 | TitleClean string
35 | // 描述
36 | Description string
37 | // ICP
38 | Icp string
39 | // 状态
40 | State bool
41 | // 状态码
42 | StatusCode int
43 | // 内容页链接数量
44 | ContentCount int
45 | // 列表页链接数量
46 | ListCount int
47 | // 子域名列表
48 | SubDomains map[string]bool
49 | }
50 |
51 | // DetectDomain 域名探测
52 | // DomainRes.State true 和 err nil 表示探测成功
53 | // 即使请求成功, 也可能返回 err 且 State 为 false, 如 doc 解析失败
54 | // DomainRes.State false 时根据 StatusCode 判断是请求失败, 还是请求成功但响应失败(如404)
55 | func DetectDomain(domain string, timeout int, retry int) (*DomainRes, error) {
56 | if retry == 0 {
57 | retry = 1
58 | }
59 |
60 | for i := 0; i < retry; i++ {
61 | domainRes, err := DetectDomainDo(domain, true, timeout)
62 | if domainRes.StatusCode != 0 || err == nil {
63 | return domainRes, err
64 | }
65 | }
66 |
67 | var charset CharsetRes
68 | var lang LangRes
69 | domainRes := &DomainRes{
70 | Charset: charset,
71 | Lang: lang,
72 | }
73 | return domainRes, errors.New("ErrorDomainDetect")
74 | }
75 |
76 | // DetectSubDomain 子域名探测
77 | // DomainRes.State true 和 err nil 表示探测成功
78 | // 即使请求成功, 也可能返回 err 且 State 为 false, 如 doc 解析失败
79 | // DomainRes.State false 时根据 StatusCode 判断是请求失败, 还是请求成功但响应失败(如404)
80 | func DetectSubDomain(domain string, timeout int, retry int) (*DomainRes, error) {
81 | if retry == 0 {
82 | retry = 1
83 | }
84 |
85 | for i := 0; i < retry; i++ {
86 | domainRes, err := DetectDomainDo(domain, false, timeout)
87 | if domainRes.StatusCode != 0 || err == nil {
88 | return domainRes, err
89 | }
90 | }
91 |
92 | var charset CharsetRes
93 | var lang LangRes
94 | domainRes := &DomainRes{
95 | Charset: charset,
96 | Lang: lang,
97 | }
98 | return domainRes, errors.New("ErrorDomainDetect")
99 | }
100 |
101 | func DetectDomainDo(domain string, isTop bool, timeout int) (*DomainRes, error) {
102 | if timeout == 0 {
103 | timeout = 10000
104 | }
105 |
106 | domainRes := &DomainRes{}
107 |
108 | req := &HttpReq{
109 | HttpReq: &fun.HttpReq{
110 | MaxContentLength: 10 * 1024 * 1024,
111 | MaxRedirect: 3,
112 | },
113 | ForceTextContentType: true,
114 | }
115 |
116 | scheme := "http"
117 |
118 | // 是否进行首页探测
119 | var homes []string
120 | if isTop {
121 | homes = []string{"www", ""}
122 | } else {
123 | homes = []string{""}
124 | }
125 |
126 | for _, home := range homes {
127 |
128 | var urlStr string
129 | var homeDomain string
130 | if home != "" {
131 | homeDomain = home + fun.DOT + domain
132 | urlStr = scheme + "://" + homeDomain
133 | } else {
134 | homeDomain = domain
135 | urlStr = scheme + "://" + homeDomain
136 | }
137 |
138 | resp, err := HttpGetResp(urlStr, req, timeout)
139 |
140 | if resp != nil && err == nil && resp.Success {
141 | domainRes.Domain = domain
142 | domainRes.StatusCode = resp.StatusCode
143 |
144 | // 如果发生 HTTP 跳转, 则重新设置 homeDomain, 判断跳转后是否是同一个主域名, 如果域名改变则记录并返回错误
145 | domainRes.HomeDomain = homeDomain
146 | requestHostname := resp.RequestURL.Hostname()
147 | if domainRes.HomeDomain != requestHostname {
148 | requestTopDomain := extract.DomainTop(requestHostname)
149 | if requestTopDomain != "" && requestTopDomain != domain {
150 | // 验证主机名
151 | if RegexHostnameIpPattern.MatchString(requestHostname) {
152 | return domainRes, errors.New("ErrorRedirectHost")
153 | }
154 | // 验证非常规端口
155 | if resp.RequestURL.Port() != "" {
156 | return domainRes, errors.New("ErrorRedirectHost")
157 | }
158 |
159 | return domainRes, errors.New("ErrorRedirect:" + requestTopDomain)
160 | }
161 |
162 | domainRes.HomeDomain = requestHostname
163 | }
164 |
165 | // 如果发生了协议跳转, 则重新设置 scheme
166 | domainRes.Scheme = scheme
167 | if domainRes.Scheme != resp.RequestURL.Scheme {
168 | domainRes.Scheme = resp.RequestURL.Scheme
169 | }
170 |
171 | // 字符集
172 | domainRes.Charset = resp.Charset
173 |
174 | // 解析 HTML
175 | u, _ := url.Parse(urlStr)
176 | doc, docErr := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
177 | if docErr == nil {
178 | doc.Find(DefaultDocRemoveTags).Remove()
179 |
180 | 				// 具有 HTML meta refresh 跳转属性, HTTP 客户端无法自动跟随, 这里始终返回错误; 判断跳转后是否为同一主域名并记录
181 | if refresh, exists := doc.Find("meta[http-equiv='refresh' i]").Attr("content"); exists {
182 | refreshMatch := regexMetaRefreshPattern.FindStringSubmatch(refresh)
183 | if len(refreshMatch) > 1 {
184 | refreshUrl := refreshMatch[1]
185 | if r, err := fun.UrlParse(refreshUrl); err == nil {
186 | refreshHostname := r.Hostname()
187 | refreshTopDomain := extract.DomainTop(refreshHostname)
188 | if refreshTopDomain != "" && refreshTopDomain != domain {
189 | // 验证主机名
190 | if RegexHostnameIpPattern.MatchString(refreshHostname) {
191 | return domainRes, errors.New("ErrorMetaJumpHost")
192 | }
193 | // 验证非常规端口
194 | if r.Port() != "" {
195 | return domainRes, errors.New("ErrorMetaJumpHost")
196 | }
197 |
198 | return domainRes, errors.New("ErrorMetaJump:" + refreshTopDomain)
199 | }
200 | }
201 | return domainRes, errors.New("ErrorMetaJump")
202 | }
203 | }
204 |
205 | // 中国 ICP 解析
206 | icp, province := extract.Icp(doc)
207 | if icp != "" && province != "" {
208 | domainRes.Country = "中国"
209 | domainRes.Icp = icp
210 | domainRes.Province = extract.ProvinceShortMap[province]
211 | }
212 |
213 | // 语言
214 | langRes := Lang(doc, resp.Charset.Charset, true)
215 | domainRes.Lang = langRes
216 |
217 | 				// 尽可能地探测一些信息: 国家/省份/类别
218 | if domainRes.Country == "" {
219 | country, province, category := extract.MetaFromHost(u.Hostname(), langRes.Lang)
220 | domainRes.Country = country
221 | domainRes.Province = province
222 | domainRes.Category = category
223 | }
224 |
225 | // 标题摘要
226 | domainRes.Title = extract.WebTitle(doc, 0)
227 | domainRes.TitleClean = extract.WebTitleClean(domainRes.Title, langRes.Lang)
228 | domainRes.Description = extract.WebDescription(doc, 0)
229 |
230 | // 站内链接
231 | linkTitles, _ := extract.WebLinkTitles(doc, resp.RequestURL, true)
232 |
233 | // 链接分类
234 | links, subDomains := extract.LinkTypes(linkTitles, langRes.Lang, nil)
235 |
236 | domainRes.ContentCount = len(links.Content)
237 | domainRes.ListCount = len(links.List)
238 | domainRes.SubDomains = subDomains
239 |
240 | domainRes.State = true
241 |
242 | return domainRes, nil
243 | } else {
244 | return domainRes, errors.New("ErrorDocParse")
245 | }
246 | } else {
247 | if resp != nil {
248 | domainRes.StatusCode = resp.StatusCode
249 | }
250 | }
251 | }
252 |
253 | return domainRes, errors.New("ErrorDomainDetect")
254 | }
255 |
256 | func DetectFriendDomain(domain string, timeout int, retry int) (map[string]string, error) {
257 | if retry == 0 {
258 | retry = 1
259 | }
260 |
261 | friendDomains := make(map[string]string, 0)
262 |
263 | for i := 0; i < retry; i++ {
264 | friendDomains, err := DetectFriendDomainDo(domain, timeout)
265 | if err == nil {
266 | return friendDomains, err
267 | }
268 | }
269 |
270 | return friendDomains, errors.New("ErrorDomainDetect")
271 | }
272 |
273 | func DetectFriendDomainDo(domain string, timeout int) (map[string]string, error) {
274 | if timeout == 0 {
275 | timeout = 10000
276 | }
277 |
278 | friendDomains := make(map[string]string, 0)
279 |
280 | req := &HttpReq{
281 | HttpReq: &fun.HttpReq{
282 | MaxContentLength: 10 * 1024 * 1024,
283 | MaxRedirect: 3,
284 | },
285 | ForceTextContentType: true,
286 | }
287 |
288 | scheme := "http"
289 | homes := []string{"www", ""}
290 |
291 | for _, home := range homes {
292 |
293 | var urlStr string
294 | var homeDomain string
295 | if home != "" {
296 | homeDomain = home + fun.DOT + domain
297 | urlStr = scheme + "://" + homeDomain
298 | } else {
299 | homeDomain = domain
300 | urlStr = scheme + "://" + homeDomain
301 | }
302 |
303 | resp, err := HttpGetResp(urlStr, req, timeout)
304 |
305 | if resp != nil && err == nil && resp.Success {
306 |
307 | doc, docErr := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
308 | if docErr == nil {
309 | doc.Find(DefaultDocRemoveTags).Remove()
310 |
311 | // 非限制域名所有链接
312 | linkTitles, _ := extract.WebLinkTitles(doc, resp.RequestURL, false)
313 |
314 | if len(linkTitles) > 0 {
315 | for link, title := range linkTitles {
316 | if link == "" || title == "" {
317 | continue
318 | }
319 |
320 | u, e := fun.UrlParse(link)
321 | if e != nil {
322 | continue
323 | }
324 |
325 | // 验证非常规端口
326 | if u.Port() != "" {
327 | continue
328 | }
329 |
330 | // 验证主机名
331 | if fun.Matches(`\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}`, u.Hostname()) {
332 | continue
333 | }
334 |
335 | pathDir := strings.TrimSpace(u.Path)
336 | if pathDir == "" || pathDir == fun.SLASH || pathDir == "/index.html" || pathDir == "/index.htm" || pathDir == "/index.shtml" {
337 | hostname := u.Hostname()
338 | domainTop := extract.DomainTop(hostname)
339 | baseDomainTop := domain
340 | if domainTop != baseDomainTop {
341 | friendDomains[domainTop] = title
342 | }
343 | }
344 | }
345 | }
346 |
347 | return friendDomains, nil
348 | } else {
349 | return friendDomains, errors.New("ErrorDocParse")
350 | }
351 | } else {
352 | return friendDomains, err
353 | }
354 | }
355 |
356 | return friendDomains, errors.New("ErrorDomainDetect")
357 | }
358 |
--------------------------------------------------------------------------------
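A usage sketch for the two detection entry points above; the domain is a placeholder and only fields of DomainRes defined in this file are used:

package main

import (
	"fmt"

	spider "github.com/suosi-inc/go-pkg-spider"
)

func main() {
	// Probe a top-level domain: 10s timeout, 2 attempts.
	res, err := spider.DetectDomain("163.com", 10000, 2)
	if err != nil {
		fmt.Println("detect failed:", err)
		return
	}

	fmt.Println(res.Scheme, res.HomeDomain, res.StatusCode)
	fmt.Println(res.Lang.Lang, res.Charset.Charset, res.Country, res.Province, res.Icp)
	fmt.Println("content:", res.ContentCount, "list:", res.ListCount, "subdomains:", len(res.SubDomains))

	// Friend-link domains found on the home page (home-page links to other sites).
	if friends, err := spider.DetectFriendDomain("163.com", 10000, 1); err == nil {
		for domain, title := range friends {
			fmt.Println(domain, "=>", title)
		}
	}
}
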
/spider_test.go:
--------------------------------------------------------------------------------
1 | package spider
2 |
3 | import (
4 | "bytes"
5 | "crypto/tls"
6 | "fmt"
7 | "net/http"
8 | "net/url"
9 | "regexp"
10 | "strconv"
11 | "testing"
12 | "unicode/utf8"
13 |
14 | "github.com/PuerkitoBio/goquery"
15 | "github.com/microcosm-cc/bluemonday"
16 | "github.com/suosi-inc/go-pkg-spider/extract"
17 | "github.com/x-funs/go-fun"
18 | )
19 |
20 | func BenchmarkHtmlParse(b *testing.B) {
21 |
22 | resp, _ := fun.HttpGetResp("https://www.163.com", nil, 30000)
23 |
24 | b.ResetTimer()
25 | for i := 0; i < b.N; i++ {
26 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
27 | doc.Find(DefaultDocRemoveTags).Remove()
28 | }
29 | }
30 |
31 | func TestGoquery(t *testing.T) {
32 | body, _ := HttpGet("https://jp.news.cn/index.htm")
33 | doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(body))
34 |
35 | // lang, exist := doc.Find("html").Attr("id")
36 |
37 | doc.Find("script,noscript,style,iframe,br,link,svg,textarea").Remove()
38 | text := doc.Find("body").Text()
39 | text = fun.RemoveSign(text)
40 |
41 | fmt.Println(text)
42 | }
43 |
44 | func TestRegex(t *testing.T) {
45 | str := ",.!,,D_NAME。!;‘’”“《》**dfs#%^&()-+我1431221 中国123漢字かどうかのjavaを<決定>$¥"
46 | r := regexp.MustCompile(`[\p{Hiragana}|\p{Katakana}]`)
47 | s := r.FindAllString(str, -1)
48 | t.Log(str)
49 | t.Log(s)
50 | }
51 |
52 | func TestUrlParse(t *testing.T) {
53 | var urlStrs = []string{
54 | "https://www.163.com",
55 | "https://www.163.com/",
56 | "https://www.163.com/a",
57 | "https://www.163.com/aa.html",
58 | "https://www.163.com/a/b",
59 | "https://www.163.com/a/bb.html",
60 | "https://www.163.com/a/b/",
61 | "https://www.163.com/a/b/c",
62 | "https://www.163.com/a/b/cc.html",
63 | }
64 |
65 | for _, urlStr := range urlStrs {
66 | u, _ := url.Parse(urlStr)
67 | link := "javascript:;"
68 | absolute, err := u.Parse(link)
69 | t.Log(err)
70 |
71 | _, err = url.Parse(absolute.String())
72 | if err != nil {
73 | t.Log(err)
74 | }
75 |
76 | t.Log(urlStr + " + " + link + " => " + absolute.String())
77 | }
78 |
79 | }
80 |
81 | func TestCount(t *testing.T) {
82 | fmt.Println(regexLangHtmlPattern.MatchString("zh"))
83 | fmt.Println(regexLangHtmlPattern.MatchString("en"))
84 | fmt.Println(regexLangHtmlPattern.MatchString("zh-cn"))
85 | fmt.Println(regexLangHtmlPattern.MatchString("utf-8"))
86 |
87 | fmt.Println(utf8.RuneCountInString("https://khmers.cn/2022/05/23/%e6%b4%aa%e6%a3%ae%e6%80%bb%e7%90%86%ef%bc%9a%e6%9f%ac%e5%9f%94%e5%af%a8%e7%b4%af%e8%ae%a1%e8%8e%b7%e5%be%97%e8%b6%85%e8%bf%875200%e4%b8%87%e5%89%82%e6%96%b0%e5%86%a0%e7%96%ab%e8%8b%97%ef%bc%8c/"))
88 | }
89 |
90 | func TestGetLinkData(t *testing.T) {
91 | var urlStrs = []string{
92 | // "https://www.1905.com",
93 | // "https://www.people.com.cn",
94 | // "https://www.36kr.com",
95 | // "https://www.163.com",
96 | // "https://news.163.com/",
97 | // "http://jyj.suqian.gov.cn",
98 | // "https://www.huxiu.com/",
99 | // "http://www.news.cn/politicspro/",
100 | // "http://www.cankaoxiaoxi.com",
101 | // "http://www.bbc.com",
102 | // "https://www.ft.com",
103 | // "https://www.reuters.com/",
104 | // "https://nypost.com/",
105 | // "http://www.mengcheng.gov.cn/",
106 | // "https://www.chunichi.co.jp",
107 | // "https://www.donga.com/",
108 | // "https://people.com/",
109 | // "https://czql.gov.cn/",
110 | // "https://qiye.163.com/",
111 | // "https://www.washingtontimes.com/",
112 | // "https://www.gamersky.com/",
113 | // "https://www.cdns.com.tw/",
114 | // "http://www.163.com/",
115 |
116 | // "https://data.163.com",
117 | // "https://www.sensetime.com/cn/news-index",
118 | // "",
119 | "https://www.sis.gov.eg/section/7413/7410?lang=zh-cn",
120 | }
121 |
122 | for _, urlStr := range urlStrs {
123 |
124 | if linkData, err := GetLinkData(urlStr, false, 10000, 1); err == nil {
125 | fmt.Println("subDomain:", len(linkData.SubDomains))
126 | fmt.Println("content:", len(linkData.LinkRes.Content))
127 | fmt.Println("list:", len(linkData.LinkRes.List))
128 | fmt.Println("unknown:", len(linkData.LinkRes.Unknown))
129 | fmt.Println("none:", len(linkData.LinkRes.None))
130 |
131 | i := 0
132 | for a, title := range linkData.Filters {
133 | i = i + 1
134 | fmt.Println(i, "filter:"+a+"\t=>\t"+title)
135 | }
136 | i = 0
137 | for a, title := range linkData.SubDomains {
138 | i = i + 1
139 | fmt.Println(i, "subDomain:"+a+"\t=>\t"+strconv.FormatBool(title))
140 | }
141 | i = 0
142 | for a, title := range linkData.LinkRes.Content {
143 | i = i + 1
144 | fmt.Println(i, "content:"+a+"\t=>\t"+title)
145 | }
146 | i = 0
147 | for a, title := range linkData.LinkRes.Unknown {
148 | i = i + 1
149 | fmt.Println(i, "unknown:"+a+"\t=>\t"+title)
150 | }
151 | i = 0
152 | for a, title := range linkData.LinkRes.List {
153 | i = i + 1
154 | fmt.Println(i, "list:"+a+"\t=>\t"+title)
155 | }
156 | i = 0
157 | for a, title := range linkData.LinkRes.None {
158 | i = i + 1
159 | fmt.Println(i, "none:"+a+"\t=>\t"+title)
160 | }
161 | }
162 | }
163 | }
164 |
165 | func TestGetNews(t *testing.T) {
166 |
167 | var urlStrs = []string{
168 | // "http://www.cankaoxiaoxi.com/finance/20220831/2489264.shtml",
169 | // "https://www.163.com/news/article/HG3DE7AQ000189FH.html",
170 | // "http://suosi.com.cn/",
171 | // "http://www.cankaoxiaoxi.com/world/20220831/2489267.shtml",
172 | // "http://www.cankaoxiaoxi.com/photo/20220901/2489404.shtml",
173 | // "http://column.cankaoxiaoxi.com/2022/0831/2489330.shtml",
174 | // "http://www.gov.cn/xinwen/2022-08/31/content_5707661.htm",
175 | // "http://suosi.com.cn/2019/14.shtml",
176 | // "https://www.wangan.com/p/7fy78317feb66b37",
177 | // "https://www.wangan.com/news/7fy78y38c7207bf0",
178 | // "http://env.people.com.cn/n1/2022/0901/c1010-32516651.html",
179 | "http://com.gd.gov.cn/go/article.php?typeid=40&contentId=23262",
180 | // "http://www.changzhou.gov.cn/ns_news/827166202029392",
181 | // "https://www.163.com/money/article/HG4TRBL1002580S6.html?clickfrom=w_yw_money",
182 | // "https://mp.weixin.qq.com/s?__biz=MzUxODkxNTYxMA==&mid=2247484842&idx=1&sn=d9822ee4662523609aee7441066c2a96&chksm=f980d6dfcef75fc93cb1e7942cb16ec82a7fb7ec3c2d857c307766daff667bd63ab1b4941abd&exportkey=AXWfguuAyJjlOJgCHf10io8%3D&acctmode=0&pass_ticket=8eXqj",
183 | // "https://www.bbc.com/news/world-asia-62744522",
184 | // "https://www.sohu.com/a/581634395_121284943",
185 | // "https://edition.cnn.com/2022/01/30/europe/lithuania-took-on-china-intl-cmd/index.html",
186 | // "https://www.36kr.com/p/1897541916043649",
187 | // "https://www.huxiu.com/article/651531.html",
188 | // "http://www.news.cn/politics/2022-09/02/c_1128969463.htm",
189 | // "https://www.ccdi.gov.cn/yaowenn/202209/t20220901_215343.html",
190 | // "https://new.qq.com/omn/20200701/20200701A04H7500",
191 | // "http://v.china.com.cn/2022-09/06/content_78407150.html",
192 | // "http://www.chinagwy.org.cn/content-cat-10/143162.html",
193 | // "https://news.52pk.com/xwlm/201912/7366710.shtml",
194 | // "https://www.business-standard.com/article/finance/govt-rbi-propose-action-plan-for-facilitating-special-rupee-accounts-122090701260_1.html",
195 | // "https://www.squirepattonboggs.com/en/news/2022/09/squire-patton-boggs-advises-new-wave-group-ab-on-uk-acquisition",
196 | // "https://www.thebulletin.be/number-road-deaths-belgium-rises-sharply",
197 | // "https://www.dailyexpress.com.my/read/4840/ma63-zero-without-equitable-economic-partnership/",
198 | // "https://news.cgtn.com/news/2022-08-20/CGTN-documentary-Remote-Killing-released-1cE7t7RD104/index.html",
199 | // "https://www.sensetime.com/en/news-detail/51164633?categoryId=1072",
200 | }
201 |
202 | for _, urlStr := range urlStrs {
203 | if news, resp, err := GetNews(urlStr, "", 10000, 1); err == nil {
204 | t.Log(resp.Charset)
205 | t.Log(news.Lang)
206 | t.Log(news.Spend)
207 | t.Log(news.Title)
208 | t.Log(news.TitlePos)
209 | t.Log(news.TimeLocal)
210 | t.Log(news.Time)
211 | t.Log(news.TimePos)
212 | t.Log(news.Content)
213 |
214 | if news.ContentNode != nil {
215 | // 内容 html 节点
216 | node := goquery.NewDocumentFromNode(news.ContentNode)
217 | contentHtml, _ := node.Html()
218 | t.Log(fun.NormaliseLine(contentHtml))
219 |
220 | // 内容 html 节点清理, 仅保留 p img 标签
221 | p := bluemonday.NewPolicy()
222 | p.AllowElements("p")
223 | p.AllowImages()
224 | html := p.Sanitize(contentHtml)
225 | t.Log(fun.NormaliseLine(html))
226 | }
227 | }
228 | }
229 | }
230 |
231 | func TestGetNewsWithReq(t *testing.T) {
232 | transport := &http.Transport{
233 | TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
234 | DisableKeepAlives: true,
235 | }
236 | proxyString := "http://username:password@host:port"
237 | proxy, _ := url.Parse(proxyString)
238 | transport.Proxy = http.ProxyURL(proxy)
239 |
240 | req := &HttpReq{
241 | HttpReq: &fun.HttpReq{
242 | MaxContentLength: HttpDefaultMaxContentLength,
243 | MaxRedirect: 2,
244 | Transport: transport,
245 | },
246 | ForceTextContentType: true,
247 | }
248 |
249 | var urlStrs = []string{
250 | "https://www.bbc.com/news/world-asia-62744522",
251 | }
252 |
253 | for _, urlStr := range urlStrs {
254 | if news, resp, err := GetNewsWithReq(urlStr, "", req, 10000, 1); err == nil {
255 | t.Log(resp.Charset)
256 | t.Log(news.Spend)
257 | t.Log(news.Title)
258 | t.Log(news.TitlePos)
259 | t.Log(news.TimeLocal)
260 | t.Log(news.Time)
261 | t.Log(news.TimePos)
262 | t.Log(news.Content)
263 |
264 | if news.ContentNode != nil {
265 | // 内容 html 节点
266 | node := goquery.NewDocumentFromNode(news.ContentNode)
267 | contentHtml, _ := node.Html()
268 | t.Log(fun.NormaliseLine(contentHtml))
269 |
270 | // 内容 html 节点清理, 仅保留 p img 标签
271 | p := bluemonday.NewPolicy()
272 | p.AllowElements("p")
273 | p.AllowImages()
274 | html := p.Sanitize(contentHtml)
275 | t.Log(fun.NormaliseLine(html))
276 | }
277 | }
278 | }
279 | }
280 |
281 | func TestDemo(t *testing.T) {
282 | a := "2022-05-26 17:00:57 UTC"
283 | findString := regexp.MustCompile(extract.RegexPublishDate).FindStringSubmatch(a)
284 | t.Log(findString)
285 | t.Log(fun.Date(fun.StrToTime("2022-04-10T18:24:00")))
286 | }
287 |
--------------------------------------------------------------------------------
/lang.go:
--------------------------------------------------------------------------------
1 | package spider
2 |
3 | import (
4 | "regexp"
5 | "strings"
6 | "unicode/utf8"
7 |
8 | "github.com/PuerkitoBio/goquery"
9 | "github.com/suosi-inc/go-pkg-spider/extract"
10 | "github.com/suosi-inc/lingua-go"
11 | "github.com/x-funs/go-fun"
12 | )
13 |
14 | const (
15 | LangPosCharset = "charset"
16 | LangPosHtmlTag = "html"
17 | LangPosBody = "body"
18 | LangPosLingua = "lingua"
19 | LangPosTitleZh = "title"
20 | BodyChunkSize = 2048
21 | BodyMinSize = 64
22 |
23 | RegexLangHtml = "^(?i)([a-z]{2}|[a-z]{2}\\-[a-z]+)$"
24 | )
25 |
26 | var (
27 | CharsetLangMap = map[string]string{
28 | "GBK": "zh",
29 | "Big5": "zh",
30 | "ISO-2022-CN": "zh",
31 | "SHIFT_JIS": "ja",
32 | "KOI8-R": "ru",
33 | "EUC-JP": "ja",
34 | "EUC-KR": "ko",
35 | "EUC-CN": "zh",
36 | "ISO-2022-JP": "ja",
37 | "ISO-2022-KR": "ko",
38 | }
39 |
40 | LangEnZhMap = map[string]string{
41 | "zh": "中文",
42 | "en": "英语",
43 | "ja": "日语",
44 | "ru": "俄语",
45 | "ko": "韩语",
46 | "ar": "阿拉伯语",
47 | "hi": "印地语",
48 | "de": "德语",
49 | "fr": "法语",
50 | "es": "西班牙语",
51 | "pt": "葡萄牙语",
52 | "it": "意大利语",
53 | "th": "泰语",
54 | "vi": "越南语",
55 | "my": "缅甸语",
56 | }
57 |
58 | LangZhEnMap = map[string]string{
59 | "中文": "zh",
60 | "英语": "en",
61 | "日语": "ja",
62 | "俄语": "ru",
63 | "韩语": "ko",
64 | "阿拉伯语": "ar",
65 | "印地语": "hi",
66 | "德语": "de",
67 | "法语": "fr",
68 | "西班牙语": "es",
69 | "葡萄牙语": "pt",
70 | "意大利语": "it",
71 | "泰语": "th",
72 | "越南语": "vi",
73 | "缅甸语": "my",
74 | }
75 |
76 | langMetaSelectors = []string{
77 | "meta[http-equiv='content-language' i]",
78 | "meta[name='lang' i]",
79 | }
80 |
81 | linguaLanguages = []lingua.Language{
82 | lingua.Arabic,
83 | lingua.Russian,
84 | lingua.Hindi,
85 | lingua.Korean,
86 | }
87 |
88 | linguaLatinLanguages = []lingua.Language{
89 | lingua.French,
90 | lingua.German,
91 | lingua.Spanish,
92 | lingua.Portuguese,
93 | lingua.English,
94 | }
95 |
96 | linguaMap = map[string]string{
97 | "arabic": "ar",
98 | "russian": "ru",
99 | "hindi": "hi",
100 | "korean": "ko",
101 | "french": "fr",
102 | "german": "de",
103 | "spanish": "es",
104 | "portuguese": "pt",
105 | "english": "en",
106 | }
107 |
108 | regexLangHtmlPattern = regexp.MustCompile(RegexLangHtml)
109 | regexPuncsPattern = regexp.MustCompile(`[\pP\pS]`)
110 | regexEnPattern = regexp.MustCompile(`[a-zA-Z]`)
111 | regexLatinPattern = regexp.MustCompile("[\u0080-\u00ff]")
112 | regexZhPattern = regexp.MustCompile(`\p{Han}`)
113 | 	regexJaPattern       = regexp.MustCompile(`[\p{Hiragana}\p{Katakana}]`)
114 | regexKoPattern = regexp.MustCompile(`\p{Hangul}`)
115 | )
116 |
117 | type LangRes struct {
118 | Lang string
119 | LangPos string
120 | }
121 |
122 | // LangText 探测纯文本语种
123 | func LangText(text string) (string, string) {
124 | return langFromText(text)
125 | }
126 |
127 | // Lang 探测 HTML 语种
128 | func Lang(doc *goquery.Document, charset string, listMode bool) LangRes {
129 | var res LangRes
130 | var lang string
131 |
132 | // 如果存在特定语言的 charset 对照表, 则直接返回
133 | if charset != "" {
134 | if _, exist := CharsetLangMap[charset]; exist {
135 | res.Lang = CharsetLangMap[charset]
136 | res.LangPos = LangPosCharset
137 | return res
138 | }
139 | }
140 |
141 | 	// 优先判断 Title 是否包含汉字, 再结合正文内容排除日语/韩语
142 | titleLang, pos := LangFromTitle(doc, listMode)
143 | if titleLang != "" {
144 | res.Lang = titleLang
145 | res.LangPos = pos
146 | return res
147 | }
148 |
149 | // 解析 Html 语言属性, 当不为空不为 en 时可信度比较高, 直接返回
150 | lang = LangFromHtml(doc)
151 | if lang != "" && lang != "en" {
152 | res.Lang = lang
153 | res.LangPos = LangPosHtmlTag
154 | return res
155 | }
156 |
157 | // 当 utf 编码时, lang 为空或 en 可信度比较低, 进行基于内容语种的检测
158 | if strings.HasPrefix(charset, "UTF") && (lang == "" || lang == "en") {
159 | bodyLang, pos := LangFromUtf8Body(doc, listMode)
160 | if bodyLang != "" {
161 | res.Lang = bodyLang
162 | res.LangPos = pos
163 | }
164 | }
165 |
166 | return res
167 | }
168 |
169 | func LangFromHtml(doc *goquery.Document) string {
170 | var lang string
171 |
172 | // html lang
173 | if lang, exists := doc.Find("html").Attr("lang"); exists {
174 | lang = strings.TrimSpace(lang)
175 | if regexLangHtmlPattern.MatchString(lang) {
176 | lang = fun.SubString(lang, 0, 2)
177 | return lang
178 | }
179 | }
180 | if lang, exists := doc.Find("html").Attr("xml:lang"); exists {
181 | lang = strings.TrimSpace(lang)
182 | if regexLangHtmlPattern.MatchString(lang) {
183 | lang = fun.SubString(lang, 0, 2)
184 | return lang
185 | }
186 |
187 | }
188 | for _, selector := range langMetaSelectors {
189 | if lang, exists := doc.Find(selector).Attr("content"); exists {
190 | lang = strings.TrimSpace(lang)
191 | if regexLangHtmlPattern.MatchString(lang) {
192 | lang = fun.SubString(lang, 0, 2)
193 | return lang
194 | }
195 | }
196 | }
197 |
198 | return lang
199 | }
200 | func LangFromTitle(doc *goquery.Document, listMode bool) (string, string) {
201 | var lang string
202 | var text string
203 |
204 | // 获取 Title
205 | title := extract.WebTitle(doc, 0)
206 | text = fun.RemoveSign(title)
207 | text = strings.TrimSpace(text)
208 |
209 | if text != "" {
210 | // 首先判断标题是否包含汉字
211 | han := regexZhPattern.FindAllString(text, -1)
212 | if han != nil {
213 | hanCount := len(han)
214 |
215 | // 汉字数量 >=2
216 | if hanCount >= 2 {
217 |
218 | // 需要抽取内容验证包含有日语韩语, 如(日本語_新華網)
219 | bodyText := bodyTextForLang(doc, listMode)
220 |
221 | // 去除所有符号
222 | bodyText = fun.RemoveSign(bodyText)
223 |
224 | // 最大截取 BodyChunkSize 个字符
225 | bodyText = fun.SubString(bodyText, 0, BodyChunkSize)
226 | bodyText = strings.TrimSpace(bodyText)
227 |
228 | bodyTextCount := utf8.RuneCountInString(bodyText)
229 |
230 | // 包含一定的日语
231 | ja := regexJaPattern.FindAllString(bodyText, -1)
232 | if ja != nil {
233 | jaCount := len(ja)
234 | jaRate := float64(jaCount) / float64(bodyTextCount)
235 |
236 | // 日语出现比例
237 | if jaRate > 0.2 {
238 | lang = "ja"
239 | return lang, LangPosTitleZh
240 | }
241 | }
242 |
243 | // 包含一定的韩语
244 | ko := regexKoPattern.FindAllString(bodyText, -1)
245 | if ko != nil {
246 | koCount := len(ko)
247 | koRate := float64(koCount) / float64(bodyTextCount)
248 |
249 | // 韩语出现比例
250 | if koRate > 0.2 {
251 | lang = "ko"
252 | return lang, LangPosTitleZh
253 | }
254 | }
255 |
256 | lang = "zh"
257 | return lang, LangPosTitleZh
258 | }
259 | }
260 | }
261 |
262 | return lang, ""
263 | }
264 |
265 | func LangFromUtf8Body(doc *goquery.Document, listMode bool) (string, string) {
266 | var text string
267 |
268 | // 抽取内容
269 | text = bodyTextForLang(doc, listMode)
270 |
271 | return langFromText(text)
272 | }
273 |
274 | func langFromText(text string) (string, string) {
275 | var lang string
276 |
277 | // 去除换行(为了保留语义只替换多余的空格)
278 | text = fun.RemoveLines(text)
279 | text = strings.ReplaceAll(text, fun.TAB, "")
280 | text = strings.ReplaceAll(text, " ", "")
281 |
282 | // 去除符号
283 | text = regexPuncsPattern.ReplaceAllString(text, "")
284 |
285 | // 最大截取 BodyChunkSize 个字符
286 | text = fun.SubString(text, 0, BodyChunkSize)
287 | text = strings.TrimSpace(text)
288 |
289 | // 截取后的字符长度
290 | textCount := utf8.RuneCountInString(text)
291 |
292 | // 内容太少不足以判断语言, 放弃
293 | if textCount < BodyMinSize {
294 | return "", ""
295 | }
296 |
297 | // 首先判断是否包含汉字, 中文和日语
298 | han := regexZhPattern.FindAllString(text, -1)
299 | if han != nil {
300 | hanCount := len(han)
301 | hanRate := float64(hanCount) / float64(textCount)
302 |
303 | // 汉字比例
304 | if hanRate >= 0.3 {
305 | ja := regexJaPattern.FindAllString(text, -1)
306 | if ja != nil {
307 | jaCount := len(ja)
308 | jaRate := float64(jaCount) / float64(hanCount)
309 |
310 | // 日语在汉字中的占比
311 | if jaRate > 0.1 {
312 | lang = "ja"
313 | return lang, LangPosBody
314 | }
315 | }
316 |
317 | lang = "zh"
318 | return lang, LangPosBody
319 | }
320 | }
321 |
322 | // 其次判断拉丁语系, 分析主要的一些语种
323 | english := regexEnPattern.FindAllString(text, -1)
324 | if english != nil {
325 | englishCount := len(english)
326 | englishRate := float64(englishCount) / float64(textCount)
327 | if englishRate > 0.618 {
328 |
329 | // 包含拉丁补充字符集, 使用 lingua 分析主要的非英语拉丁语种
330 | latin := regexLatinPattern.FindAllString(text, -1)
331 | if latin != nil {
332 | latinCount := len(latin)
333 |
334 | if latinCount > 5 {
335 | detector := lingua.NewLanguageDetectorBuilder().FromLanguages(linguaLatinLanguages...).Build()
336 | if language, exists := detector.DetectLanguageOf(text); exists {
337 | key := strings.ToLower(language.String())
338 | linguaLang := linguaMap[key]
339 | return linguaLang, LangPosLingua
340 | }
341 | }
342 | }
343 |
344 | return "en", LangPosBody
345 | }
346 | }
347 |
348 | // 最后, 使用 lingua 分析其他主要的非拉丁语种
349 | detector := lingua.NewLanguageDetectorBuilder().FromLanguages(linguaLanguages...).Build()
350 | if language, exists := detector.DetectLanguageOf(text); exists {
351 |
352 | key := strings.ToLower(language.String())
353 | linguaLang := linguaMap[key]
354 | return linguaLang, LangPosLingua
355 | }
356 |
357 | return lang, ""
358 | }
359 |
360 | func bodyTextForLang(doc *goquery.Document, listMode bool) string {
361 | var text string
362 |
363 | // 列表页模式
364 | if listMode {
365 | 		// 优先获取网页中最多 64 个 a 标签, 如果没有 a 标签或数量过少, 放弃
366 | aTag := doc.Find("a")
367 | aTagSize := aTag.Size()
368 | if aTagSize >= 16 {
369 | sliceMax := fun.Min(aTagSize, 64)
370 | text = aTag.Slice(0, sliceMax).Text()
371 |
372 | // 如果 a 标签中包含过多的 {} 可能是动态渲染, 放弃
373 | if strings.Count(text, "{") >= 5 && strings.Count(text, "}") >= 5 {
374 | return ""
375 | }
376 | }
377 | } else {
378 | // 内容页模式, 获取网页中最多 64 个 p 标签
379 | pTag := doc.Find("p")
380 | pTagSize := pTag.Size()
381 | sliceMax := fun.Min(pTagSize, 64)
382 | text = pTag.Slice(0, sliceMax).Text()
383 |
384 | // 如果内容太少, 获取全部 body 文本
385 | textCount := utf8.RuneCountInString(text)
386 | if textCount < BodyMinSize {
387 | text = doc.Find("body").Text()
388 | }
389 | }
390 |
391 | return text
392 | }
393 |
--------------------------------------------------------------------------------
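A short sketch of the language detection entry points above: LangText needs at least BodyMinSize (64) characters after cleaning, and Lang combines the charset table, the title, the html attributes and the body text. Module path and URL are assumptions for illustration:

package main

import (
	"bytes"
	"fmt"
	"strings"

	"github.com/PuerkitoBio/goquery"
	spider "github.com/suosi-inc/go-pkg-spider"
)

func main() {
	// Plain-text detection: repeat the sample so it clears the 64-character minimum.
	text := strings.Repeat("这是一段用于演示语种探测的中文示例文本。", 5)
	lang, pos := spider.LangText(text)
	fmt.Println(lang, pos)

	// HTML detection: charset comes from the HTTP layer, listMode=true for home/list pages.
	resp, err := spider.HttpGetResp("https://www.163.com", nil, 10000)
	if err != nil || !resp.Success {
		return
	}
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
	if err != nil {
		return
	}
	doc.Find(spider.DefaultDocRemoveTags).Remove()

	res := spider.Lang(doc, resp.Charset.Charset, true)
	fmt.Println(res.Lang, res.LangPos)
}
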
/extract/link.go:
--------------------------------------------------------------------------------
1 | package extract
2 |
3 | import (
4 | "net/url"
5 | "path"
6 | "regexp"
7 | "strings"
8 | "unicode/utf8"
9 |
10 | "github.com/x-funs/go-fun"
11 | )
12 |
13 | const (
14 | LinkTypeNone LinkType = 0
15 | LinkTypeContent LinkType = 1
16 | LinkTypeList LinkType = 2
17 | LinkTypeUnknown LinkType = 3
18 |
19 | RegexUrlPublishDate = `(20[2-3]\d{1}[/]?(0[1-9]|1[0-2]|[1-9])[/]?(0[1-9]|[1-2][0-9]|3[0-1]|[1-9])?)`
20 |
21 | RegexIndexSuffix = `^/index\.(html|shtml|htm|php|asp|aspx|jsp)$`
22 |
23 | RegexTitleZhBlack = "(经营|制作|信息服务|出版|出版服务|演出|视听节目|新闻|视听|新网)许可证"
24 | )
25 |
26 | var (
27 | zhPuncs = []string{",", "。", ";", ":", "?", "!", "(", ")", "“", "”"}
28 |
29 | wordLangs = []string{"en", "ru", "ar", "de", "fr", "es", "pt"}
30 |
31 | zhEnTitles = []string{"nba", "cba", "5g", "ai", "it", "ipo"}
32 |
33 | regexUrlPublishDatePattern = regexp.MustCompile(RegexUrlPublishDate)
34 |
35 | regexZhPattern = regexp.MustCompile(`\p{Han}`)
36 |
37 | regexEnPattern = regexp.MustCompile(`[a-zA-Z]`)
38 |
39 | regexPuncPattern = regexp.MustCompile(`\pP`)
40 |
41 | regexTitleZhBlackPattern = regexp.MustCompile(RegexTitleZhBlack)
42 |
43 | regexIndexSuffixPattern = regexp.MustCompile(RegexIndexSuffix)
44 | )
45 |
46 | type LinkType int
47 |
48 | type LinkTypeRule map[string][]string
49 |
50 | type LinkRes struct {
51 | // 内容页
52 | Content map[string]string
53 | // 列表页
54 | List map[string]string
55 | // 未知链接
56 | Unknown map[string]string
57 | // 过滤链接
58 | None map[string]string
59 | }
60 |
61 | // LinkTypes 返回链接分类结果
62 | func LinkTypes(linkTitles map[string]string, lang string, rules LinkTypeRule) (*LinkRes, map[string]bool) {
63 | linkRes := &LinkRes{
64 | Content: make(map[string]string),
65 | List: make(map[string]string),
66 | Unknown: make(map[string]string),
67 | None: make(map[string]string),
68 | }
69 |
70 | subDomains := make(map[string]bool)
71 |
72 | // 统计数据
73 | var contentPublishCount int
74 | contentTopPaths := make(map[string]int)
75 |
76 | for link, title := range linkTitles {
77 | if linkUrl, err := fun.UrlParse(link); err == nil {
78 | hostname := linkUrl.Hostname()
79 | domainTop := DomainTop(hostname)
80 | if hostname != domainTop {
81 | subDomains[hostname] = true
82 | }
83 |
84 | // Automatic mode without rules
85 | if rules == nil {
86 | linkType := LinkIsContentByTitle(linkUrl, title, lang)
87 | switch linkType {
88 | case LinkTypeContent:
89 | linkRes.Content[link] = title
90 |
91 | // Count content-page URL paths that carry a publish-date feature
92 | pathDir := path.Dir(strings.TrimSpace(linkUrl.Path))
93 | pathClean := pathDirClean(pathDir)
94 | if regexUrlPublishDatePattern.MatchString(pathClean) {
95 | contentPublishCount++
96 | }
97 |
98 | // Count content-page URL top-level path segments
99 | paths := fun.SplitTrim(pathDir, fun.SLASH)
100 | if len(paths) > 0 {
101 | pathIndex := paths[0]
102 | contentTopPaths[pathIndex]++
103 | }
104 | case LinkTypeList:
105 | linkRes.List[link] = title
106 | case LinkTypeNone:
107 | linkRes.None[link] = title
108 | case LinkTypeUnknown:
109 | linkRes.Unknown[link] = title
110 | }
111 | } else {
112 | // Rule-based matching mode
113 | if LinkIsContentByRegex(linkUrl, rules) {
114 | linkRes.Content[link] = title
115 | } else {
116 | // No path or the default path; should be handled at the domain level
117 | pathDir := strings.TrimSpace(linkUrl.Path)
118 | if pathDir == "" || pathDir == fun.SLASH || regexIndexSuffixPattern.MatchString(pathDir) {
119 | linkRes.None[link] = title
120 | } else {
121 | linkRes.List[link] = title
122 | }
123 | }
124 | }
125 | }
126 | }
127 |
128 | // Classify further based on content-page URL path feature statistics
129 | if rules == nil {
130 | linkRes = linkTypePathProcess(linkRes, contentTopPaths, contentPublishCount)
131 | }
132 |
133 | // Final cleanup
134 | linkRes = linkClean(linkRes, lang)
135 |
136 | return linkRes, subDomains
137 | }
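// linkTypesExample is a hypothetical usage sketch, not part of the original
// file; the URLs and titles are assumptions. Passing nil rules selects the
// automatic, title-based classification implemented above.
func linkTypesExample() (*LinkRes, map[string]bool) {
	linkTitles := map[string]string{
		"https://news.example.com/politics/2022/09/02/12345.html": "这是一条长度超过八个字的中文新闻标题",
		"https://news.example.com/sports/":                        "体育频道",
	}
	// With lang "zh", the first link ends up in Content and the second in
	// List; the subdomain hostname is collected separately.
	return LinkTypes(linkTitles, "zh", nil)
}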
138 |
139 | func linkClean(linkRes *LinkRes, lang string) *LinkRes {
140 | if lang == "zh" {
141 | contentCount := len(linkRes.Content)
142 | if contentCount > 0 {
143 | for link, title := range linkRes.Content {
144 | if regexTitleZhBlackPattern.MatchString(title) {
145 | linkRes.None[link] = title
146 | delete(linkRes.Content, link)
147 | }
148 | }
149 | }
150 | }
151 |
152 | return linkRes
153 | }
154 |
155 | func linkTypePathProcess(linkRes *LinkRes, contentTopPaths map[string]int, contentPublishCount int) *LinkRes {
156 | // Statistics
157 | contentCount := len(linkRes.Content)
158 | listCount := len(linkRes.List)
159 | unknownCount := len(linkRes.Unknown)
160 |
161 | // Ratio of content-page URL paths with a publish-date feature
162 | publishProb := float32(contentPublishCount) / float32(contentCount)
163 |
164 | // Content-page URL path prefixes with a large share, top 2 at most
165 | topPaths := make([]string, 0)
166 | if contentCount >= 8 {
167 | for topPath, stat := range contentTopPaths {
168 | if stat > 1 {
169 | prob := float32(stat) / float32(contentCount)
170 | if prob > 0.4 {
171 | topPaths = append(topPaths, topPath)
172 | }
173 | }
174 | }
175 | }
176 |
177 | // Content-page URL paths show a strong publish-date feature ratio; reprocess List and Unknown
178 | if publishProb > 0.7 {
179 | if listCount > 0 {
180 | for link, title := range linkRes.List {
181 | linkUrl, _ := fun.UrlParse(link)
182 | pathDir := path.Dir(strings.TrimSpace(linkUrl.Path))
183 | pathClean := pathDirClean(pathDir)
184 | if regexUrlPublishDatePattern.MatchString(pathClean) {
185 | // Only add it after checking the title length
186 | titleLen := utf8.RuneCountInString(title)
187 | if titleLen >= 2 {
188 | linkRes.Content[link] = title
189 | delete(linkRes.List, link)
190 | }
191 | }
192 | }
193 | }
194 | if unknownCount > 0 {
195 | for link, title := range linkRes.Unknown {
196 | linkUrl, _ := fun.UrlParse(link)
197 | pathDir := path.Dir(strings.TrimSpace(linkUrl.Path))
198 | pathClean := pathDirClean(pathDir)
199 | if regexUrlPublishDatePattern.MatchString(pathClean) {
200 | // Only add it after checking the title length
201 | titleLen := utf8.RuneCountInString(title)
202 | if titleLen >= 2 {
203 | linkRes.Content[link] = title
204 | } else {
205 | linkRes.List[link] = title
206 | }
207 | } else {
208 | linkRes.List[link] = title
209 | }
210 | delete(linkRes.Unknown, link)
211 | }
212 | }
213 | } else if len(topPaths) > 0 && unknownCount > 0 {
214 | // Content-page URL paths share a prefix feature; reprocess Unknown
215 | for link, title := range linkRes.Unknown {
216 | linkUrl, _ := fun.UrlParse(link)
217 |
218 | pathDir := path.Dir(strings.TrimSpace(linkUrl.Path))
219 | paths := fun.SplitTrim(pathDir, fun.SLASH)
220 | if len(paths) > 0 {
221 | pathIndex := paths[0]
222 | if fun.SliceContains(topPaths, pathIndex) {
223 | // Only add it after checking the title length
224 | titleLen := utf8.RuneCountInString(title)
225 | if titleLen >= 2 {
226 | linkRes.Content[link] = title
227 | } else {
228 | linkRes.List[link] = title
229 | }
230 | } else {
231 | linkRes.List[link] = title
232 | }
233 | delete(linkRes.Unknown, link)
234 | }
235 | }
236 | }
237 |
238 | // Paths show a clear feature; clean content pages that have no path
239 | if contentCount > 0 && (publishProb > 0.7 || len(topPaths) > 0) {
240 | for link, title := range linkRes.Content {
241 | linkUrl, _ := fun.UrlParse(link)
242 | pathStr := strings.TrimSpace(linkUrl.Path)
243 | pathDir := path.Dir(pathStr)
244 | paths := fun.SplitTrim(pathDir, fun.SLASH)
245 | if pathStr == "" || pathStr == "/" || len(paths) == 0 {
246 | linkRes.Unknown[link] = title
247 | delete(linkRes.Content, link)
248 | }
249 | }
250 | }
251 |
252 | return linkRes
253 | }
254 |
255 | func LinkIsContentByRegex(linkUrl *url.URL, rules LinkTypeRule) bool {
256 | hostname := linkUrl.Hostname()
257 | domainTop := DomainTop(hostname)
258 |
259 | if _, exist := rules[hostname]; exist {
260 | for _, regex := range rules[hostname] {
261 | if fun.Matches(linkUrl.String(), regex) {
262 | return true
263 | }
264 | }
265 | } else if _, exist := rules[domainTop]; exist {
266 | for _, regex := range rules[domainTop] {
267 | if fun.Matches(linkUrl.String(), regex) {
268 | return true
269 | }
270 | }
271 | }
272 |
273 | return false
274 | }
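// linkRuleExample is a hypothetical usage sketch, not part of the original
// file; the domain and pattern are assumptions. Rules are keyed by hostname or
// by top-level (registrable) domain and matched against the full URL.
func linkRuleExample() bool {
	rules := LinkTypeRule{
		"example.com": {`/news/\d+\.html$`},
	}
	u, _ := url.Parse("https://www.example.com/news/12345.html")
	// No rule exists for the exact hostname, so matching falls through to the
	// top-level domain entry.
	return LinkIsContentByRegex(u, rules)
}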
275 |
276 | func LinkIsContentByTitle(linkUrl *url.URL, title string, lang string) LinkType {
277 | link := linkUrl.String()
278 |
279 | if utf8.RuneCountInString(link) > 255 {
280 | return LinkTypeNone
281 | }
282 |
283 | // No path or the default path; should be handled at the domain level
284 | pathDir := strings.TrimSpace(linkUrl.Path)
285 | if pathDir == "" || pathDir == fun.SLASH || regexIndexSuffixPattern.MatchString(pathDir) {
286 | return LinkTypeNone
287 | }
288 |
289 | if lang == "zh" {
290 | // Chinese
291 | zhs := regexZhPattern.FindAllString(title, -1)
292 | hanCount := len(zhs)
293 |
294 | // Must contain Chinese characters to be a possible content page
295 | if hanCount > 0 {
296 | // Content-page titles have more than 5 Chinese characters
297 | if hanCount > 5 {
298 |
299 | // Remove spaces
300 | title = strings.ReplaceAll(title, fun.SPACE, "")
301 | titleLen := utf8.RuneCountInString(title)
302 |
303 | // A length >= 8 is judged to be a content-page URL
304 | if titleLen >= 8 {
305 | return LinkTypeContent
306 | } else if titleLen < 8 {
307 |
308 | // For Chinese, check whether it contains common punctuation
309 | if lang == "zh" {
310 | if fun.ContainsAny(title, zhPuncs...) {
311 | return LinkTypeContent
312 | }
313 | }
314 | return LinkTypeUnknown
315 | }
316 | } else {
317 | return LinkTypeList
318 | }
319 | } else {
320 | // No Chinese; do a simple match against the English title dictionary
321 | if fun.SliceContains(zhEnTitles, strings.ToLower(title)) {
322 | return LinkTypeList
323 | }
324 |
325 | return LinkTypeNone
326 | }
327 |
328 | } else if fun.SliceContains(wordLangs, lang) {
329 | // 英语等单词类的语种
330 | // 去掉所有标点
331 | title = regexPuncPattern.ReplaceAllString(title, "")
332 |
333 | ens := regexEnPattern.FindAllString(title, -1)
334 | enCount := len(ens)
335 |
336 | // Must contain Latin letters
337 | if enCount > 0 {
338 | // Split on spaces to count words
339 | words := fun.SplitTrim(title, fun.SPACE)
340 |
341 | // At least 5 words
342 | if len(words) >= 5 {
343 | return LinkTypeContent
344 | } else {
345 | return LinkTypeList
346 | }
347 | } else {
348 | return LinkTypeNone
349 | }
350 | } else {
351 | // Other languages: strip punctuation and measure the length
352 | title = regexPuncPattern.ReplaceAllString(title, "")
353 |
354 | titleLen := utf8.RuneCountInString(title)
355 | if titleLen >= 8 {
356 | return LinkTypeContent
357 | } else if titleLen < 8 {
358 | // TODO other rules
359 | return LinkTypeList
360 | }
361 | }
362 |
363 | return LinkTypeNone
364 | }
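// Hypothetical illustration, not part of the original file; the URL and the
// titles are assumptions. It sketches the title-length heuristics above:
//
//	u, _ := url.Parse("https://example.com/politics/12345.html")
//	LinkIsContentByTitle(u, "这是一条超过八个汉字的新闻标题", "zh") // LinkTypeContent
//	LinkIsContentByTitle(u, "体育频道", "zh")                     // LinkTypeList
//	LinkIsContentByTitle(u, "Short title here", "en")             // LinkTypeList (fewer than 5 words)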
365 |
366 | func pathDirClean(pathDir string) string {
367 | pathClean := strings.ReplaceAll(pathDir, fun.DOT, "")
368 | pathClean = strings.ReplaceAll(pathClean, fun.DASH, "")
369 | pathClean = strings.ReplaceAll(pathClean, fun.UNDERSCORE, "")
370 |
371 | return pathClean
372 | }
373 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/extract/content.go:
--------------------------------------------------------------------------------
1 | // Package extract extracts news elements (title, publish time, content), with extensive optimizations on top of the CEPF algorithm
2 | // Refer to: 基于标签路径特征融合新闻内容抽取的 CEPF 算法 (吴共庆等) http://www.jos.org.cn/jos/article/abstract/4868
3 | package extract
4 |
5 | import (
6 | "bytes"
7 | "log"
8 | "math"
9 | "path"
10 | "regexp"
11 | "strings"
12 | "unicode/utf8"
13 |
14 | "github.com/PuerkitoBio/goquery"
15 | "github.com/x-funs/go-fun"
16 | "golang.org/x/net/html"
17 | )
18 |
19 | const (
20 | ContentRemoveTags = "script,noscript,style,iframe,br,link,svg,textarea"
21 |
22 | // RegexPublishDate is the complete publish-datetime pattern
23 | RegexPublishDate = "(((20[1-3]\\d{1})[-/年.])(0[1-9]|1[0-2]|[1-9])[-/月.](0[1-9]|[1-2][0-9]|3[0-1]|[1-9])[日Tt]?[ ]{0,3}(([0-9]|[0-1][0-9]|2[0-3]|[1-9])[:点时]([0-5][0-9]|[0-9])[:分]?(([0-5][0-9]|[0-9])[秒]?)?((\\.\\d{3})?)(z|Z|[\\+-]\\d{2}[:]?\\d{2})?)?)"
24 |
25 | // RegexPublishShortDate is the publish-datetime pattern with an abbreviated year, e.g. 22-09-02 11:11:11
26 | RegexPublishShortDate = "(((20[1-3]\\d{1}|[1-3]\\d{1})[-/年.])(0[1-9]|1[0-2]|[1-9])[-/月.](0[1-9]|[1-2][0-9]|3[0-1]|[1-9])[日Tt]?[ ]{0,3}(([0-9]|[0-1][0-9]|2[0-3]|[1-9])[:点时]([0-5][0-9]|[0-9])[:分]?(([0-5][0-9]|[0-9])[秒]?)?((\\.\\d{3})?)(z|Z|[\\+-]\\d{2}[:]?\\d{2})?)?)"
27 |
28 | // RegexPublishDateNoYear is the publish-date pattern without a year (lower priority), e.g. 09-02
29 | RegexPublishDateNoYear = "((0[1-9]|1[0-2]|[1-9])[-/月.](0[1-9]|[1-2][0-9]|3[0-1]|[1-9])[日Tt]?[ ]{0,3}(([0-9]|[0-1][0-9]|2[0-3]|[1-9])[:点时]([0-5][0-9]|[0-9])[:分]?(([0-5][0-9]|[0-9])[秒]?)?)?)"
30 |
31 | // RegexEnPublishDate1 is English date format 1, e.g. 02 Sep 2022 11:40:53 pm
32 | RegexEnPublishDate1 = "(?i)((?:(0[1-9]|[1-2][0-9]|3[0-1]|[1-9])(?:st|nd|rd|th)?)[, ]{0,4}(january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|may|jun|jul|aug|sept?|oct|nov|dec)[, ]{0,4}(20[1-3]\\d{1})([, ]{0,4}([0-9]|[0-1][0-9]|2[0-3]|[1-9])[:]([0-5][0-9]|[0-9])([:]([0-5][0-9]|[0-9]))?([, ]{0,4}(am|pm))?)?)"
33 |
34 | // RegexEnPublishDate2 is English date format 2, e.g. Sep 02 2022 11:40:53 pm
35 | RegexEnPublishDate2 = "(?i)((january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|may|jun|jul|aug|sept?|oct|nov|dec)[, ]{0,4}(?:(0[1-9]|[1-2][0-9]|3[0-1]|[1-9])(?:st|nd|rd|th)?)[, ]{0,4}(20[1-3]\\d{1})([, ]{0,4}([0-9]|[0-1][0-9]|2[0-3]|[1-9])[:]([0-5][0-9]|[0-9])([:]([0-5][0-9]|[0-9]))?([, ]{0,4}(am|pm))?)?)"
36 |
37 | // RegexEnUsPublishDate is US-style English date format 3, e.g. 8/30/2022 11:11:11
38 | RegexEnUsPublishDate = "((0[1-9]|1[0-2]|[1-9])[-/.](0[1-9]|[1-2][0-9]|3[0-1]|[1-9])[-/.](20[1-3]\\d{1}|[1-3]\\d{1})[ ]{0,3}(([0-9]|[0-1][0-9]|2[0-3]|[1-9])[:]([0-5][0-9]|[0-9])[:]?(([0-5][0-9]|[0-9]))?)?)"
39 |
40 | // RegexTime is the time-of-day-only pattern
41 | RegexTime = "([0-9]|[0-1][0-9]|2[0-3]|[1-9])[:点时]([0-5][0-9]|[0-9])[:分]?(([0-5][0-9]|[0-9])[秒]?)?"
42 |
43 | // RegexZhPublishPrefix is the Chinese publish-time prefix pattern
44 | RegexZhPublishPrefix = "(?i)(发布|创建|出版|发表|编辑)?(时间|日期)"
45 |
46 | // RegexZhPublishDate is the fixed Chinese form, e.g. "发布时间: xxx" (publish time: xxx)
47 | RegexZhPublishDate = RegexZhPublishPrefix + "[\\pP ]{1,8}" + RegexPublishShortDate
48 |
49 | // RegexScriptTitle matches a title inside a script block
50 | RegexScriptTitle = `(?i)"title"[\t ]{0,4}:[\t ]{0,4}"(.*)"`
51 |
52 | // RegexScriptTime matches a publish time inside a script block
53 | RegexScriptTime = `(?i)"[\w_\-]*pub.*"[\t ]{0,4}:[\t ]{0,4}"(((20[1-3]\d{1})[-/年.])(0[1-9]|1[0-2]|[1-9])[-/月.](0[1-9]|[1-2][0-9]|3[0-1]|[1-9])[日Tt]?[ ]{0,3}(([0-9]|[0-1][0-9]|2[0-3]|[1-9])[:点时]([0-5][0-9]|[0-9])[:分]?(([0-5][0-9]|[0-9])[秒]?)?((\.\d{3})?)(z|Z|[\+-]\d{2}[:]?\d{2})?))"`
54 |
55 | // RegexWxScriptTime matches a publish time inside a WeChat script block
56 | RegexWxScriptTime = `(?i)ct[\t ]{0,4}=[\t ]{0,4}"(1[2-9]\d{8})"`
57 |
58 | // RegexContentUrlPublishDate matches a date hidden in a content-page URL; it must be a complete, standard date such as 20221003
59 | RegexContentUrlPublishDate = `(20[2-3]\d{1}[/]?(0[1-9]|1[0-2])[/]?(0[1-9]|[1-2][0-9]|3[0-1]))`
60 |
61 | // RegexFormatTime3 is a malformed time format, used for filtering
62 | RegexFormatTime3 = `[:分]\d{3}$`
63 |
64 | // RegexFormatTime4 is a malformed time format, used for filtering
65 | RegexFormatTime4 = `[:分]\d{4}$`
66 |
67 | // RegexZone matches a trailing time-zone offset, used for filtering
68 | RegexZone = `(([\+-]\d{2})[:]?\d{2})$`
69 |
70 | // TitleSimZh is the title-similarity threshold for Chinese
71 | TitleSimZh = 0.3
72 |
73 | // TitleSimWord is the title-similarity threshold for word-based languages
74 | TitleSimWord = 0.5
75 | )
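// Hypothetical illustration, not part of the original file; the sample strings
// are assumptions. It sketches what the main datetime patterns are expected to
// match:
//
//	regexp.MustCompile(RegexPublishDate).FindString("发布时间: 2022年9月2日 11:40:53")
//	// -> "2022年9月2日 11:40:53"
//	regexp.MustCompile(RegexEnPublishDate1).FindString("Posted on 02 Sep 2022 11:40 pm")
//	// -> "02 Sep 2022 11:40 pm"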
76 |
77 | var (
78 | contentMetaTitleSelectors = []string{
79 | "meta[property='og:title' i]",
80 | "meta[property='twitter:title' i]",
81 | "meta[name='twitter:title' i]",
82 | }
83 |
84 | contentMetaDatetimeDicts = []string{"publish", "pubdate", "pubtime", "release", "dctermsdate"}
85 |
86 | regexPublishDatePattern = regexp.MustCompile(RegexPublishDate)
87 |
88 | regexPublishShortDatePattern = regexp.MustCompile(RegexPublishShortDate)
89 |
90 | regexPublishDateNoYearPattern = regexp.MustCompile(RegexPublishDateNoYear)
91 |
92 | regexZhPublishDatePattern = regexp.MustCompile(RegexZhPublishDate)
93 |
94 | regexEnPublishDatePattern1 = regexp.MustCompile(RegexEnPublishDate1)
95 |
96 | regexEnPublishDatePattern2 = regexp.MustCompile(RegexEnPublishDate2)
97 |
98 | regexEnUsPublishDatePattern = regexp.MustCompile(RegexEnUsPublishDate)
99 |
100 | regexTimePattern = regexp.MustCompile(RegexTime)
101 |
102 | regexScriptTitlePattern = regexp.MustCompile(RegexScriptTitle)
103 |
104 | regexScriptTimePattern = regexp.MustCompile(RegexScriptTime)
105 |
106 | regexWxScriptTimePattern = regexp.MustCompile(RegexWxScriptTime)
107 |
108 | regexContentUrlPublishDatePattern = regexp.MustCompile(RegexContentUrlPublishDate)
109 |
110 | regexFormatTime3 = regexp.MustCompile(RegexFormatTime3)
111 |
112 | regexFormatTime4 = regexp.MustCompile(RegexFormatTime4)
113 |
114 | regexZonePattern = regexp.MustCompile(RegexZone)
115 | )
116 |
117 | type News struct {
118 | // Title
119 | Title string
120 | // How the title was extracted
121 | TitlePos string
122 | // Publish time (local)
123 | TimeLocal string
124 | // Raw time as found
125 | Time string
126 | // How the publish time was extracted
127 | TimePos string
128 | // Plain-text content
129 | Content string
130 | // Content node
131 | ContentNode *html.Node
132 | // Extraction time spent (milliseconds)
133 | Spend int64
134 | // Language
135 | Lang string
136 | }
137 |
138 | type Content struct {
139 | // Original Doc
140 | OriginDoc *goquery.Document
141 | // Doc
142 | Doc *goquery.Document
143 | // Original title, from the parent page
144 | OriginTitle string
145 | // Original URL, from the parent page
146 | OriginUrl string
147 | // Language
148 | Lang string
149 |
150 | infoMap map[*html.Node]countInfo
151 | bodyNode *html.Node
152 | title string
153 | titlePos string
154 | titleSim float64
155 | timePos string
156 | timeEnFormat bool
157 | }
158 |
159 | type countInfo struct {
160 | // Text length, e.g. the length of a tag's text
161 | TextCount int
162 | // Length of text inside links, e.g. the text of <a> tags
163 | LinkTextCount int
164 | // Tag count
165 | TagCount int
166 | // Count of tags containing links
167 | LinkTagCount int
168 | // Density
169 | Density float64
170 | // Density sum
171 | DensitySum float64
172 | // <p> tag count
173 | PCount int
174 | // Leaf list
175 | LeafList []int
176 | }
177 |
178 | func NewContent(docOrg *goquery.Document, lang string, originTitle string, originUrl string) *Content {
179 | originDoc := goquery.CloneDocument(docOrg)
180 | doc := goquery.CloneDocument(docOrg)
181 | doc.Find(ContentRemoveTags).Remove()
182 |
183 | // Determine the title-similarity threshold
184 | titleSim := TitleSimZh
185 | if fun.SliceContains(wordLangs, lang) {
186 | titleSim = TitleSimWord
187 | }
188 |
189 | infoMap := make(map[*html.Node]countInfo, 0)
190 |
191 | return &Content{OriginDoc: originDoc, Doc: doc, OriginTitle: originTitle, OriginUrl: originUrl, Lang: lang, infoMap: infoMap, titleSim: titleSim}
192 | }
193 |
194 | func (c *Content) ExtractNews() *News {
195 | news := &News{}
196 |
197 | // Start time
198 | begin := fun.Timestamp(true)
199 |
200 | // Extract the content node and content text
201 | contentNode := c.getContentNode()
202 | if contentNode != nil {
203 | news.ContentNode = contentNode
204 |
205 | content := c.formatContent(contentNode)
206 | news.Content = content
207 | }
208 |
209 | // Extract the title
210 | title := c.getTitle(contentNode)
211 | news.Title = title
212 | news.TitlePos = c.titlePos
213 | c.title = title
214 |
215 | // Extract the publish time
216 | time := c.getTime()
217 | if time != "" {
218 | // Format the time
219 | news.Time = time
220 | news.TimePos = c.timePos
221 | time = c.formatTime(time)
222 | ts := fun.StrToTime(time)
223 | if ts > 0 {
224 | news.TimeLocal = fun.Date(ts)
225 | }
226 | }
227 |
228 | news.Spend = fun.Timestamp(true) - begin
229 | news.Lang = c.Lang
230 |
231 | return news
232 | }
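// extractNewsExample is a hypothetical usage sketch, not part of the original
// file; the htmlStr argument and the "zh" language hint are assumptions.
func extractNewsExample(htmlStr string) (*News, error) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlStr))
	if err != nil {
		return nil, err
	}
	c := NewContent(doc, "zh", "", "")
	// The returned News carries the title, publish time, plain-text content
	// and the extraction positions recorded above.
	return c.ExtractNews(), nil
}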
233 |
234 | // formatTime cleans and normalizes the time string (as best it can)
235 | func (c *Content) formatTime(time string) string {
236 | if !c.timeEnFormat {
237 | // When time-zone info is present, strip spaces
238 | if fun.ContainsAny(time, "T", "t", "Z", "z") {
239 | time = strings.ReplaceAll(time, " ", "")
240 | }
241 | // When a T separator is present but there is no offset, treat it as local time
242 | if fun.Contains(time, "T") && !fun.ContainsCase(time, "z") {
243 | if !regexZonePattern.MatchString(time) {
244 | time = strings.ReplaceAll(time, "T", " ")
245 | }
246 | }
247 | }
248 |
249 | // Handle a malformed trailing character
250 | if fun.Contains(time, ":") && !fun.ContainsAny(time, "时", "点") {
251 | time = strings.TrimSuffix(time, "分")
252 | }
253 | return time
254 | }
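// Hypothetical illustration, not part of the original file; the inputs are
// assumptions and c is a Content whose time did not come from an English meta
// pattern (timeEnFormat is false):
//
//	c.formatTime("2022-09-02T11:40:53")       // "2022-09-02 11:40:53" (T without an offset is treated as local time)
//	c.formatTime("2022-09-02T11:40:53+08:00") // unchanged, the offset is kept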
255 |
256 | // formatContent formats the content text: it turns <br> tags into line breaks, then collapses repeated newlines and spaces into single ones
257 | func (c *Content) formatContent(contentNode *html.Node) string {
258 | // First extract the HTML
259 | node := goquery.NewDocumentFromNode(contentNode)
260 | contentHtml, _ := node.Html()
261 |
262 | // Append a newline \n after each <br/> tag
263 | contentHtml = strings.ReplaceAll(contentHtml, "<br/>", "<br/>\n")
264 | n, _ := goquery.NewDocumentFromReader(strings.NewReader(contentHtml))
265 | str := n.Text()
266 |
267 | // Finally, merge redundant newlines
268 | lines := fun.SplitTrim(str, fun.LF)
269 | if len(lines) > 0 {
270 | for i, line := range lines {
271 | lines[i] = fun.NormaliseSpace(line)
272 | }
273 | str = strings.Join(lines, fun.LF)
274 | } else {
275 | str = fun.NormaliseSpace(str)
276 | }
277 |
278 | return str
279 | }
280 |
281 | func (c *Content) getContentNode() *html.Node {
282 | var maxScore float64
283 | var contentNode *html.Node
284 |
285 | // Take the first body tag
286 | bodyNodes := c.Doc.Find("body").Nodes
287 | if len(bodyNodes) > 0 {
288 | bodyNode := bodyNodes[0]
289 | c.bodyNode = bodyNode
290 |
291 | // Recursively walk and compute statistics, then pick the node with the highest score
292 | c.computeInfo(c.bodyNode)
293 |
294 | for node := range c.infoMap {
295 | if node.Data == "a" || node == bodyNode {
296 | continue
297 | }
298 |
299 | score := c.computeScore(node)
300 | if score > maxScore {
301 | maxScore = score
302 | contentNode = node
303 | }
304 | }
305 | }
306 |
307 | return contentNode
308 | }
309 |
310 | func (c *Content) getTime() string {
311 | // meta
312 | regexZhPatterns := []*regexp.Regexp{
313 | regexPublishDatePattern,
314 | }
315 | metaZhTime := c.getTimeByMeta(regexZhPatterns)
316 | if metaZhTime != "" {
317 | c.timePos = "meta"
318 | return metaZhTime
319 | }
320 |
321 | // meta En
322 | if c.Lang != "zh" {
323 | regexEnPatterns := []*regexp.Regexp{
324 | regexEnPublishDatePattern1,
325 | regexEnPublishDatePattern2,
326 | }
327 | metaEnTime := c.getTimeByMetaEn(regexEnPatterns)
328 | if metaEnTime != "" {
329 | c.timePos = "meta"
330 | c.timeEnFormat = true
331 | return metaEnTime
332 | }
333 | }
334 |
335 | //