├── spider ├── config │ ├── entry.txt │ ├── limitedtothis.txt │ └── filterext.txt ├── spider.conf ├── README.md └── spider_main.go ├── .gitignore ├── local-file-search ├── skipdir.txt ├── README.md ├── search.go └── scan.go ├── analyze ├── extractutil │ ├── test.html │ ├── test_body.txt │ ├── test_title.txt │ ├── README.md │ ├── extractutil_test.go │ └── extractutil.go ├── analyze.conf ├── README.md ├── iconv_main.go └── analyze_main.go ├── indexing ├── doc-mark │ ├── docmark.conf │ ├── README.md │ └── doc_mark_main.go ├── calculate-idf │ ├── idf.conf │ ├── README.md │ └── calculate_idf_main.go ├── calculate-tf-idf │ ├── tfidf.conf │ ├── README.md │ └── calculate_tf_idf_main.go ├── participleutil │ ├── README.md │ ├── participleutil_test.go │ ├── participleutil.go │ ├── trie │ │ ├── trie_test.go │ │ └── trie.go │ ├── datrie │ │ ├── datrie_test.go │ │ └── datrie.go │ └── participle │ │ ├── participle_test.go │ │ └── participle.go └── README.md ├── web ├── views │ └── html │ │ ├── index.html │ │ ├── pagination.html │ │ └── search.html ├── utils │ └── RespondUtil.go ├── models │ ├── dto │ │ └── ResultPageDTO.go │ └── dao │ │ └── ResultPageDAO.go ├── main.go └── controllers │ └── Controller.go ├── search ├── search.conf ├── README.md └── search.go ├── README.md └── code_count.go /spider/config/entry.txt: -------------------------------------------------------------------------------- 1 | http://www.yixieshi.com/ -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.log 2 | *.exe 3 | *.db 4 | sqliteadmin/ -------------------------------------------------------------------------------- /spider/config/limitedtothis.txt: -------------------------------------------------------------------------------- 1 | http://www.yixieshi.com/ -------------------------------------------------------------------------------- /local-file-search/skipdir.txt: -------------------------------------------------------------------------------- 1 | C:\Windows 2 | C:\Program Files -------------------------------------------------------------------------------- /analyze/extractutil/test.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gansidui/gose/HEAD/analyze/extractutil/test.html -------------------------------------------------------------------------------- /indexing/doc-mark/docmark.conf: -------------------------------------------------------------------------------- 1 | { 2 | "ExtractUrlDataPath": "D:/SearchEngine/extract/extracturldata.db" 3 | } -------------------------------------------------------------------------------- /analyze/extractutil/test_body.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gansidui/gose/HEAD/analyze/extractutil/test_body.txt -------------------------------------------------------------------------------- /analyze/extractutil/test_title.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gansidui/gose/HEAD/analyze/extractutil/test_title.txt -------------------------------------------------------------------------------- /analyze/extractutil/README.md: -------------------------------------------------------------------------------- 1 | 正文提取包。 2 | 3 | 传入一个string类型的参数,然后返回 标题(title) 和 正文(body)的string类型, 此时的正文(body)中包含了标题(title) 。 
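
A minimal usage sketch of this package, based on the ExtractTitle and ExtractBody functions defined in extractutil.go; the input file name test.html and the println output are illustrative only, not part of the package:

    package main

    import (
        "fmt"
        "io/ioutil"
        "log"

        "github.com/gansidui/gose/analyze/extractutil"
    )

    func main() {
        // Read a downloaded web page and extract its title and body text.
        raw, err := ioutil.ReadFile("./test.html")
        if err != nil {
            log.Fatal(err)
        }
        title := extractutil.ExtractTitle(string(raw))
        body := extractutil.ExtractBody(string(raw)) // the extracted body still contains the title text
        fmt.Println("title:", title)
        fmt.Println("body:", body)
    }
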
-------------------------------------------------------------------------------- /indexing/calculate-idf/idf.conf: -------------------------------------------------------------------------------- 1 | { 2 | "ExtractWebpagePath": "D:/SearchEngine/extract/webpage/", 3 | "ExtractUrlDataPath": "D:/SearchEngine/extract/extracturldata.db" 4 | } -------------------------------------------------------------------------------- /indexing/calculate-tf-idf/tfidf.conf: -------------------------------------------------------------------------------- 1 | { 2 | "ExtractWebpagePath": "D:/SearchEngine/extract/webpage/", 3 | "ExtractUrlDataPath": "D:/SearchEngine/extract/extracturldata.db" 4 | } -------------------------------------------------------------------------------- /analyze/analyze.conf: -------------------------------------------------------------------------------- 1 | { 2 | "DownUrlDataPath": "D:/SearchEngine/down/downurldata.db", 3 | "ExtractWebpagePath": "D:/SearchEngine/extract/webpage/", 4 | "ExtractUrlDataPath": "D:/SearchEngine/extract/extracturldata.db" 5 | } -------------------------------------------------------------------------------- /indexing/participleutil/README.md: -------------------------------------------------------------------------------- 1 | trie是字典树 2 | 3 | datrie是双数组字典树(double array trie) 4 | 5 | participle是辅助包, 采用了 datrie 6 | 7 | mydic.txt 为词库 8 | 9 | 采用双向最大匹配 10 | 11 | mydic.txt中有 224076 个中文词汇, 74739个英语单词, 总共 298815 个。 -------------------------------------------------------------------------------- /web/views/html/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Gopher Search 4 | 5 | 6 |
7 | 8 | 9 |
10 | 11 | -------------------------------------------------------------------------------- /local-file-search/README.md: -------------------------------------------------------------------------------- 1 | 2 | scan.go 编译生成 scan.exe,该程序首先从skipdir.txt中读取过滤掉的目录,每条目录占一行。 3 | 4 | 得到的文件信息存放在localfile.db(sqlite3数据库)中,localfile.db中只有一张表 info。 5 | 6 | 每次运行scan.exe程序都将删除info,然后重新构造 info 。 7 | 8 | info 表的内容为: 9 | 10 | 文件名,路径 11 | name path 12 | 13 | 14 | -------------------------------------------------------------------------------- /indexing/README.md: -------------------------------------------------------------------------------- 1 | participleutil是分词包 2 | 3 | calculate-idf 计算每个词的逆文档频率 【可以根据某个标准语料库计算出来的,这样就无须频繁更新】 4 | 5 | doc-mark 对文档进行标号 6 | 7 | calculate-tf-idf 计算TF-IDF 并建立倒排索引 8 | 9 | 10 | 11 | 执行顺序: 12 | 13 | (go run calculate_idf_main.go 平时不需要执行) 14 | 15 | go run doc_mark_main.go 16 | 17 | go run calculate_tf_idf_main.go 18 | 19 | -------------------------------------------------------------------------------- /indexing/doc-mark/README.md: -------------------------------------------------------------------------------- 1 | 对文档进行标号 2 | 3 | 文档是指那些已经被分析过了的文档,即已经抽取了正文和标题的文档, 在配置文件中指定了路径 4 | 5 | { 6 | "ExtractUrlDataPath": "D:/SearchEngine/extract/extracturldata.db" 7 | } 8 | 9 | 然后将标号的结果生成sqlite数据库保存,名为 docmark.db, 表为data 10 | 字段为: 11 | md5 id 12 | 13 | 其中 md5 为主键,这样保证标号唯一,每次标号都先计算数据库中最大的标号值,然后再不断递增标号。 14 | 15 | create table data(md5 varchar(32) not null primary key, id integer not null) -------------------------------------------------------------------------------- /indexing/participleutil/participleutil_test.go: -------------------------------------------------------------------------------- 1 | package participleutil 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | ) 7 | 8 | func TestParticiple(t *testing.T) { 9 | 10 | LoadDic("./mydic.txt") 11 | 12 | ss := Participle("学历史学好") 13 | for _, v := range ss { 14 | fmt.Printf("%s/", v) 15 | } 16 | fmt.Println() 17 | 18 | ss = Participle("搜噶,我爱豆豆猪") 19 | for _, v := range ss { 20 | fmt.Printf("%s/", v) 21 | } 22 | fmt.Println() 23 | } 24 | -------------------------------------------------------------------------------- /spider/config/filterext.txt: -------------------------------------------------------------------------------- 1 | .zip 2 | .arj 3 | .rar 4 | .lzh 5 | .jar 6 | .exe 7 | .dll 8 | .cab 9 | .apk 10 | .app 11 | .cbx 12 | .dbf 13 | .fky 14 | .fpt 15 | .fxp 16 | .mnx 17 | .pcb 18 | .prg 19 | .qpr 20 | .tbk 21 | .jpg 22 | .png 23 | .gif 24 | .css 25 | .ico 26 | .avi 27 | .awd 28 | .bak 29 | .bmp 30 | .doc 31 | .docx 32 | .js 33 | .mp3 34 | .mp4 35 | .mpeg 36 | .mpg 37 | .njx 38 | .pdf 39 | .ppt 40 | .lnk 41 | .xml 42 | .swf 43 | .rmvb 44 | .mkv -------------------------------------------------------------------------------- /search/search.conf: -------------------------------------------------------------------------------- 1 | { 2 | "DownUrlDataPath": "D:/SearchEngine/down/downurldata.db", 3 | "ExtractWebpagePath": "D:/SearchEngine/extract/webpage/", 4 | "ExtractUrlDataPath": "D:/SearchEngine/extract/extracturldata.db", 5 | "DicPath": "D:/golib/src/github.com/gansidui/gose/indexing/participleutil/mydic.txt", 6 | "TfIdfPath": "D:/golib/src/github.com/gansidui/gose/indexing/calculate-tf-idf/tfidf.db", 7 | "DocMarkPath": "D:/golib/src/github.com/gansidui/gose/indexing/doc-mark/docmark.db" 8 | } -------------------------------------------------------------------------------- /analyze/extractutil/extractutil_test.go: 
-------------------------------------------------------------------------------- 1 | package extractutil 2 | 3 | import ( 4 | "io/ioutil" 5 | "log" 6 | "os" 7 | "testing" 8 | ) 9 | 10 | func TestExtract(t *testing.T) { 11 | content, err := ioutil.ReadFile("./test.html") 12 | if err != nil { 13 | log.Fatal(err) 14 | } 15 | 16 | title := ExtractTitle(string(content)) 17 | ioutil.WriteFile("./test_title.txt", []byte(title), os.ModePerm) 18 | 19 | body := ExtractBody(string(content)) 20 | ioutil.WriteFile("./test_body.txt", []byte(body), os.ModePerm) 21 | } 22 | -------------------------------------------------------------------------------- /spider/spider.conf: -------------------------------------------------------------------------------- 1 | { 2 | "EntryPath": "D:/golib/src/github.com/gansidui/gose/spider/config/entry.txt", 3 | "UrlQueuePath": "D:/golib/src/github.com/gansidui/gose/spider/config/urlqueue.db", 4 | "FilterExtPath": "D:/golib/src/github.com/gansidui/gose/spider/config/filterext.txt", 5 | "LimitedToThisPath": "D:/golib/src/github.com/gansidui/gose/spider/config/limitedtothis.txt", 6 | "DownWebpagePath": "D:/SearchEngine/down/webpage/", 7 | "DownUrlDataPath": "D:/SearchEngine/down/downurldata.db", 8 | "MaxNum": 10000, 9 | "IntervalTime": "100ms" 10 | } -------------------------------------------------------------------------------- /web/views/html/pagination.html: -------------------------------------------------------------------------------- 1 | {{define "pagination"}} 2 | 24 | {{end}} -------------------------------------------------------------------------------- /web/views/html/search.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Gopher Search 4 | 5 | 6 |
7 | 8 | {{with .Pagination}} 9 | 10 | {{end}} 11 | 12 | 13 | 14 | Used:{{.UsedTime}} Total:{{.ResultTotal}} 15 | 16 |
17 | 18 | {{with .Articles}} 19 | {{range .}} 20 |
21 |

{{.Title}}

22 |
{{.Url}}
23 |

{{.Summary}}

24 | {{end}} 25 | {{end}} 26 | 27 | {{template "pagination" .Pagination}} 28 | 29 | 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | A simple search engine in the golang 2 | 3 | ----------------------------------------------------------------------- 4 | 5 | 使用说明: 6 | 7 | 首先在 D 盘下建立文件夹 SearchEngine, 当然在其他路径下也可以,但必须得更改各个配置文件。 8 | 9 | 10 | 11 | 爬虫(spider): 12 | 根据说明配置好爬虫, go run spider_main.go , 下载的网页数据保存在 D:/SearchEngine/down/ 下。 13 | 14 | 15 | 分析(analyze): 16 | go run analyze_main.go , 将爬取下来的网页提取出正文和标题, 提取出来的数据保存在 D:/SearchEngine/extract/ 下。 17 | 18 | 19 | 索引(indexing): 20 | 根据 indexing 的说明建立倒排索引 21 | 22 | 查找(search): 23 | 根据搜索串查找文档,用于web中的models模块。 24 | 25 | 显示(web): 26 | 搜索引擎的界面,展示搜索结果。 go run main.go 启动服务器。 27 | 28 | 29 | 30 | 31 | local-file-search 是本地磁盘文件搜索 32 | 33 | sqliteadmin 是管理sqlite数据库的可视化工具 -------------------------------------------------------------------------------- /analyze/README.md: -------------------------------------------------------------------------------- 1 | 将爬虫爬取的网页进行分析,抽取出标题和正文 2 | 3 | { 4 | "DownUrlDataPath": "D:/SearchEngine/down/downurldata.db", 5 | "ExtractWebpagePath": "D:/SearchEngine/extract/webpage/", 6 | "ExtractUrlDataPath": "D:/SearchEngine/extract/extracturldata.db" 7 | } 8 | 9 | 假设网页被爬虫下载来后存的文件名为 xxx.html, 那么抽取后的标题和正文存在 ExtractWebpagePath 目录下, 10 | 11 | 那么存储标题的文件名为 xxx_title.txt, 存储正文的文件名为 xxx_body.txt 。实际上 xxx 为原网页url的md5值。 12 | 13 | DownUrlDataPath 这个存储爬虫下载的网页信息。 14 | 15 | ExtractUrlDataPath 这个sqlite数据库只有一个表(data), 且只有一个字段为 md5, 标记该网页已经分析过了。 16 | 17 | 18 | ----------------------------------------------------------------------- 19 | 提取后的文件的编码可能是GB2312编码格式, (在简体中文系统下,ansi 编码代表 GB2312 编码) 20 | 所以需要iconv_main.go将其转换为utf-8格式。 -------------------------------------------------------------------------------- /indexing/participleutil/participleutil.go: -------------------------------------------------------------------------------- 1 | package participleutil 2 | 3 | import ( 4 | "bufio" 5 | "github.com/gansidui/gose/indexing/participleutil/participle" 6 | "os" 7 | ) 8 | 9 | var p *participle.Participle 10 | 11 | func init() { 12 | p = participle.NewParticiple() 13 | } 14 | 15 | // 加载词库 16 | func LoadDic(dicPath string) { 17 | file, err := os.Open(dicPath) 18 | if err != nil { 19 | os.Exit(1) 20 | } 21 | defer file.Close() 22 | 23 | re := bufio.NewReader(file) 24 | 25 | for { 26 | line, _, err := re.ReadLine() 27 | if err != nil { 28 | break 29 | } 30 | p.Insert(string(line)) 31 | } 32 | } 33 | 34 | // 分词 35 | func Participle(src string) []string { 36 | return p.BidirectionalMatch(src) 37 | } 38 | -------------------------------------------------------------------------------- /indexing/calculate-idf/README.md: -------------------------------------------------------------------------------- 1 | 计算每个词的逆文档频率(Inverse Document Frequency,缩写为IDF): 2 | 3 | 逆文档频率(IDF) = log10( 语料库的文档总数 / (包含该词的文档数) +1 ) 4 | 5 | 6 | 如果一个词越常见,那么分母就越大,逆文档频率就越小越接近0。分母之所以要加1,是为了避免分母为0(即所有文档都不包含该词)。 7 | 8 | log10 表示对得到的值取以10为底的对数。 9 | 10 | 11 | ./idf.conf 配置文件: 12 | 13 | { 14 | "ExtractWebpagePath": "D:/SearchEngine/extract/webpage/", 15 | "ExtractUrlDataPath": "D:/SearchEngine/extract/extracturldata.db" 16 | } 17 | 18 | ExtractWebpagePath 中的文件名以 _body.txt 结尾的文档是抽取出来的网页正文,可以看做成一个语料库。 19 | ExtractUrlDataPath 是将文档抽取了正文生成 _body.txt 的数据库。 20 | 21 | 22 | 23 | 然后将得到的逆文档频率信息以如下形式保存到sqlite3数据库中,数据库名为 idf.db , 只有一个表为 data 。 24 | 同时也将这个数据保存到文本文件, idf.txt ,方便随时查看。 25 
| 26 | word idf 27 | 28 | 例如: 29 | 30 | 中国 0.603 31 | 蜜蜂 2.713 32 | 养殖 2.410 33 | -------------------------------------------------------------------------------- /search/README.md: -------------------------------------------------------------------------------- 1 | { 2 | "DownUrlDataPath": "D:/SearchEngine/down/downurldata.db", 3 | "ExtractWebpagePath": "D:/SearchEngine/extract/webpage/", 4 | "ExtractUrlDataPath": "D:/SearchEngine/extract/extracturldata.db", 5 | "DicPath": "D:/golib/src/github.com/gansidui/gose/indexing/participleutil/mydic.txt", 6 | "TfIdfPath": "D:/golib/src/github.com/gansidui/gose/indexing/calculate-tf-idf/tfidf.db", 7 | "DocMarkPath": "D:/golib/src/github.com/gansidui/gose/indexing/doc-mark/docmark.db" 8 | } 9 | 10 | 这个包是给web中的models模块使用的,配置文件用的全部是绝对路径。 11 | 12 | 搜索的原理: 13 | 14 | 初始化时读取 15 | word --> id, tfidf 16 | id --> md5 17 | md5 --> url, path 18 | 19 | 输入一个搜索串,将其按空格拆分成多个句子,然后分别对这些句子进行搜索, 20 | 21 | 对每个句子进行搜索:将句子进行分词得到多个关键词,通过关键词的索引得到文档id集合,对这些id集合取并集。 22 | 23 | 然后将各个句子搜索的结果取交集。(这个符合用户的搜索习惯) -------------------------------------------------------------------------------- /web/utils/RespondUtil.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "html/template" 5 | "io" 6 | "log" 7 | "net/http" 8 | ) 9 | 10 | func Respond(w http.ResponseWriter, status int, html string) { 11 | w.WriteHeader(status) 12 | io.WriteString(w, html) 13 | w.Header().Set("Content-Type", "text/html") 14 | w.Header().Set("Content-Length", string(len(html))) 15 | } 16 | 17 | func RespondNotFound(w http.ResponseWriter) { 18 | Respond(w, http.StatusNotFound, "

Page Not Found") 19 | } 20 | 21 | func RespondServerError(w http.ResponseWriter) { 22 | Respond(w, http.StatusInternalServerError, "服务器内部错误
") 23 | } 24 | 25 | func RespondTemplate(w http.ResponseWriter, status int, templateFile string, data interface{}) { 26 | w.WriteHeader(status) 27 | t, err := template.ParseFiles(templateFile) 28 | if err != nil { 29 | log.Fatal(err) 30 | } 31 | t.Execute(w, data) 32 | } 33 | -------------------------------------------------------------------------------- /code_count.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "log" 7 | "os" 8 | "path/filepath" 9 | ) 10 | 11 | var numCodeRows int 12 | 13 | func getRowsFromFile(path string) int { 14 | file, err := os.Open(path) 15 | if err != nil { 16 | log.Fatal(err) 17 | } 18 | defer file.Close() 19 | 20 | num := 0 21 | re := bufio.NewReader(file) 22 | for { 23 | _, _, err = re.ReadLine() 24 | if err != nil { 25 | break 26 | } 27 | num++ 28 | } 29 | fmt.Println(path, "------------", num) 30 | return num 31 | } 32 | 33 | func WalkFunc(path string, info os.FileInfo, err error) error { 34 | if path == "calculate_code_rows.go" { 35 | return nil 36 | } 37 | if filepath.Ext(path) == ".go" { 38 | numCodeRows += getRowsFromFile(path) 39 | } 40 | return nil 41 | } 42 | 43 | func main() { 44 | numCodeRows = 0 45 | filepath.Walk("./", WalkFunc) 46 | fmt.Println("total:", numCodeRows) 47 | } 48 | -------------------------------------------------------------------------------- /web/models/dto/ResultPageDTO.go: -------------------------------------------------------------------------------- 1 | package dto 2 | 3 | import ( 4 | "github.com/gansidui/gose/search" 5 | ) 6 | 7 | // 页面链接信息 8 | type ShowPageInfo struct { 9 | QueryString string // 搜索串 10 | Page int // 当前链接的页码 11 | Start int // 当前页的结果起始位置 12 | } 13 | 14 | // 页码信息 15 | type PaginationInfo struct { 16 | PageTotal int // 总共有多少页 17 | PerPageArticlesNum int // 每页多少篇文章 18 | PrevPageStart int // 前一页起始位置 19 | NextPageStart int // 后一页起始位置 20 | HasPrevPage bool // 是否有上一页 21 | ShowPages []*ShowPageInfo // 中间显示哪些页码链接 22 | HasNextPage bool // 是否有下一页 23 | QueryString string // 搜索串 24 | } 25 | 26 | // 文章信息, 在 search 包中 27 | // type ArticleInfo struct { 28 | // Title string // 标题 29 | // Summary string // 摘要 30 | // Url string // 原始url 31 | // Path string // 本地路径 32 | // } 33 | 34 | // 结果页信息 35 | type ResultPageInfo struct { 36 | Articles *[]search.ArticleInfo 37 | Pagination *PaginationInfo 38 | ResultTotal int 39 | UsedTime string 40 | } 41 | -------------------------------------------------------------------------------- /web/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "github.com/gansidui/gose/search" 7 | "github.com/gansidui/gose/web/controllers" 8 | "log" 9 | "net" 10 | "net/http" 11 | ) 12 | 13 | var ( 14 | ip string 15 | port string 16 | ) 17 | 18 | func init() { 19 | defaultIP := "127.0.0.1" 20 | defaultPort := "9090" 21 | 22 | // 获取本机的IP(A global unicast address) 23 | addr, _ := net.InterfaceAddrs() 24 | for _, v := range addr { 25 | IP := net.ParseIP(v.String()) 26 | if IP.IsGlobalUnicast() { 27 | defaultIP = v.String() 28 | break 29 | } 30 | } 31 | 32 | flag.StringVar(&ip, "ip", defaultIP, "ip address") 33 | flag.StringVar(&port, "port", defaultPort, "port number") 34 | flag.Parse() 35 | 36 | log.SetFlags(log.Lshortfile | log.LstdFlags) 37 | search.ReadConfig("../search/search.conf") 38 | search.InitSearch() 39 | } 40 | 41 | func main() { 42 | http.HandleFunc("/", controllers.HomePage) 43 | 
http.HandleFunc("/search", controllers.SearchPage) 44 | 45 | addr := ip + ":" + port 46 | fmt.Println("Listenning:", addr) 47 | err := http.ListenAndServe(addr, nil) 48 | if err != nil { 49 | log.Fatal("ListenAndServe:", err) 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /indexing/calculate-tf-idf/README.md: -------------------------------------------------------------------------------- 1 | 词频 TF(Term Frequency) 2 | 逆文档频率(IDF) = log10( 语料库的文档总数 / (包含该词的文档数) +1 ) 3 | 4 | 5 | 计算词频(TF) 6 | 7 | 词频(TF) = 某个词在文章中的出现次数 8 | 9 | 考虑到文章有长短之分,为了便于不同文章的比较,进行"词频"标准化或归一化处理。 10 | 11 | 词频(TF) = 某个词在文章中的出现次数 / 文章的总词数 12 | 13 | 14 | 15 | 计算 TF-IDF 16 | 17 | 18 | TF-IDF = 词频(TF) * 逆文档频率(IDF) 19 | 20 | 可以看到,TF-IDF与一个词在文档中的出现次数成正比,与该词在整个语言中的出现次数成反比。 21 | 22 | 所以,自动提取关键词的算法就很清楚了,就是计算出文档的每个词的TF-IDF值,然后按降序排列,取排在最前面的几个词。 23 | 24 | 25 | 那么搜索引擎根据关键词搜索文档,只要找到那些针对这个关键词的TF-IDF值最高的文档即可。(多个关键词可以累加TF-IDF值) 26 | 27 | 28 | 对网页库中的每个网页标号为1,2,3,..... 29 | 30 | 对每个网页进行分词,按如下形式存储每个关键词的信息: 31 | 32 | 词1 网页1 TF-IDF值1 网页2 TF-IDF值2 网页3 TF-IDF值3 ...... 33 | 词2 网页1 TF-IDF值1 网页2 TF-IDF值2 网页3 TF-IDF值3 ...... 34 | ...... 35 | 36 | 37 | 将这些信息保存sqlite3数据库中,数据库名为 tfidf.db , 只有一个表为 data(word docid tfidf) 。 38 | 39 | 40 | config.data配置文件的说明参考 calculate-idf 的说明,只是这里不是作为语料库,而是作为网页库。 41 | 42 | 43 | 事实上该过程也得到了倒排索引。 44 | 45 | TF-IDF算法的优点是简单快速,结果比较符合实际情况。缺点是,单纯以"词频"衡量一个词的重要性,不够全面,有时重要的词可能出现次数并不多。 46 | 而且,这种算法无法体现词的位置信息,出现位置靠前的词与出现位置靠后的词,都被视为重要性相同,这是不正确的。 47 | (一种解决方法是,对全文的第一段和每一段的第一句话,给予较大的权重。) 48 | 这里采用的方法是将标题提取出来,标题里面包含的关键词的次数以5倍计算,因为一篇文章的关键内容一般都在标题中体现了出来。 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /web/controllers/Controller.go: -------------------------------------------------------------------------------- 1 | package controllers 2 | 3 | import ( 4 | "github.com/gansidui/gose/web/models/dao" 5 | "github.com/gansidui/gose/web/utils" 6 | "html/template" 7 | "log" 8 | "net/http" 9 | ) 10 | 11 | func HomePage(w http.ResponseWriter, r *http.Request) { 12 | if r.Method == "POST" { 13 | r.ParseForm() 14 | q := r.Form.Get("q") 15 | if q == "" { 16 | utils.RespondTemplate(w, http.StatusOK, "views/html/index.html", nil) 17 | } else { 18 | s := "/search?q=" + q + "&start=0" 19 | http.Redirect(w, r, s, 303) 20 | } 21 | } else { 22 | utils.RespondTemplate(w, http.StatusOK, "views/html/index.html", nil) 23 | } 24 | } 25 | 26 | func SearchPage(w http.ResponseWriter, r *http.Request) { 27 | if r.Method == "POST" { 28 | r.ParseForm() 29 | q := r.Form.Get("q") 30 | s := "/search?q=" + q + "&start=0" 31 | http.Redirect(w, r, s, 303) 32 | } else { 33 | q := r.URL.Query().Get("q") 34 | start := r.URL.Query().Get("start") 35 | num := "10" 36 | resultPage, success := dao.GetResultPageInfo(q, start, num) 37 | if success { 38 | w.WriteHeader(http.StatusOK) 39 | t, err := template.ParseFiles("views/html/search.html", "views/html/pagination.html") 40 | if err != nil { 41 | log.Fatal(err) 42 | } 43 | t.Execute(w, &resultPage) 44 | } else { 45 | utils.RespondNotFound(w) 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /analyze/iconv_main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "code.google.com/p/mahonia" 5 | "encoding/json" 6 | "fmt" 7 | "io/ioutil" 8 | "log" 9 | "os" 10 | "path/filepath" 11 | "time" 12 | ) 13 | 14 | // 配置文件 15 | type Config struct { 16 | DownUrlDataPath string 17 | ExtractWebpagePath 
string 18 | ExtractUrlDataPath string 19 | } 20 | 21 | // 读取配置文件 22 | func NewConfig() *Config { 23 | file, err := ioutil.ReadFile("./analyze.conf") 24 | if err != nil { 25 | log.Fatal(err, "\r\n") 26 | } 27 | 28 | var conf Config 29 | err = json.Unmarshal(file, &conf) 30 | if err != nil { 31 | log.Fatal(err, "\r\n") 32 | } 33 | return &conf 34 | } 35 | 36 | var numFile int 37 | 38 | // 把文件从gb2312编码转换成utf8编码 39 | func walkFunc(path string, info os.FileInfo, err error) error { 40 | file, err := os.Open(path) 41 | checkError(err) 42 | defer file.Close() 43 | 44 | decoder := mahonia.NewDecoder("gb2312") 45 | data, err := ioutil.ReadAll(decoder.NewReader(file)) 46 | ioutil.WriteFile(path, data, os.ModePerm) 47 | 48 | numFile++ 49 | fmt.Println("numFile:", numFile) 50 | 51 | return nil 52 | } 53 | 54 | func main() { 55 | numFile = 0 56 | start := time.Now() 57 | conf := NewConfig() 58 | filepath.Walk(conf.ExtractWebpagePath, walkFunc) 59 | fmt.Println("Used time:", time.Since(start)) 60 | } 61 | 62 | func checkError(err error) { 63 | if err != nil { 64 | log.Fatal(err) 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /local-file-search/search.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "database/sql" 6 | "fmt" 7 | _ "github.com/mattn/go-sqlite3" 8 | "log" 9 | "os" 10 | "time" 11 | ) 12 | 13 | func main() { 14 | log.SetFlags(log.Lshortfile | log.LstdFlags) 15 | 16 | db, err := sql.Open("sqlite3", "./localfile.db") 17 | if err != nil { 18 | log.Fatal(err) 19 | } 20 | defer db.Close() 21 | 22 | scanner := bufio.NewScanner(os.Stdin) 23 | for scanner.Scan() { 24 | s := fmt.Sprintf("%%%s%%", scanner.Text()) // 子串查询 25 | 26 | start := time.Now() 27 | rows, err := db.Query("select * from info where name like ?", s) 28 | if err != nil { 29 | log.Fatal(err) 30 | } 31 | fmt.Println("query used time:", time.Since(start)) 32 | 33 | num := 0 34 | for rows.Next() { 35 | var name string 36 | var path string 37 | rows.Scan(&name, &path) 38 | fmt.Println(name) 39 | fmt.Println(path) 40 | fmt.Println("--------------------------------------------------") 41 | num++ 42 | if num > 5 { 43 | break 44 | } 45 | } 46 | // 必须释放查询结果,不然内存暴涨 47 | rows.Close() 48 | 49 | // start = time.Now() 50 | // rows, err = db.Query("select count(*) from info where name like ?", s) 51 | // if err != nil { 52 | // log.Fatal(err) 53 | // } 54 | // rows.Next() 55 | // rows.Scan(&num) 56 | // rows.Close() 57 | // fmt.Println("total:", num, "used time:", time.Since(start)) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /web/models/dao/ResultPageDAO.go: -------------------------------------------------------------------------------- 1 | package dao 2 | 3 | import ( 4 | "fmt" 5 | "github.com/gansidui/gose/search" 6 | "github.com/gansidui/gose/web/models/dto" 7 | "strconv" 8 | "time" 9 | ) 10 | 11 | // 根据搜索串q, 起始位置start,每页显示数量num 得到结果页信息 12 | func GetResultPageInfo(q, start, num string) (resultPage dto.ResultPageInfo, success bool) { 13 | startInt, err := strconv.Atoi(start) 14 | if err != nil { 15 | return resultPage, false 16 | } 17 | numInt, err := strconv.Atoi(num) 18 | if err != nil { 19 | return resultPage, false 20 | } 21 | 22 | // 搜索,得到Articles 23 | startTime := time.Now() 24 | result, total := search.GetSearchResult(q, startInt, startInt+numInt-1) 25 | 26 | resultPage.Articles = &result 27 | resultPage.ResultTotal = total 28 | resultPage.UsedTime 
= fmt.Sprintf("%v", time.Since(startTime)) 29 | 30 | // 设置Pagination 31 | var pageTotal int = (total-1)/numInt + 1 32 | var curPageInt int = startInt/numInt + 1 33 | var pagination dto.PaginationInfo 34 | pagination.PageTotal = pageTotal 35 | pagination.PerPageArticlesNum = numInt 36 | pagination.PrevPageStart = startInt - numInt 37 | pagination.NextPageStart = startInt + numInt 38 | pagination.HasPrevPage = false 39 | pagination.HasNextPage = false 40 | pagination.QueryString = q 41 | 42 | if curPageInt > 1 { 43 | pagination.HasPrevPage = true 44 | } 45 | if curPageInt < pageTotal { 46 | pagination.HasNextPage = true 47 | } 48 | // 最多显示num个页码链接 49 | for i, p := 0, (curPageInt-1)/numInt*numInt+1; i < numInt && p <= pageTotal; i, p = i+1, p+1 { 50 | pagination.ShowPages = append(pagination.ShowPages, &dto.ShowPageInfo{QueryString: q, Page: p, Start: (p - 1) * numInt}) 51 | } 52 | 53 | resultPage.Pagination = &pagination 54 | 55 | return resultPage, true 56 | } 57 | -------------------------------------------------------------------------------- /indexing/participleutil/trie/trie_test.go: -------------------------------------------------------------------------------- 1 | package trie 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | ) 7 | 8 | func TestFind(t *testing.T) { 9 | 10 | tr := NewTrie() 11 | tr.Insert("ab") 12 | tr.Insert("cd") 13 | tr.Insert("abcd") 14 | tr.Insert("abcde") 15 | 16 | if tr.Num() != 4 { 17 | t.Error("Error") 18 | } 19 | 20 | flag, preWordLastflag := tr.Find("ab") 21 | if !flag { 22 | t.Error("Error") 23 | } 24 | 25 | flag, preWordLastflag = tr.Find("cde") 26 | if flag || preWordLastflag != 2 { 27 | t.Error("Error") 28 | } 29 | 30 | flag, preWordLastflag = tr.Find("abcdf") 31 | if flag || preWordLastflag != 4 { 32 | t.Error("Error") 33 | } 34 | 35 | flag, preWordLastflag = tr.Find("abcg") 36 | if flag || preWordLastflag != 2 { 37 | t.Error("Error") 38 | } 39 | } 40 | 41 | func TestParticiple(t *testing.T) { 42 | 43 | tr := NewTrie() 44 | tr.Insert("我A") 45 | tr.Insert("B是") 46 | tr.Insert("我AB是D") 47 | tr.Insert("我") 48 | 49 | ss := tr.Participle("我A我AB是DB是D我AB") // 我A/我AB是D/B是/D/我A/B, 其中D和B不是词,会被丢掉 50 | for _, v := range ss { 51 | fmt.Printf("%s/", v) 52 | } 53 | fmt.Println() 54 | 55 | ss = tr.Participle("我A我AB是DB是擦擦擦我AB") // 我A/我AB是D/B是/擦/擦/擦/我A/B,其中擦和B不是词,会被丢掉 56 | for _, v := range ss { 57 | fmt.Printf("%s/", v) 58 | } 59 | fmt.Println() 60 | 61 | ss = tr.Participle("D我AB是D") 62 | for _, v := range ss { 63 | fmt.Printf("%s/", v) 64 | } 65 | fmt.Println() 66 | 67 | ss = tr.Participle("D我AB是") 68 | for _, v := range ss { 69 | fmt.Printf("%s/", v) 70 | } 71 | fmt.Println() 72 | 73 | ss = tr.Participle("我A") 74 | for _, v := range ss { 75 | fmt.Printf("%s/", v) 76 | } 77 | fmt.Println() 78 | 79 | ss = tr.Participle("我") 80 | for _, v := range ss { 81 | fmt.Printf("%s/", v) 82 | } 83 | fmt.Println() 84 | } 85 | -------------------------------------------------------------------------------- /indexing/participleutil/datrie/datrie_test.go: -------------------------------------------------------------------------------- 1 | package datrie 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | ) 7 | 8 | func TestFind(t *testing.T) { 9 | 10 | tr := NewDATrie() 11 | tr.Insert("ab") 12 | tr.Insert("cd") 13 | tr.Insert("abcd") 14 | tr.Insert("abcde") 15 | 16 | if tr.Num() != 4 { 17 | t.Error("Error") 18 | } 19 | 20 | flag, preWordLastflag := tr.Find("ab") 21 | if !flag { 22 | t.Error("Error") 23 | } 24 | 25 | flag, preWordLastflag = tr.Find("cde") 26 | if flag || preWordLastflag != 2 { 27 
| t.Error("Error") 28 | } 29 | 30 | flag, preWordLastflag = tr.Find("abcdf") 31 | if flag || preWordLastflag != 4 { 32 | t.Error("Error") 33 | } 34 | 35 | flag, preWordLastflag = tr.Find("abcg") 36 | if flag || preWordLastflag != 2 { 37 | t.Error("Error") 38 | } 39 | } 40 | 41 | func TestParticiple(t *testing.T) { 42 | 43 | tr := NewDATrie() 44 | tr.Insert("我A") 45 | tr.Insert("B是") 46 | tr.Insert("我AB是D") 47 | tr.Insert("我") 48 | 49 | ss := tr.Participle("我A我AB是DB是D我AB") // 我A/我AB是D/B是/D/我A/B, 其中D和B不是词,会被丢掉 50 | for _, v := range ss { 51 | fmt.Printf("%s/", v) 52 | } 53 | fmt.Println() 54 | 55 | ss = tr.Participle("我A我AB是DB是擦擦擦我AB") // 我A/我AB是D/B是/擦/擦/擦/我A/B,其中擦和B不是词,会被丢掉 56 | for _, v := range ss { 57 | fmt.Printf("%s/", v) 58 | } 59 | fmt.Println() 60 | 61 | ss = tr.Participle("D我AB是D") 62 | for _, v := range ss { 63 | fmt.Printf("%s/", v) 64 | } 65 | fmt.Println() 66 | 67 | ss = tr.Participle("D我AB是") 68 | for _, v := range ss { 69 | fmt.Printf("%s/", v) 70 | } 71 | fmt.Println() 72 | 73 | ss = tr.Participle("我A") 74 | for _, v := range ss { 75 | fmt.Printf("%s/", v) 76 | } 77 | fmt.Println() 78 | 79 | ss = tr.Participle("我") 80 | for _, v := range ss { 81 | fmt.Printf("%s/", v) 82 | } 83 | fmt.Println() 84 | } 85 | -------------------------------------------------------------------------------- /indexing/participleutil/participle/participle_test.go: -------------------------------------------------------------------------------- 1 | package participle 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | ) 7 | 8 | func TestFind(t *testing.T) { 9 | p := NewParticiple() 10 | p.Insert("ab") 11 | p.Insert("你好啊") 12 | p.Insert("") 13 | 14 | if p.Num() != 3 { 15 | t.Error("Error") 16 | } 17 | 18 | if p.ForwardFind("cc") || p.BackwardFind("cc") { 19 | t.Error("Error") 20 | } 21 | 22 | if !p.ForwardFind("ab") || !p.BackwardFind("ab") { 23 | t.Error("Error") 24 | } 25 | 26 | if !p.ForwardFind("你好啊") || !p.BackwardFind("你好啊") { 27 | t.Error("Error") 28 | } 29 | 30 | if !p.ForwardFind("") || !p.BackwardFind("") { 31 | t.Error("Error") 32 | } 33 | } 34 | 35 | func TestForwardMaxMatch(t *testing.T) { 36 | p := NewParticiple() 37 | p.Insert("我A") 38 | p.Insert("B是") 39 | p.Insert("我AB是D") 40 | 41 | ss := p.ForwardMaxMatch("我A我AB是DB是D我AB") 42 | for _, v := range ss { 43 | fmt.Printf("%s/", v) 44 | } 45 | fmt.Println() 46 | 47 | ss = p.ForwardMaxMatch("D我AB是") 48 | for _, v := range ss { 49 | fmt.Printf("%s/", v) 50 | } 51 | fmt.Println() 52 | 53 | p.Insert("学") 54 | p.Insert("历") 55 | p.Insert("史") 56 | p.Insert("学") 57 | p.Insert("好") 58 | p.Insert("学历") 59 | p.Insert("历史") 60 | p.Insert("史学") 61 | p.Insert("学好") 62 | 63 | ss = p.ForwardMaxMatch("学历史学好") 64 | for _, v := range ss { 65 | fmt.Printf("%s/", v) 66 | } 67 | fmt.Println() 68 | } 69 | 70 | func TestBackwardMaxMatch(t *testing.T) { 71 | p := NewParticiple() 72 | p.Insert("学") 73 | p.Insert("历") 74 | p.Insert("史") 75 | p.Insert("学") 76 | p.Insert("好") 77 | p.Insert("学历") 78 | p.Insert("历史") 79 | p.Insert("史学") 80 | p.Insert("学好") 81 | 82 | ss := p.BackwardMaxMatch("学历史学好") 83 | for _, v := range ss { 84 | fmt.Printf("%s/", v) 85 | } 86 | fmt.Println() 87 | } 88 | -------------------------------------------------------------------------------- /spider/README.md: -------------------------------------------------------------------------------- 1 | 2 | 这是一个独立的爬虫程序。 3 | 4 | ./spider.conf 是爬虫的配置文件,内容如下: 5 | 6 | { 7 | "EntryPath": "D:/golib/src/github.com/gansidui/gose/spider/config/entry.txt", 8 | "UrlQueuePath": 
"D:/golib/src/github.com/gansidui/gose/spider/config/urlqueue.db", 9 | "FilterExtPath": "D:/golib/src/github.com/gansidui/gose/spider/config/filterext.txt", 10 | "LimitedToThisPath": "D:/golib/src/github.com/gansidui/gose/spider/config/limitedtothis.txt", 11 | "DownWebpagePath": "D:/SearchEngine/down/webpage/", 12 | "DownUrlDataPath": "D:/SearchEngine/down/downurldata.db", 13 | "MaxNum": 20000, 14 | "IntervalTime": "50ms" 15 | } 16 | 17 | 18 | 这是一个json数据结构,需要严格按照json的格式填写。 19 | 20 | 带有 / 后缀的是文件夹,必须保证.txt的格式为UTF-8无BOM,windows下默认是有BOM头的,用notepad转换下无BOM即可。 21 | 22 | .db文件是sqlite数据库。 23 | 24 | 25 | 26 | "EntryPath": 是爬虫的入口地址,存放在一个文本文件中,每行为一个url, 也就是种子地址,url需要带有scheme,例如 http:// 和 https:// 。 27 | 28 | "UrlQueuePath": 是爬虫上次爬行过程中的url队列,这些url等待爬虫的分析,分析url后提取网页中的url,再将提取出来的url插入到队列尾部,如此循环。 29 | 在程序中断后,可以根据UrlQueue中保存的url数据继续爬取互联网,不需要再从入口地址开始爬一遍。这个UrlQueue也可以看做是入口地址。 30 | 数据库中只有一个表,名称为data,字段有 md5, url 。 31 | 32 | "FilterExtPath": 过滤掉指定扩展名的文件,每行一个扩展名。 33 | 34 | "LimitedToThisPath": 指明爬虫仅仅抓取包含了这些子串的url,url只需包含其中一条即可。其他所有的url都将被过滤。每行一个字符串。这条规则允许你可以抓取指定的网站数据。 35 | 36 | "DownWebpagePath": 爬虫抓取下来的网页保存在该文件夹中,文件名为url的md5值。 37 | 38 | "DownUrlDataPath": 是一个sqlite数据库,以前爬过的网页数据,这样爬虫此次爬的时候就会过滤掉这些ulr。数据库中只有一个表,名称为data,字段有 md5, url, path 。 (path为网页的本地存储路径) 39 | 40 | "MaxNum": 预期爬下的网页数 41 | 42 | "IntervalTime": 间歇时间,每次爬取一个网页后休息一会,为0的话也意味着放弃本时间片,抓取频率太高有被封掉的风险。建议设置为 500ms 43 | 44 | 45 | 46 | 爬虫的爬取策略: 47 | 48 | 每次运行爬虫,首先从 DownUrlData 中读取历史数据,然后从 UrlQueue 中读入起始url,再读入 Entry 中的入口地址,当然读入的时候得先判断是否以前爬取了这个网页。 49 | 50 | 多个线程负责爬取网页,一个线程负责将网页数据写入磁盘。为了均衡网站的负荷,每次抓取都暂停 IntervalTime 。 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /analyze/extractutil/extractutil.go: -------------------------------------------------------------------------------- 1 | package extractutil 2 | 3 | import ( 4 | "regexp" 5 | "strings" 6 | ) 7 | 8 | // 提取标题(title) 9 | func ExtractTitle(content string) string { 10 | reg := regexp.MustCompile("?\\s*([^>]*?)\\s*<?/title>") 11 | allsubmatch := reg.FindAllStringSubmatch(content, -1) 12 | ans := "" 13 | for _, v2 := range allsubmatch { 14 | for k, v := range v2 { 15 | if k > 0 { 16 | ans = ans + v 17 | } 18 | } 19 | } 20 | return ans 21 | } 22 | 23 | // 提取正文(body) 24 | func ExtractBody(content string) string { 25 | // 去掉head标签 26 | reg := regexp.MustCompile("<head>([\\s\\S]*?)</head>") 27 | content = reg.ReplaceAllString(content, "$1") 28 | 29 | // 去掉script中的所有内容,包括script标签 30 | reg = regexp.MustCompile("<script>?[\\s\\S]*?</script>") 31 | content = reg.ReplaceAllString(content, "") 32 | 33 | // 去掉style中的所有内容,包括style标签 34 | reg = regexp.MustCompile("<style>?[\\s\\S]*?</style>") 35 | content = reg.ReplaceAllString(content, "") 36 | 37 | // 将td换成空格,li 换成 \t, tr,br,p 换成 \r\n 38 | reg = regexp.MustCompile("<td[^>]*>") 39 | content = reg.ReplaceAllString(content, " ") 40 | rep := strings.NewReplacer("<li>", "\t", "<tr>", "\r\n", "<br>", "\r\n", "<p>", "\r\n") 41 | content = rep.Replace(content) 42 | 43 | // 去掉所有的成对的尖括号<> 44 | reg = regexp.MustCompile("<[^>]*>") 45 | content = reg.ReplaceAllString(content, "") 46 | 47 | // 将 等转义字符替换成相应的符号 48 | rep = strings.NewReplacer("<", "<", ">", ">", "&", "&", " ", " ", """, "\"", "'", "'") 49 | content = rep.Replace(content) 50 | reg = regexp.MustCompile("&#.{2,6};") 51 | content = reg.ReplaceAllString(content, " ") 52 | 53 | // 去掉多余的空行等 54 | reg = regexp.MustCompile(" +") 55 | content = reg.ReplaceAllString(content, " ") 56 | reg = regexp.MustCompile("(\\s*\\t)+") 57 | content 
= reg.ReplaceAllString(content, "\t") 58 | reg = regexp.MustCompile("(\\s*\\r\\n)+") 59 | content = reg.ReplaceAllString(content, "\r\n") 60 | 61 | return content 62 | } 63 | -------------------------------------------------------------------------------- /indexing/participleutil/trie/trie.go: -------------------------------------------------------------------------------- 1 | package trie 2 | 3 | type trieNode struct { 4 | child map[rune]*trieNode 5 | flag bool 6 | } 7 | 8 | func newTrieNode() *trieNode { 9 | return &trieNode{ 10 | child: make(map[rune]*trieNode), 11 | flag: false, 12 | } 13 | } 14 | 15 | type Trie struct { 16 | root *trieNode 17 | num int 18 | } 19 | 20 | func NewTrie() *Trie { 21 | return &Trie{ 22 | root: newTrieNode(), 23 | num: 0, 24 | } 25 | } 26 | 27 | func (this *Trie) Insert(src string) { 28 | curNode := this.root 29 | for _, v := range src { 30 | if curNode.child[v] == nil { 31 | newNode := newTrieNode() 32 | curNode.child[v] = newNode 33 | } 34 | curNode = curNode.child[v] 35 | } 36 | curNode.flag = true 37 | this.num++ 38 | } 39 | 40 | // 若不存在src,则flag为false,且preWordLastIndex保存该路径上离失配地点最近的一个词的最后一个rune的末位置 41 | // 若存在src,则flag为true,且应该忽视preWordLastIndex 42 | func (this *Trie) Find(src string) (flag bool, preWordLastIndex int) { 43 | curNode := this.root 44 | ff := false 45 | for k, v := range src { 46 | if ff { 47 | preWordLastIndex = k 48 | ff = false 49 | } 50 | if curNode.child[v] == nil { 51 | return false, preWordLastIndex 52 | } 53 | curNode = curNode.child[v] 54 | if curNode.flag { 55 | ff = true 56 | } 57 | } 58 | return curNode.flag, preWordLastIndex 59 | } 60 | 61 | func (this *Trie) Num() int { 62 | return this.num 63 | } 64 | 65 | // 正向最大匹配分词,按照词典将src分词,分词结果以[]string形式返回 66 | func (this *Trie) Participle(src string) (target []string) { 67 | if len(src) == 0 { 68 | return 69 | } 70 | 71 | flag, preWordLastIndex, length := false, 0, len(src) 72 | left, right := 0, length 73 | 74 | for left < right { 75 | flag, preWordLastIndex = this.Find(src[left:right]) 76 | preWordLastIndex += left 77 | if flag { 78 | target = append(target, src[left:right]) 79 | left = right 80 | right = length 81 | } else { 82 | if preWordLastIndex == left { 83 | left++ // 多个字节的rune一定会到这里多次 :) 84 | } else { 85 | right = preWordLastIndex 86 | } 87 | } 88 | } 89 | return 90 | } 91 | -------------------------------------------------------------------------------- /indexing/participleutil/datrie/datrie.go: -------------------------------------------------------------------------------- 1 | // double array trie 2 | // base(s) + c --> t 3 | 4 | package datrie 5 | 6 | type trieNode struct { 7 | c rune 8 | flag bool 9 | } 10 | 11 | func newTrieNode() *trieNode { 12 | return &trieNode{ 13 | c: 0, 14 | flag: false, 15 | } 16 | } 17 | 18 | type pair struct { 19 | s *trieNode 20 | c rune 21 | } 22 | 23 | type DATrie struct { 24 | root *trieNode 25 | darr map[pair]*trieNode 26 | num int 27 | } 28 | 29 | func NewDATrie() *DATrie { 30 | return &DATrie{ 31 | root: newTrieNode(), 32 | darr: make(map[pair]*trieNode), 33 | num: 0, 34 | } 35 | } 36 | 37 | func (this *DATrie) Insert(src string) { 38 | curNode := this.root 39 | for _, v := range src { 40 | p := pair{s: curNode, c: v} 41 | if this.darr[p] == nil { 42 | newNode := newTrieNode() 43 | this.darr[p] = newNode 44 | } 45 | curNode = this.darr[p] 46 | } 47 | curNode.flag = true 48 | this.num++ 49 | } 50 | 51 | // 若不存在src,则flag为false,且preWordLastIndex保存该路径上离失配地点最近的一个词的最后一个rune的末位置 52 | // 若存在src,则flag为true,且应该忽视preWordLastIndex 53 | func (this 
*DATrie) Find(src string) (flag bool, preWordLastIndex int) { 54 | curNode := this.root 55 | ff := false 56 | for k, v := range src { 57 | if ff { 58 | preWordLastIndex = k 59 | ff = false 60 | } 61 | p := pair{s: curNode, c: v} 62 | if this.darr[p] == nil { 63 | return false, preWordLastIndex 64 | } 65 | curNode = this.darr[p] 66 | if curNode.flag { 67 | ff = true 68 | } 69 | } 70 | return curNode.flag, preWordLastIndex 71 | } 72 | 73 | func (this *DATrie) Num() int { 74 | return this.num 75 | } 76 | 77 | // 正向最大匹配分词,按照词典将src分词,分词结果以[]string形式返回 78 | func (this *DATrie) Participle(src string) (target []string) { 79 | if len(src) == 0 { 80 | return 81 | } 82 | 83 | flag, preWordLastIndex, length := false, 0, len(src) 84 | left, right := 0, length 85 | 86 | for left < right { 87 | flag, preWordLastIndex = this.Find(src[left:right]) 88 | preWordLastIndex += left 89 | if flag { 90 | target = append(target, src[left:right]) 91 | left = right 92 | right = length 93 | } else { 94 | if preWordLastIndex == left { 95 | left++ // 多个字节的rune一定会到这里多次 :) 96 | } else { 97 | right = preWordLastIndex 98 | } 99 | } 100 | } 101 | return 102 | } 103 | -------------------------------------------------------------------------------- /local-file-search/scan.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "database/sql" 6 | "fmt" 7 | _ "github.com/mattn/go-sqlite3" 8 | "log" 9 | "os" 10 | "path/filepath" 11 | "time" 12 | ) 13 | 14 | type FileInfo struct { 15 | name string 16 | path string 17 | } 18 | 19 | var isSkipDir map[string]bool // 需要过滤的目录 20 | var infoSlice []*FileInfo // 保存所有文件信息 21 | 22 | // 读取需要过滤掉的目录 23 | func ReadSkipDir() { 24 | file, err := os.Open("./skipdir.txt") 25 | if err != nil { 26 | log.Fatal(err) 27 | } 28 | defer file.Close() 29 | 30 | isSkipDir = make(map[string]bool, 0) 31 | re := bufio.NewReader(file) 32 | for { 33 | line, _, err := re.ReadLine() 34 | if err != nil { 35 | break 36 | } 37 | isSkipDir[string(line)] = true 38 | } 39 | } 40 | 41 | func WalkFunc(path string, info os.FileInfo, err error) error { 42 | // 有错误就跳过这个目录 43 | if err != nil { 44 | return filepath.SkipDir 45 | } 46 | // 需要过滤的path 47 | if isSkipDir[path] { 48 | return filepath.SkipDir 49 | } 50 | 51 | infoSlice = append(infoSlice, &FileInfo{name: info.Name(), path: path}) 52 | return nil 53 | } 54 | 55 | func main() { 56 | // 为log添加短文件名,方便查看行数 57 | log.SetFlags(log.Lshortfile | log.LstdFlags) 58 | 59 | // 初始化 60 | ReadSkipDir() 61 | infoSlice = make([]*FileInfo, 0) 62 | 63 | // 遍历所有磁盘,获取文件信息 64 | start := time.Now() 65 | for i := 0; i < 26; i++ { 66 | root := fmt.Sprintf("%c:", 'A'+i) 67 | filepath.Walk(root, WalkFunc) 68 | } 69 | fmt.Println("aquire file info used time:", time.Since(start)) 70 | 71 | // 打开数据库 72 | db, err := sql.Open("sqlite3", "./localfile.db") 73 | if err != nil { 74 | log.Fatal(err) 75 | } 76 | defer db.Close() 77 | 78 | // 先删表,再建表, 如果info不存在,drop时返回的err应该忽略 79 | db.Exec("drop table info") 80 | _, err = db.Exec("create table info(name nvarchar(256), path nvarchar(256))") 81 | if err != nil { 82 | log.Fatal(err) 83 | } 84 | 85 | // 将数据写入到info表中 86 | start = time.Now() 87 | // 开始一个事务 88 | tx, err := db.Begin() 89 | if err != nil { 90 | log.Fatal(err) 91 | } 92 | for _, v := range infoSlice { 93 | _, err := tx.Exec("insert into info(name, path) values(?, ?)", v.name, v.path) 94 | if err != nil { 95 | break 96 | } 97 | } 98 | tx.Commit() 99 | fmt.Println("write data to database used time:", time.Since(start)) 100 | } 
101 | -------------------------------------------------------------------------------- /indexing/doc-mark/doc_mark_main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "database/sql" 5 | "encoding/json" 6 | _ "github.com/mattn/go-sqlite3" 7 | "io/ioutil" 8 | "log" 9 | "os" 10 | "time" 11 | ) 12 | 13 | // 配置文件 14 | type Config struct { 15 | ExtractUrlDataPath string 16 | } 17 | 18 | var conf *Config // 配置 19 | 20 | // 初始化 21 | func init() { 22 | setLogOutput() 23 | conf = NewConfig() 24 | } 25 | 26 | // 读取配置文件 27 | func NewConfig() *Config { 28 | file, err := ioutil.ReadFile("./docmark.conf") 29 | if err != nil { 30 | log.Fatal(err, "\r\n") 31 | } 32 | 33 | var conf Config 34 | err = json.Unmarshal(file, &conf) 35 | if err != nil { 36 | log.Fatal(err, "\r\n") 37 | } 38 | return &conf 39 | } 40 | 41 | // 设置log输出 42 | func setLogOutput() { 43 | // 为log添加短文件名,方便查看行数 44 | log.SetFlags(log.Lshortfile | log.LstdFlags) 45 | logfile, err := os.OpenFile("./docmark.log", os.O_RDWR|os.O_CREATE|os.O_APPEND, os.ModePerm) 46 | // 注意这里不能关闭logfile 47 | if err != nil { 48 | log.Printf("%v\r\n", err) 49 | } 50 | log.SetOutput(logfile) 51 | } 52 | 53 | func main() { 54 | log.Printf("%v\r\n", "start......") 55 | start := time.Now() 56 | 57 | var curMaxId int = 0 // 目前的最大标号 58 | docs := make([]string, 0) // 待标号的文档的md5值 59 | 60 | // 读取待标号文档 61 | docdb, err := sql.Open("sqlite3", conf.ExtractUrlDataPath) 62 | if err != nil { 63 | log.Fatal(err, "\r\n") 64 | } 65 | defer docdb.Close() 66 | 67 | rows, err := docdb.Query("select * from data") 68 | if err != nil { 69 | log.Fatal(err, "\r\n") 70 | } 71 | defer rows.Close() 72 | 73 | var md5 string 74 | for rows.Next() { 75 | rows.Scan(&md5) 76 | docs = append(docs, md5) 77 | } 78 | 79 | // 开始标号 80 | markdb, err := sql.Open("sqlite3", "./docmark.db") 81 | if err != nil { 82 | log.Fatal(err, "\r\n") 83 | } 84 | defer markdb.Close() 85 | 86 | markdb.Exec("create table data(md5 varchar(32) not null primary key, id integer not null)") 87 | // 获取最大标号 88 | rows, err = markdb.Query("select max(id) from data") 89 | if err != nil { 90 | curMaxId = 0 91 | } 92 | rows.Next() 93 | rows.Scan(&curMaxId) 94 | rows.Close() 95 | 96 | tx, err := markdb.Begin() 97 | for _, v := range docs { 98 | curMaxId++ 99 | tx.Exec("insert into data(md5, id) values(?, ?)", v, curMaxId) 100 | } 101 | tx.Commit() 102 | 103 | log.Printf("used time: %v", time.Since(start)) 104 | } 105 | -------------------------------------------------------------------------------- /indexing/participleutil/participle/participle.go: -------------------------------------------------------------------------------- 1 | // 采用了double array trie 2 | // 可以随时替换成 trie 3 | 4 | package participle 5 | 6 | import ( 7 | "github.com/gansidui/gose/indexing/participleutil/datrie" 8 | ) 9 | 10 | type Participle struct { 11 | forwardTrie *datrie.DATrie 12 | backwardTrie *datrie.DATrie 13 | } 14 | 15 | // 正向最大匹配构造一个datrie即可,逆向最大匹配刚好相反,在插入前先反转,查询的时候也先反转再查询 16 | func NewParticiple() *Participle { 17 | p := new(Participle) 18 | p.forwardTrie = datrie.NewDATrie() 19 | p.backwardTrie = datrie.NewDATrie() 20 | return p 21 | } 22 | 23 | func ReverseString(src string) string { 24 | b := []rune(src) 25 | length := len(b) 26 | for i := 0; i < length; i++ { 27 | if i < length-i-1 { 28 | b[i], b[length-i-1] = b[length-i-1], b[i] 29 | } 30 | } 31 | return string(b) 32 | } 33 | 34 | func ReverseStringArray(src []string) { 35 | length := len(src) 36 | for i := 0; i < length; i++ { 
37 | if i < length-i-1 { 38 | src[i], src[length-i-1] = src[length-i-1], src[i] 39 | } 40 | } 41 | } 42 | 43 | func (this *Participle) Insert(src string) { 44 | this.forwardTrie.Insert(src) 45 | // 反转后再插入到backwardTrie中 46 | this.backwardTrie.Insert(ReverseString(src)) 47 | } 48 | 49 | func (this *Participle) ForwardFind(src string) bool { 50 | flag, _ := this.forwardTrie.Find(src) 51 | return flag 52 | } 53 | 54 | func (this *Participle) BackwardFind(src string) bool { 55 | flag, _ := this.backwardTrie.Find(ReverseString(src)) 56 | return flag 57 | } 58 | 59 | func (this *Participle) Num() int { 60 | return this.forwardTrie.Num() 61 | } 62 | 63 | // 正向最大匹配分词,按照dic中的词典将src分词,分词结果以[]string形式返回 64 | func (this *Participle) ForwardMaxMatch(src string) (target []string) { 65 | return this.forwardTrie.Participle(src) 66 | } 67 | 68 | // 逆向最大匹配分词,和正向恰好相反 69 | func (this *Participle) BackwardMaxMatch(src string) (target []string) { 70 | ss := this.backwardTrie.Participle(ReverseString(src)) 71 | ReverseStringArray(ss) 72 | for k, v := range ss { 73 | ss[k] = ReverseString(v) 74 | } 75 | return ss 76 | } 77 | 78 | // 双向最大匹配,就是进行正向 + 逆向最大匹配 79 | // 如果 正反向分词结果一样,说明没有歧义,就是分词成功 80 | // 如果 正反向结果不一样,说明有歧义,就要处理 81 | // 处理策略:一个词减一分,一个单字减一分,选取分高的一种方法, 若正向和逆向相同,则选取逆向 82 | // 遵循最少切分法原则,目的是选取词少的,单字也少的 83 | func (this *Participle) BidirectionalMatch(src string) (target []string) { 84 | fs := this.ForwardMaxMatch(src) 85 | fb := this.BackwardMaxMatch(src) 86 | 87 | fsLen := len(fs) 88 | fbLen := len(fb) 89 | same := true 90 | if fsLen != fbLen { 91 | same = false 92 | } else { 93 | for i := 0; i < fsLen; i++ { 94 | if fs[i] != fb[i] { 95 | same = false 96 | break 97 | } 98 | } 99 | } 100 | 101 | if same { 102 | return fs 103 | } 104 | 105 | fsScore := -fsLen 106 | fbScore := -fbLen 107 | 108 | for i := 0; i < fsLen; i++ { 109 | if len(fs[i]) == 1 { 110 | fsScore-- 111 | } 112 | } 113 | for i := 0; i < fbLen; i++ { 114 | if len(fb[i]) == 1 { 115 | fbScore-- 116 | } 117 | } 118 | 119 | if fsScore > fbScore { 120 | return fs 121 | } 122 | return fb 123 | } 124 | -------------------------------------------------------------------------------- /indexing/calculate-idf/calculate_idf_main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "database/sql" 5 | "encoding/json" 6 | "fmt" 7 | "github.com/gansidui/gose/indexing/participleutil" 8 | _ "github.com/mattn/go-sqlite3" 9 | "io/ioutil" 10 | "log" 11 | "math" 12 | "os" 13 | "time" 14 | ) 15 | 16 | // 配置文件 17 | type Config struct { 18 | ExtractWebpagePath string 19 | ExtractUrlDataPath string 20 | } 21 | 22 | var conf *Config // 配置 23 | var numFile int // 语料库的文档总数 24 | var wordMap map[string]int // 包含该词的文档数 25 | 26 | // 初始化 27 | func init() { 28 | setLogOutput() 29 | participleutil.LoadDic("../participleutil/mydic.txt") 30 | conf = NewConfig() 31 | numFile = 0 32 | wordMap = make(map[string]int) 33 | } 34 | 35 | // 读取配置文件 36 | func NewConfig() *Config { 37 | file, err := ioutil.ReadFile("./idf.conf") 38 | if err != nil { 39 | log.Fatal(err, "\r\n") 40 | } 41 | 42 | var conf Config 43 | err = json.Unmarshal(file, &conf) 44 | if err != nil { 45 | log.Fatal(err, "\r\n") 46 | } 47 | return &conf 48 | } 49 | 50 | // 设置log输出 51 | func setLogOutput() { 52 | // 为log添加短文件名,方便查看行数 53 | log.SetFlags(log.Lshortfile | log.LstdFlags) 54 | logfile, err := os.OpenFile("./idf.log", os.O_RDWR|os.O_CREATE|os.O_APPEND, os.ModePerm) 55 | // 注意这里不能关闭logfile 56 | if err != nil { 57 | log.Printf("%v\r\n", err) 58 | } 59 | 
log.SetOutput(logfile) 60 | } 61 | 62 | // 计算逆文档频率 63 | func calculateIDF() { 64 | start := time.Now() 65 | 66 | db, err := sql.Open("sqlite3", conf.ExtractUrlDataPath) 67 | if err != nil { 68 | log.Fatal(err, "\r\n") 69 | } 70 | defer db.Close() 71 | 72 | rows, err := db.Query("select * from data") 73 | if err != nil { 74 | log.Fatal(err, "\r\n") 75 | } 76 | defer rows.Close() 77 | 78 | var md5 string 79 | for rows.Next() { 80 | rows.Scan(&md5) 81 | // 计数 82 | numFile++ 83 | fmt.Println("numFile:", numFile) 84 | 85 | // 读取文档 86 | content, _ := ioutil.ReadFile(conf.ExtractWebpagePath + md5 + "_body.txt") 87 | // 得到分词结果 88 | ss := participleutil.Participle(string(content)) 89 | // 去重 90 | m := make(map[string]bool) 91 | for _, v := range ss { 92 | if !m[v] { 93 | m[v] = true 94 | } 95 | } 96 | // 保存结果 97 | for k, _ := range m { 98 | wordMap[k]++ 99 | } 100 | } 101 | 102 | fmt.Println("calculateIDF used time:", time.Since(start)) 103 | } 104 | 105 | // 将逆文档频率信息保存到数据库 106 | func writeDatabase() { 107 | start := time.Now() 108 | 109 | // 打开数据库 110 | db, err := sql.Open("sqlite3", "./idf.db") 111 | if err != nil { 112 | log.Fatal(err, "\r\n") 113 | } 114 | defer db.Close() 115 | 116 | // 先删表再建表 117 | db.Exec("drop table data") 118 | _, err = db.Exec("create table data(word varchar(30), idf float)") 119 | if err != nil { 120 | log.Fatal(err, "\r\n") 121 | } 122 | 123 | // 启动事务 124 | tx, err := db.Begin() 125 | if err != nil { 126 | log.Fatal(err, "\r\n") 127 | } 128 | for k, v := range wordMap { 129 | idf := math.Log10(float64(numFile) / float64(v+1)) 130 | if idf < 0 { 131 | idf = 0 132 | } 133 | _, err := tx.Exec("insert into data(word, idf) values(?, ?)", k, idf) 134 | if err != nil { 135 | log.Fatal(err, "\r\n") 136 | } 137 | } 138 | tx.Commit() 139 | 140 | fmt.Println("write database used time:", time.Since(start)) 141 | } 142 | 143 | // 将逆文档频率信息保存到文件 144 | func writeFile() { 145 | start := time.Now() 146 | 147 | file, err := os.OpenFile("./idf.txt", os.O_RDWR|os.O_CREATE, os.ModePerm) 148 | if err != nil { 149 | log.Fatal(err, "\r\n") 150 | } 151 | defer file.Close() 152 | 153 | for k, v := range wordMap { 154 | idf := math.Log10(float64(numFile) / float64(v+1)) 155 | if idf < 0 { 156 | idf = 0 157 | } 158 | file.WriteString(k + "-----" + fmt.Sprintf("%f", idf) + "\r\n") 159 | } 160 | 161 | fmt.Println("write file used time:", time.Since(start)) 162 | } 163 | 164 | func main() { 165 | log.Printf("%v\r\n", "start......") 166 | start := time.Now() 167 | calculateIDF() 168 | writeDatabase() 169 | writeFile() 170 | log.Printf("used time: %v", time.Since(start)) 171 | } 172 | -------------------------------------------------------------------------------- /analyze/analyze_main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "database/sql" 5 | "encoding/json" 6 | "github.com/gansidui/gose/analyze/extractutil" 7 | _ "github.com/mattn/go-sqlite3" 8 | "io/ioutil" 9 | "log" 10 | "os" 11 | "path/filepath" 12 | "time" 13 | ) 14 | 15 | // 配置文件 16 | type Config struct { 17 | DownUrlDataPath string 18 | ExtractWebpagePath string 19 | ExtractUrlDataPath string 20 | } 21 | 22 | // 分析 23 | type Analyze struct { 24 | conf *Config // 配置信息 25 | doneUrls map[string]bool // 已经分析过的网页,用md5标记 26 | } 27 | 28 | // 返回一个初始化了的Analyze实例 29 | func NewAnalyze() *Analyze { 30 | var an Analyze 31 | an.conf = NewConfig() 32 | an.doneUrls = make(map[string]bool) 33 | an.readExtractUrlData() 34 | return &an 35 | } 36 | 37 | // 读取已经分析过的网页数据 38 | func 
(this *Analyze) readExtractUrlData() { 39 | // 创建父目录 40 | err := os.MkdirAll(filepath.Dir(this.conf.ExtractUrlDataPath), os.ModePerm) 41 | if err != nil { 42 | log.Fatal(err, "\r\n") 43 | } 44 | 45 | // 打开数据库 46 | db, err := sql.Open("sqlite3", this.conf.ExtractUrlDataPath) 47 | if err != nil { 48 | log.Fatal(err, "\r\n") 49 | } 50 | defer db.Close() 51 | 52 | // 先建表,若表已经存在则建表失败 53 | db.Exec("create table data(md5 varchar(32))") 54 | 55 | // 读取数据 56 | rows, err := db.Query("select * from data") 57 | if err != nil { 58 | log.Fatal(err, "\r\n") 59 | } 60 | defer rows.Close() 61 | 62 | var md5 string 63 | for rows.Next() { 64 | rows.Scan(&md5) 65 | this.doneUrls[md5] = true 66 | } 67 | } 68 | 69 | // 开始网页分析 70 | func (this *Analyze) Do() { 71 | // 创建父目录 72 | err := os.MkdirAll(filepath.Dir(this.conf.ExtractWebpagePath), os.ModePerm) 73 | if err != nil { 74 | log.Fatal(err, "\r\n") 75 | } 76 | 77 | // 打开保存下载网页信息数据的数据库 78 | db, err := sql.Open("sqlite3", this.conf.DownUrlDataPath) 79 | if err != nil { 80 | log.Fatal(err, "\r\n") 81 | } 82 | defer db.Close() 83 | 84 | // 打开保存分析记录的数据库 85 | exdb, err := sql.Open("sqlite3", this.conf.ExtractUrlDataPath) 86 | if err != nil { 87 | log.Fatal(err, "\r\n") 88 | } 89 | defer exdb.Close() 90 | 91 | // 读取数据 92 | rows, err := db.Query("select * from data") 93 | if err != nil { 94 | log.Fatal(err, "\r\n") 95 | } 96 | defer rows.Close() 97 | 98 | tx, err := exdb.Begin() // 启动事务 99 | if err != nil { 100 | log.Fatal(err, "\r\n") 101 | } 102 | 103 | var md5, url, path string 104 | for rows.Next() { 105 | rows.Scan(&md5, &url, &path) 106 | if !this.doneUrls[md5] { 107 | content, err := ioutil.ReadFile(path) 108 | if err != nil { 109 | log.Fatal(err, "\r\n") 110 | } 111 | 112 | // 抽取标题和正文,并写入文件 113 | title := extractutil.ExtractTitle(string(content)) 114 | ioutil.WriteFile(this.conf.ExtractWebpagePath+md5+"_title.txt", []byte(title), os.ModePerm) 115 | 116 | body := extractutil.ExtractBody(string(content)) 117 | ioutil.WriteFile(this.conf.ExtractWebpagePath+md5+"_body.txt", []byte(body), os.ModePerm) 118 | 119 | // 标记已经分析过了并写入数据库 120 | this.doneUrls[md5] = true 121 | _, err = tx.Exec("insert into data(md5) values(?)", md5) 122 | if err != nil { 123 | log.Fatal(err, "\r\n") 124 | } 125 | } 126 | } 127 | tx.Commit() 128 | } 129 | 130 | // 初始化 131 | func init() { 132 | setLogOutput() 133 | } 134 | 135 | // 读取配置文件 136 | func NewConfig() *Config { 137 | file, err := ioutil.ReadFile("./analyze.conf") 138 | if err != nil { 139 | log.Fatal(err, "\r\n") 140 | } 141 | 142 | var conf Config 143 | err = json.Unmarshal(file, &conf) 144 | if err != nil { 145 | log.Fatal(err, "\r\n") 146 | } 147 | return &conf 148 | } 149 | 150 | // 设置log输出 151 | func setLogOutput() { 152 | // 为log添加短文件名,方便查看行数 153 | log.SetFlags(log.Lshortfile | log.LstdFlags) 154 | logfile, err := os.OpenFile("./analyze.log", os.O_RDWR|os.O_CREATE|os.O_APPEND, os.ModePerm) 155 | // 注意这里不能关闭logfile 156 | if err != nil { 157 | log.Printf("%v\r\n", err) 158 | } 159 | log.SetOutput(logfile) 160 | } 161 | 162 | func main() { 163 | log.Printf("%v\r\n", "start......") 164 | start := time.Now() 165 | an := NewAnalyze() 166 | an.Do() 167 | log.Printf("used time: %v", time.Since(start)) 168 | } 169 | -------------------------------------------------------------------------------- /indexing/calculate-tf-idf/calculate_tf_idf_main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "database/sql" 5 | "encoding/json" 6 | "fmt" 7 | 
"github.com/gansidui/gose/indexing/participleutil" 8 | _ "github.com/mattn/go-sqlite3" 9 | "io/ioutil" 10 | "log" 11 | "os" 12 | "time" 13 | ) 14 | 15 | // 配置文件 16 | type Config struct { 17 | ExtractWebpagePath string 18 | ExtractUrlDataPath string 19 | } 20 | 21 | var conf *Config // 配置 22 | var numFile int // 文档数量 23 | var numWord int // 关键词数量 24 | var wordIDF map[string]float32 // word -- IDF值 25 | var docID map[string]int // doc -- id 文档对应的标号 26 | var words []string // 关键词 27 | var docids []int // 文档标号 28 | var tfidfs []float32 // TF-IDF值 29 | 30 | // 初始化 31 | func init() { 32 | setLogOutput() 33 | participleutil.LoadDic("../participleutil/mydic.txt") 34 | conf = NewConfig() 35 | numFile = 0 36 | numWord = 0 37 | wordIDF = make(map[string]float32) 38 | docID = make(map[string]int) 39 | words = make([]string, 0) 40 | docids = make([]int, 0) 41 | tfidfs = make([]float32, 0) 42 | readIDF() 43 | readDocId() 44 | initDatabase() 45 | } 46 | 47 | // 读取配置文件 48 | func NewConfig() *Config { 49 | file, err := ioutil.ReadFile("./tfidf.conf") 50 | if err != nil { 51 | log.Fatal(err, "\r\n") 52 | } 53 | 54 | var conf Config 55 | err = json.Unmarshal(file, &conf) 56 | if err != nil { 57 | log.Fatal(err, "\r\n") 58 | } 59 | return &conf 60 | } 61 | 62 | // 设置log输出 63 | func setLogOutput() { 64 | // 为log添加短文件名,方便查看行数 65 | log.SetFlags(log.Lshortfile | log.LstdFlags) 66 | logfile, err := os.OpenFile("./tfidf.log", os.O_RDWR|os.O_CREATE|os.O_APPEND, os.ModePerm) 67 | // 注意这里不能关闭logfile 68 | if err != nil { 69 | log.Printf("%v\r\n", err) 70 | } 71 | log.SetOutput(logfile) 72 | } 73 | 74 | // 读取 IDF 值 75 | func readIDF() { 76 | db, err := sql.Open("sqlite3", "../calculate-idf/idf.db") 77 | if err != nil { 78 | log.Fatal(err, "\r\n") 79 | } 80 | defer db.Close() 81 | 82 | rows, err := db.Query("select * from data") 83 | if err != nil { 84 | log.Fatal(err, "\r\n") 85 | } 86 | defer rows.Close() 87 | 88 | var word string 89 | var idf float32 90 | for rows.Next() { 91 | rows.Scan(&word, &idf) 92 | wordIDF[word] = idf 93 | } 94 | } 95 | 96 | // 读取文档编号 97 | func readDocId() { 98 | db, err := sql.Open("sqlite3", "../doc-mark/docmark.db") 99 | if err != nil { 100 | log.Fatal(err, "\r\n") 101 | } 102 | defer db.Close() 103 | 104 | rows, err := db.Query("select * from data") 105 | if err != nil { 106 | log.Fatal(err, "\r\n") 107 | } 108 | defer rows.Close() 109 | 110 | var md5 string 111 | var id int 112 | for rows.Next() { 113 | rows.Scan(&md5, &id) 114 | docID[md5] = id 115 | } 116 | } 117 | 118 | // 计算TF-IDF 119 | func calculateTFIDF() { 120 | start := time.Now() 121 | // 读取文档数据 122 | db, err := sql.Open("sqlite3", conf.ExtractUrlDataPath) 123 | if err != nil { 124 | log.Fatal(err, "\r\n") 125 | } 126 | defer db.Close() 127 | 128 | rows, err := db.Query("select * from data") 129 | if err != nil { 130 | log.Fatal(err, "\r\n") 131 | } 132 | defer rows.Close() 133 | 134 | var md5 string 135 | for rows.Next() { 136 | rows.Scan(&md5) 137 | // 计数 138 | numFile++ 139 | fmt.Println("numFile:", numFile) 140 | 141 | // 读取正文文档 142 | content, _ := ioutil.ReadFile(conf.ExtractWebpagePath + md5 + "_body.txt") 143 | // 得到分词结果 144 | ss := participleutil.Participle(string(content)) 145 | totalWord := len(ss) // 文档的总词数 146 | // 统计每个词在这篇文档中出现的次数 147 | m := make(map[string]int) 148 | for _, v := range ss { 149 | m[v]++ 150 | } 151 | 152 | // 读取标题文档 153 | content, _ = ioutil.ReadFile(conf.ExtractWebpagePath + md5 + "_title.txt") 154 | ss = participleutil.Participle(string(content)) 155 | for _, v := range ss { 156 | m[v] += 5 157 | } 
158 | 159 | docid := docID[md5] // 文档ID 160 | 161 | for k, v := range m { 162 | tf := float32(float32(v) / float32(totalWord)) // 词频 163 | idf := wordIDF[k] // 逆文档频率 164 | words = append(words, k) // 关键词 165 | docids = append(docids, docid) // 文档标号 166 | tfidfs = append(tfidfs, tf*idf) // TF-IDF值 167 | numWord++ 168 | if numWord%2000000 == 0 { 169 | writeDatabase() 170 | words = []string{} 171 | docids = []int{} 172 | tfidfs = []float32{} 173 | } 174 | } 175 | } 176 | 177 | writeDatabase() 178 | fmt.Println("calculateTFIDF used time:", time.Since(start)) 179 | } 180 | 181 | // 初始化数据库 182 | func initDatabase() { 183 | // 打开数据库 184 | db, err := sql.Open("sqlite3", "./tfidf.db") 185 | if err != nil { 186 | log.Fatal(err, "\r\n") 187 | } 188 | defer db.Close() 189 | 190 | // 先删表再建表 191 | db.Exec("drop table data") 192 | _, err = db.Exec("create table data(word varchar(30), docid integer, tfidf float)") 193 | if err != nil { 194 | log.Fatal(err, "\r\n") 195 | } 196 | } 197 | 198 | // 将TF-IDF信息保存到数据库 199 | func writeDatabase() { 200 | // 打开数据库 201 | db, err := sql.Open("sqlite3", "./tfidf.db") 202 | if err != nil { 203 | log.Fatal(err, "\r\n") 204 | } 205 | defer db.Close() 206 | 207 | // 启动事务 208 | tx, err := db.Begin() 209 | if err != nil { 210 | log.Fatal(err, "\r\n") 211 | } 212 | 213 | for k, _ := range words { 214 | tx.Exec("insert into data(word, docid, tfidf) values(?, ?, ?)", words[k], docids[k], tfidfs[k]) 215 | } 216 | 217 | tx.Commit() 218 | } 219 | 220 | func main() { 221 | log.Printf("%v\r\n", "start......") 222 | start := time.Now() 223 | 224 | calculateTFIDF() 225 | 226 | log.Printf("记录总数: %d\r\n", numWord) 227 | log.Printf("平均每篇文档有%d个不同的词\r\n", numWord/numFile) 228 | log.Printf("used time: %v", time.Since(start)) 229 | } 230 | -------------------------------------------------------------------------------- /search/search.go: -------------------------------------------------------------------------------- 1 | package search 2 | 3 | import ( 4 | "database/sql" 5 | "encoding/json" 6 | "github.com/gansidui/gose/indexing/participleutil" 7 | _ "github.com/mattn/go-sqlite3" 8 | "html/template" 9 | "io/ioutil" 10 | "log" 11 | "sort" 12 | "strings" 13 | ) 14 | 15 | // 配置文件 16 | type Config struct { 17 | DownUrlDataPath string 18 | ExtractWebpagePath string 19 | ExtractUrlDataPath string 20 | DicPath string 21 | TfIdfPath string 22 | DocMarkPath string 23 | } 24 | 25 | // 关键词索引信息 26 | type WordIndexInfo struct { 27 | id int // 文档编号 28 | tfidf float32 // TF-IDF值 29 | } 30 | 31 | // 网页信息 32 | type UrlInfo struct { 33 | url string // 原始url 34 | path string // 本地保存路径 35 | } 36 | 37 | // 搜索结果信息 38 | type SearchResultInfo struct { 39 | id int // 文档编号 40 | tfidfs float32 // 该文档针对搜索串中的每个关键词的TF_IDF值之和 41 | } 42 | 43 | // 文章信息 44 | type ArticleInfo struct { 45 | Title template.HTML // 标题 46 | Summary template.HTML // 摘要 47 | Url string // 原始url 48 | Path string // 本地路径 49 | } 50 | 51 | // word --> []*WordIndexInfo 按 id 从小到大排序,便于二分查找 52 | type ById []*WordIndexInfo 53 | 54 | func (a ById) Len() int { return len(a) } 55 | func (a ById) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 56 | func (a ById) Less(i, j int) bool { return a[i].id < a[j].id } 57 | 58 | // 按 tfidfs 从大到小排序 59 | type ByTfIdfs []*SearchResultInfo 60 | 61 | func (a ByTfIdfs) Len() int { return len(a) } 62 | func (a ByTfIdfs) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 63 | func (a ByTfIdfs) Less(i, j int) bool { return a[i].tfidfs > a[j].tfidfs } 64 | 65 | // 按字符串长度从大到小排序 66 | type ByLength []string 67 | 68 | func (a ByLength) Len() int 
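// A tiny illustration (new code, not from the repo) of the sort.Interface pattern used by
// ById, ByTfIdfs and ByLength in search.go: define Len/Swap/Less on a slice type and hand it
// to sort.Sort. ByScore below is a hypothetical analogue of ByTfIdfs that orders search hits
// by descending score.
package main

import (
	"fmt"
	"sort"
)

type hit struct {
	id    int
	score float32
}

// ByScore sorts hits from highest to lowest score.
type ByScore []hit

func (a ByScore) Len() int           { return len(a) }
func (a ByScore) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
func (a ByScore) Less(i, j int) bool { return a[i].score > a[j].score }

func main() {
	hits := []hit{{id: 3, score: 0.2}, {id: 1, score: 0.9}, {id: 2, score: 0.5}}
	sort.Sort(ByScore(hits))
	fmt.Println(hits) // [{1 0.9} {2 0.5} {3 0.2}]
}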
{ return len(a) } 69 | func (a ByLength) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 70 | func (a ByLength) Less(i, j int) bool { return len(a[i]) > len(a[j]) } 71 | 72 | var conf *Config // 配置 73 | var wordMapIndexInfo map[string][]*WordIndexInfo // word --> []*WordIndexInfo 74 | var idMapMd5 map[int]string // id --> md5 75 | var md5MapUrlInfo map[string]*UrlInfo // md5 --> *UrlInfo 76 | 77 | // 读取配置文件 78 | func ReadConfig(confpath string) { 79 | file, err := ioutil.ReadFile(confpath) 80 | if err != nil { 81 | log.Fatal(err, "\r\n") 82 | } 83 | 84 | err = json.Unmarshal(file, &conf) 85 | if err != nil { 86 | log.Fatal(err, "\r\n") 87 | } 88 | } 89 | 90 | // 初始化 91 | func InitSearch() { 92 | participleutil.LoadDic(conf.DicPath) 93 | wordMapIndexInfo = make(map[string][]*WordIndexInfo) 94 | idMapMd5 = make(map[int]string) 95 | md5MapUrlInfo = make(map[string]*UrlInfo) 96 | readWordMapIndexInfo() 97 | readIdMapMd5() 98 | readMd5MapUrlInfo() 99 | } 100 | 101 | // 读取 word --> []*WordIndexInfo, 并排序 102 | func readWordMapIndexInfo() { 103 | db, err := sql.Open("sqlite3", conf.TfIdfPath) 104 | if err != nil { 105 | log.Fatal(err, "\r\n") 106 | } 107 | defer db.Close() 108 | 109 | rows, err := db.Query("select * from data") 110 | if err != nil { 111 | log.Fatal(err, "\r\n") 112 | } 113 | defer rows.Close() 114 | 115 | var ( 116 | word string 117 | id int 118 | tfidf float32 119 | ) 120 | 121 | for rows.Next() { 122 | rows.Scan(&word, &id, &tfidf) 123 | wordMapIndexInfo[word] = append(wordMapIndexInfo[word], &WordIndexInfo{id: id, tfidf: tfidf}) 124 | } 125 | 126 | // 排序 127 | for _, v := range wordMapIndexInfo { 128 | sort.Sort(ById(v)) 129 | } 130 | } 131 | 132 | // 读取 id --> md5 133 | func readIdMapMd5() { 134 | db, err := sql.Open("sqlite3", conf.DocMarkPath) 135 | if err != nil { 136 | log.Fatal(err, "\r\n") 137 | } 138 | defer db.Close() 139 | 140 | rows, err := db.Query("select * from data") 141 | if err != nil { 142 | log.Fatal(err, "\r\n") 143 | } 144 | defer rows.Close() 145 | 146 | var ( 147 | md5 string 148 | id int 149 | ) 150 | 151 | for rows.Next() { 152 | rows.Scan(&md5, &id) 153 | idMapMd5[id] = md5 154 | } 155 | } 156 | 157 | // 读取 md5 --> *UrlInfo 158 | func readMd5MapUrlInfo() { 159 | // 先读取已经分析过了的网页数据 160 | extracted := make(map[string]bool) 161 | exdb, err := sql.Open("sqlite3", conf.ExtractUrlDataPath) 162 | if err != nil { 163 | log.Fatal(err, "\r\n") 164 | } 165 | defer exdb.Close() 166 | 167 | exrows, err := exdb.Query("select * from data") 168 | if err != nil { 169 | log.Fatal(err, "\r\n") 170 | } 171 | defer exrows.Close() 172 | 173 | var md5 string 174 | 175 | for exrows.Next() { 176 | exrows.Scan(&md5) 177 | extracted[md5] = true 178 | } 179 | 180 | // 读取 已经分析过了的网页数据 的 UrlInfo 181 | db, err := sql.Open("sqlite3", conf.DownUrlDataPath) 182 | if err != nil { 183 | log.Fatal(err, "\r\n") 184 | } 185 | defer db.Close() 186 | 187 | rows, err := db.Query("select * from data") 188 | if err != nil { 189 | log.Fatal(err, "\r\n") 190 | } 191 | defer rows.Close() 192 | 193 | var ( 194 | url string 195 | path string 196 | ) 197 | 198 | for rows.Next() { 199 | rows.Scan(&md5, &url, &path) 200 | if extracted[md5] { 201 | md5MapUrlInfo[md5] = &UrlInfo{url: url, path: path} 202 | } 203 | } 204 | } 205 | 206 | // 根据id得到 *UrlInfo 207 | func getUrlInfoById(id int) *UrlInfo { 208 | md5 := idMapMd5[id] 209 | return md5MapUrlInfo[md5] 210 | } 211 | 212 | // 根据word和id得到 tfidf, 二分查找 213 | func getTfIdfByWordId(word string, id int) (result float32) { 214 | left, right := 0, 
len(wordMapIndexInfo[word])-1 215 | var mid, tmpid int 216 | 217 | for left <= right { 218 | mid = (left + right) >> 1 219 | tmpid = wordMapIndexInfo[word][mid].id 220 | if tmpid < id { 221 | left = mid + 1 222 | } else if tmpid == id { 223 | result = wordMapIndexInfo[word][mid].tfidf 224 | return result 225 | } else { 226 | right = mid - 1 227 | } 228 | } 229 | return -1 230 | } 231 | 232 | // 求并集 233 | func union(sets [][]int) (result []int) { 234 | m := make(map[int]bool) 235 | for _, v := range sets { 236 | for _, vv := range v { 237 | m[vv] = true 238 | } 239 | } 240 | for k, _ := range m { 241 | result = append(result, k) 242 | } 243 | return result 244 | } 245 | 246 | // 求交集 247 | func inter(sets [][]int) (result []int) { 248 | m := make(map[int]int) 249 | for _, v := range sets { 250 | for _, vv := range v { 251 | m[vv]++ 252 | } 253 | } 254 | setNum := len(sets) 255 | for k, v := range m { 256 | if v >= setNum { 257 | result = append(result, k) 258 | } 259 | } 260 | return result 261 | } 262 | 263 | // 对字符串数组去重 264 | func clearRepeat(ss []string) (result []string) { 265 | m := make(map[string]bool) 266 | for _, v := range ss { 267 | if !m[v] { 268 | m[v] = true 269 | result = append(result, v) 270 | } 271 | } 272 | return result 273 | } 274 | 275 | // 搜索,返回 [start, end] 之间的结果(文档id集合)以及搜索到的文档总数 276 | func search(searchString string, start, end int) (result []int, total int) { 277 | // 先按空格将搜索串分成多个句子,并过滤掉空句子 278 | var sentences []string 279 | texts := strings.Split(searchString, " ") 280 | for _, sen := range texts { 281 | if sen != "" { 282 | sentences = append(sentences, sen) 283 | } 284 | } 285 | 286 | var ( 287 | tempResult []int // 临时结果 288 | words []string // 搜索串的关键词集合 289 | searchResultInfo []*SearchResultInfo // 用来根据tfidfs排序 290 | ) 291 | 292 | // 对每个句子进行分词,句子内对每个词的id集合求并集,句子间对id集合求交集 293 | var outidsets [][]int 294 | for _, sen := range sentences { 295 | ws := participleutil.Participle(sen) 296 | var inidsets [][]int 297 | 298 | for _, w := range ws { 299 | var ids []int 300 | for _, v := range wordMapIndexInfo[w] { 301 | ids = append(ids, v.id) 302 | } 303 | inidsets = append(inidsets, ids) 304 | words = append(words, w) 305 | } 306 | 307 | outidsets = append(outidsets, union(inidsets)) // 对句内的集合求并集 308 | } 309 | tempResult = inter(outidsets) // 对句间的集合求交集 310 | 311 | // 对tempResult进行排序 312 | words = clearRepeat(words) // 去重 313 | for _, id := range tempResult { 314 | var tfidfs float32 = 0.0 315 | for _, w := range words { 316 | tfidfs += getTfIdfByWordId(w, id) 317 | } 318 | searchResultInfo = append(searchResultInfo, &SearchResultInfo{id: id, tfidfs: tfidfs}) 319 | } 320 | sort.Sort(ByTfIdfs(searchResultInfo)) 321 | 322 | // 选取 [start, end] 之间的id作为结果 323 | if start < 0 { 324 | start = 0 325 | } 326 | if end >= len(searchResultInfo) { 327 | end = len(searchResultInfo) - 1 328 | } 329 | for i := start; i <= end; i++ { 330 | result = append(result, searchResultInfo[i].id) 331 | } 332 | total = len(searchResultInfo) 333 | 334 | return result, total 335 | } 336 | 337 | // 将关键词标红 338 | func markRedKeywords(content string, keywords []string) (result string) { 339 | patterns := []string{} 340 | sort.Sort(ByLength(keywords)) 341 | for _, oldstr := range keywords { 342 | patterns = append(patterns, oldstr) 343 | newstr := "<font color=\"red\">" + oldstr + "</font>" 344 | patterns = append(patterns, newstr) 345 | } 346 | replacer := strings.NewReplacer(patterns...) 
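// A compact model (new, stdlib-only code) of the query evaluation in search() above: within a
// sentence the id sets of its words are united, the per-sentence sets are intersected, and the
// surviving documents are ranked by the sum of their TF-IDF values over the distinct query
// words. sentences is the already-tokenized query (the original builds it with strings.Split
// on spaces plus participleutil.Participle); postings stands in for wordMapIndexInfo.
package main

import (
	"fmt"
	"sort"
)

func rank(sentences [][]string, postings map[string]map[int]float64) []int {
	counts := map[int]int{}    // doc id -> number of sentences that matched it
	words := map[string]bool{} // all distinct query words, for scoring
	for _, sentence := range sentences {
		inSentence := map[int]bool{} // union over the words of one sentence
		for _, w := range sentence {
			words[w] = true
			for id := range postings[w] {
				inSentence[id] = true
			}
		}
		for id := range inSentence {
			counts[id]++
		}
	}

	type hit struct {
		id    int
		score float64
	}
	var hits []hit
	for id, c := range counts {
		if c < len(sentences) { // keep only ids present in every sentence's set (intersection)
			continue
		}
		score := 0.0
		for w := range words {
			score += postings[w][id]
		}
		hits = append(hits, hit{id, score})
	}
	sort.Slice(hits, func(i, j int) bool { return hits[i].score > hits[j].score })

	ids := make([]int, len(hits))
	for i, h := range hits {
		ids[i] = h.id
	}
	return ids
}

func main() {
	postings := map[string]map[int]float64{
		"go":     {1: 0.2, 2: 0.1},
		"search": {1: 0.5, 3: 0.4},
	}
	fmt.Println(rank([][]string{{"go"}, {"search"}}, postings)) // only doc 1 matches both sentences
}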
347 | result = replacer.Replace(content) 348 | return result 349 | } 350 | 351 | // 向调用者返回搜索结果 352 | // searchString为搜索串,返回第[start, end]篇文档的信息,result保存文档信息, total为搜索到的文档总数 353 | func GetSearchResult(searchString string, start, end int) (result []ArticleInfo, total int) { 354 | // 定义一个提取摘要的函数, 提取前100个rune 355 | getSummary := func(content string) string { 356 | num := 0 357 | var res string 358 | for _, v := range content { 359 | s := string(v) 360 | if s != " " && s != "\t" && s != "\r" && s != "\n" { 361 | num++ 362 | res = res + s 363 | if num > 100 { 364 | break 365 | } 366 | } 367 | } 368 | return res 369 | } 370 | 371 | // 得到分词用来结果标红 372 | keywords := participleutil.Participle(searchString) 373 | 374 | res, tot := search(searchString, start, end) 375 | total = tot 376 | var articleInfo ArticleInfo 377 | for _, id := range res { 378 | md5 := idMapMd5[id] 379 | urlInfo := md5MapUrlInfo[md5] 380 | 381 | articleInfo.Url = urlInfo.url 382 | articleInfo.Path = urlInfo.path 383 | 384 | content, err := ioutil.ReadFile(conf.ExtractWebpagePath + md5 + "_body.txt") 385 | if err != nil { 386 | log.Printf("%v\r\n", err) 387 | } else { 388 | articleInfo.Summary = template.HTML(markRedKeywords(getSummary(string(content)), keywords)) 389 | } 390 | 391 | title, err := ioutil.ReadFile(conf.ExtractWebpagePath + md5 + "_title.txt") 392 | if err != nil { 393 | log.Printf("%v\r\n", err) 394 | } else { 395 | articleInfo.Title = template.HTML(markRedKeywords(string(title), keywords)) 396 | } 397 | 398 | result = append(result, articleInfo) 399 | } 400 | 401 | return result, total 402 | } 403 | -------------------------------------------------------------------------------- /spider/spider_main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "crypto/md5" 6 | "database/sql" 7 | "encoding/json" 8 | "errors" 9 | "fmt" 10 | _ "github.com/mattn/go-sqlite3" 11 | "io" 12 | "io/ioutil" 13 | "log" 14 | "net/http" 15 | "os" 16 | "path/filepath" 17 | "regexp" 18 | "runtime" 19 | "strings" 20 | "sync" 21 | "sync/atomic" 22 | "time" 23 | ) 24 | 25 | // 配置文件 26 | type Config struct { 27 | EntryPath string 28 | UrlQueuePath string 29 | FilterExtPath string 30 | LimitedToThisPath string 31 | DownWebpagePath string 32 | DownUrlDataPath string 33 | MaxNum int32 34 | IntervalTime string 35 | } 36 | 37 | // 网页信息 38 | type UrlInfo struct { 39 | md5 string // 修剪之后的url的md5值,如,md5(www.baidu.com) 40 | url string // 完整url, http://www.baidu.com/ 41 | path string // 网页本地保存路径 42 | content string // 网页的数据 43 | } 44 | 45 | // 爬虫 46 | type Spider struct { 47 | conf *Config // 配置信息 48 | filterExts map[string]bool // 扩展名过滤 49 | limitedToThis []string // 仅仅抓取包含了其中至少一个子串的url,若limitedToThis为空,则该规则无效 50 | doneUrls map[string]bool // 已经爬下的url,用md5标记 51 | exceptionUrls map[string]bool // 异常url以及被过滤的url等,用md5标记 52 | chUrlsInfo chan *UrlInfo // 爬取到的所有url的Info 53 | chUrl chan string // 存储url,供多个gorountine去处理 54 | chHttp chan bool // 控制同时下载url的gorountine数量 55 | chStopIO chan bool // 主线程通知结束磁盘IO gorountine 56 | chExit chan bool // 磁盘IO结束后再通知主线程结束 57 | wg sync.WaitGroup // 等待所有gorountine结束 58 | pageNum int32 // 当前爬取的网页数量 59 | intervalTime time.Duration // 间歇时间,如 "5ms" 60 | } 61 | 62 | // 返回一个初始化了的Spider实例 63 | func NewSpider() *Spider { 64 | // 磁盘处理只需一个goroutine,网络可以适当多几个goroutine,在未达到到带宽限制的情况下有利于抢占网络资源, 65 | // 磁盘IO阻塞后也可以保证有goroutine正在下载资源保存到内存中,另外正则匹配需要开多个goroutine处理 66 | runtime.GOMAXPROCS(runtime.NumCPU()) 67 | 68 | var sp Spider 69 | sp.conf = NewConfig() 70 | 
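// A minimal sketch (new code, not the original spider) of the concurrency pattern set up just
// below in NewSpider: a buffered channel acts as a counting semaphore so at most a fixed number
// of downloads run at once (chHttp has capacity 5 here), a sync.WaitGroup waits for workers to
// finish, and a short sleep between dispatches keeps the crawl polite (IntervalTime in
// spider.conf). fetch is a hypothetical stand-in for Spider.do.
package main

import (
	"fmt"
	"sync"
	"time"
)

func crawl(urls []string, maxParallel int, interval time.Duration, fetch func(string)) {
	sem := make(chan bool, maxParallel) // counting semaphore, like chHttp above
	var wg sync.WaitGroup

	for _, u := range urls {
		sem <- true // blocks while maxParallel downloads are already running
		wg.Add(1)
		go func(url string) {
			defer func() { <-sem; wg.Done() }()
			fetch(url)
		}(u)
		time.Sleep(interval) // crawl slowly to avoid getting the IP banned
	}
	wg.Wait()
}

func main() {
	urls := []string{"http://example.com/a", "http://example.com/b", "http://example.com/c"}
	crawl(urls, 2, 10*time.Millisecond, func(u string) { fmt.Println("fetched", u) })
}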
sp.filterExts = make(map[string]bool) 71 | sp.limitedToThis = make([]string, 0) 72 | sp.doneUrls = make(map[string]bool) 73 | sp.exceptionUrls = make(map[string]bool) 74 | sp.chUrlsInfo = make(chan *UrlInfo, 100) 75 | sp.chUrl = make(chan string, 1000000) 76 | sp.chHttp = make(chan bool, 5) 77 | sp.chStopIO = make(chan bool) 78 | sp.chExit = make(chan bool) 79 | sp.pageNum = 0 80 | intervalTime, err := time.ParseDuration(sp.conf.IntervalTime) 81 | if err != nil { 82 | sp.intervalTime = 500 * time.Millisecond 83 | } else { 84 | sp.intervalTime = intervalTime 85 | } 86 | 87 | sp.readDownUrlData() 88 | sp.readEntry() 89 | sp.readUrlQueue() 90 | sp.readFilterExt() 91 | sp.readLimitedToThis() 92 | 93 | return &sp 94 | } 95 | 96 | // 读取以前爬过的网页数据 97 | func (this *Spider) readDownUrlData() { 98 | // 创建父目录 99 | err := os.MkdirAll(filepath.Dir(this.conf.DownUrlDataPath), os.ModePerm) 100 | if err != nil { 101 | log.Fatal(err, "\r\n") 102 | } 103 | 104 | // 打开数据库 105 | db, err := sql.Open("sqlite3", this.conf.DownUrlDataPath) 106 | if err != nil { 107 | log.Fatal(err, "\r\n") 108 | } 109 | defer db.Close() 110 | 111 | // 先建表,若表已经存在则建表失败 112 | db.Exec("create table data(md5 varchar(32), url varchar(256), path varchar(256))") 113 | 114 | // 读取数据 115 | rows, err := db.Query("select * from data") 116 | if err != nil { 117 | log.Fatal(err, "\r\n") 118 | } 119 | defer rows.Close() 120 | 121 | var md5, url, path string 122 | for rows.Next() { 123 | rows.Scan(&md5, &url, &path) 124 | this.doneUrls[md5] = true 125 | } 126 | } 127 | 128 | // 读取入口地址 129 | func (this *Spider) readEntry() { 130 | file, err := os.Open(this.conf.EntryPath) 131 | if err != nil { 132 | log.Fatal(err, "\r\n") 133 | } 134 | defer file.Close() 135 | 136 | re := bufio.NewReader(file) 137 | for { 138 | urlbyte, _, err := re.ReadLine() 139 | if err != nil { 140 | break 141 | } 142 | if string(urlbyte) != "" && !this.doneUrls[getMd5FromUrl(string(urlbyte))] { 143 | this.chUrl <- string(urlbyte) 144 | } 145 | } 146 | } 147 | 148 | // 读取爬虫上次爬行过程中Url队列 149 | func (this *Spider) readUrlQueue() { 150 | // 打开数据库 151 | db, err := sql.Open("sqlite3", this.conf.UrlQueuePath) 152 | if err != nil { 153 | log.Fatal(err, "\r\n") 154 | } 155 | defer db.Close() 156 | 157 | // 先建表,若表已经存在则建表失败 158 | db.Exec("create table data(md5 varchar(32), url varchar(256))") 159 | 160 | // 读取数据 161 | rows, err := db.Query("select * from data") 162 | if err != nil { 163 | log.Fatal(err, "\r\n") 164 | } 165 | defer rows.Close() 166 | 167 | var md5, url string 168 | for rows.Next() { 169 | rows.Scan(&md5, &url) 170 | if !this.doneUrls[md5] { 171 | this.chUrl <- url 172 | this.doneUrls[md5] = true 173 | } 174 | } 175 | } 176 | 177 | // 读取过滤的扩展名 178 | func (this *Spider) readFilterExt() { 179 | file, err := os.OpenFile(this.conf.FilterExtPath, os.O_RDWR|os.O_CREATE|os.O_APPEND, os.ModePerm) 180 | if err != nil { 181 | log.Printf("%v\r\n", err) 182 | return 183 | } 184 | defer file.Close() 185 | 186 | re := bufio.NewReader(file) 187 | for { 188 | extbyte, _, err := re.ReadLine() 189 | if err != nil { 190 | break 191 | } 192 | if string(extbyte) != "" { 193 | this.filterExts[string(extbyte)] = true 194 | } 195 | } 196 | } 197 | 198 | // 读取指定抓取的url信息 199 | func (this *Spider) readLimitedToThis() { 200 | file, err := os.OpenFile(this.conf.LimitedToThisPath, os.O_RDWR|os.O_CREATE|os.O_APPEND, os.ModePerm) 201 | if err != nil { 202 | log.Printf("%v\r\n", err) 203 | return 204 | } 205 | defer file.Close() 206 | 207 | re := bufio.NewReader(file) 208 | for { 209 | limbyte, _, err := 
re.ReadLine() 210 | if err != nil { 211 | break 212 | } 213 | if string(limbyte) != "" { 214 | this.limitedToThis = append(this.limitedToThis, string(limbyte)) 215 | } 216 | } 217 | } 218 | 219 | // 将这次爬下的UrlInfo写入到数据库中 220 | func (this *Spider) writeUrlInfo() { 221 | // IO结束后运行主线程退出 222 | defer func() { 223 | this.chExit <- true 224 | }() 225 | 226 | // md5, url, path 写入 this.conf.DownUrlDataPath数据库中 227 | // 打开数据库 228 | db, err := sql.Open("sqlite3", this.conf.DownUrlDataPath) 229 | if err != nil { 230 | log.Fatal(err, "\r\n") 231 | } 232 | defer db.Close() 233 | // 先建表,若表已经存在则建表失败 234 | db.Exec("create table data(md5 varchar(32), url varchar(256), path varchar(256))") 235 | 236 | // content 写入 this.conf.DownWebpagePath/xxx.html 237 | // 创建父目录 238 | err = os.MkdirAll(this.conf.DownWebpagePath, os.ModePerm) 239 | if err != nil { 240 | log.Fatal(err, "\r\n") 241 | } 242 | 243 | // 收到 停止IO 命令后不能马上退出,因为还需要等待chUrlsInfo中数据处理完才可 244 | var urlInfo *UrlInfo 245 | var canStop bool = false 246 | for { 247 | select { 248 | case <-this.chStopIO: //收到结束IO通知 249 | canStop = true 250 | if len(this.chUrlsInfo) == 0 { 251 | return 252 | } 253 | case urlInfo = <-this.chUrlsInfo: 254 | fmt.Printf("[%s]---正在写入文件...\n", urlInfo.url) 255 | // 保存网页 256 | ioutil.WriteFile(urlInfo.path, []byte(urlInfo.content), os.ModePerm) 257 | // 将网页信息插入到数据库中,忽略错误 258 | db.Exec("insert into data(md5, url, path) values(?, ?, ?)", urlInfo.md5, urlInfo.url, urlInfo.path) 259 | this.pageNum = atomic.AddInt32(&this.pageNum, 1) 260 | fmt.Printf("[%s]---写入完成.\n", urlInfo.url) 261 | if canStop && len(this.chUrlsInfo) == 0 { 262 | return 263 | } 264 | } 265 | } 266 | } 267 | 268 | // 将url队列写入到数据库中用于下次从这里开始爬行 269 | func (this *Spider) writeUrlQueue(urls []string) { 270 | // 打开数据库 271 | db, err := sql.Open("sqlite3", this.conf.UrlQueuePath) 272 | if err != nil { 273 | log.Printf("%v\r\n", err) 274 | return 275 | } 276 | defer db.Close() 277 | 278 | tx, err := db.Begin() // 启动一个事务 279 | if err != nil { 280 | log.Printf("%v\r\n", err) 281 | } else { 282 | for _, vv := range urls { 283 | vv = trimUrl(vv) 284 | md5 := getMd5FromUrl(vv) 285 | if !this.doneUrls[md5] && !this.exceptionUrls[md5] && !this.beFiltered(vv) { 286 | _, err = tx.Exec("insert into data(md5, url) values(?, ?)", md5, vv) 287 | if err != nil { 288 | log.Printf("%v\r\n", err) 289 | break 290 | } 291 | } 292 | if len(urls) == 0 { 293 | break 294 | } 295 | } 296 | } 297 | tx.Commit() // 提交事务 298 | } 299 | 300 | // 判断爬取的网页数已经达到预期 301 | func (this *Spider) isFinished() bool { 302 | // 这里的atomic并不能保证该函数同时只能被一个goroutine调用 303 | if atomic.LoadInt32(&this.pageNum) >= this.conf.MaxNum { 304 | log.Printf("%v\r\n", "爬取的网页数已达预期!!!") 305 | return true 306 | } 307 | return false 308 | } 309 | 310 | // 主线程,开始爬取 311 | func (this *Spider) Fetch() { 312 | if len(this.chUrl) == 0 { 313 | log.Fatal("entry url is empty.\r\n") 314 | return 315 | } 316 | 317 | go this.writeUrlInfo() 318 | this.work() 319 | 320 | this.chStopIO <- true //通知结束writeUrlInfo() 321 | <-this.chExit // 等待writeUrlInfo结束 322 | } 323 | 324 | // 工作线程 325 | func (this *Spider) work() { 326 | for url := range this.chUrl { 327 | this.chHttp <- true //控制下载网页的goroutine数量 328 | 329 | go func(url string) { 330 | this.wg.Add(1) 331 | log.Printf("%v\r\n", "线程下载开始") 332 | defer func() { 333 | <-this.chHttp 334 | this.wg.Done() 335 | log.Printf("%v\r\n", "线程下载完成") 336 | }() 337 | this.do(url) 338 | }(url) 339 | 340 | log.Printf("len(chUrlsInfo)==%d --- len(chUrl)==%d --- len(chHttp)==%d\r\n", len(this.chUrlsInfo), len(this.chUrl), 
len(this.chHttp)) 341 | time.Sleep(this.intervalTime) // 慢点爬,怕被网站封IP 342 | 343 | if this.isFinished() { 344 | log.Printf("%v\r\n", "正在等待各线程结束......") 345 | this.wg.Wait() 346 | log.Printf("%v\r\n", "各线程已经结束!!!") 347 | if len(this.chUrl) == 0 { 348 | break 349 | } 350 | 351 | // 保存this.chUrl中剩余的urls 352 | urls := make([]string, 0) 353 | for v := range this.chUrl { 354 | urls = append(urls, v) 355 | if len(this.chUrl) == 0 { 356 | break 357 | } 358 | } 359 | this.writeUrlQueue(urls) 360 | break 361 | } 362 | } 363 | } 364 | 365 | // 处理url 366 | func (this *Spider) do(url string) { 367 | client := &http.Client{ 368 | CheckRedirect: doRedirect, 369 | } 370 | // 若url重定向,则client.Get(url)里面调用自定义的doRedirect函数处理, 371 | // 然后将doRedirect的error返回给这里 372 | resp, err := client.Get(url) 373 | if err != nil { 374 | log.Printf("%s\r\n", err) 375 | this.exceptionUrls[getMd5FromUrl(url)] = true 376 | return 377 | } 378 | defer resp.Body.Close() 379 | 380 | // 不为OK就返回,因为有些可能是500等错误 381 | if resp.StatusCode != http.StatusOK { 382 | log.Printf("[%s] resp.StatusCode == [%d]\r\n", url, resp.StatusCode) 383 | this.exceptionUrls[getMd5FromUrl(url)] = true 384 | return 385 | } 386 | 387 | fmt.Printf("[%s]---正在下载...\n", url) 388 | content, err := ioutil.ReadAll(resp.Body) 389 | if err != nil { 390 | log.Printf("%s\r\n", err) 391 | this.exceptionUrls[getMd5FromUrl(url)] = true 392 | fmt.Printf("[%s]---异常,结束下载.\n", url) 393 | return 394 | } 395 | fmt.Printf("[%s]---下载完成.\n", url) 396 | 397 | // 保存UrlInfo 398 | md5 := getMd5FromUrl(url) 399 | path := this.conf.DownWebpagePath + md5 + ".html" 400 | this.chUrlsInfo <- &UrlInfo{md5: md5, url: url, path: path, content: string(content)} 401 | 402 | fmt.Printf("[%s]---正在分析...\n", url) 403 | // 得到新的url 404 | urls := getURLs(content) 405 | for i, v := range urls { 406 | // 已经完成任务,将url队列写入到数据库中用于下次从这里开始爬行 407 | if this.isFinished() { 408 | this.writeUrlQueue(urls[i:]) 409 | break 410 | } 411 | 412 | // 还未到达预期,继续爬取 413 | v = trimUrl(v) 414 | md5 := getMd5FromUrl(v) 415 | if !this.doneUrls[md5] && !this.exceptionUrls[md5] { 416 | if this.beFiltered(v) { 417 | this.exceptionUrls[md5] = true 418 | } else { 419 | this.chUrl <- v 420 | this.doneUrls[md5] = true 421 | } 422 | } 423 | } 424 | fmt.Printf("[%s]---分析完成.\n", url) 425 | } 426 | 427 | // 过滤 428 | func (this *Spider) beFiltered(url string) bool { 429 | b1 := this.filterExts[filepath.Ext(url)] // 后缀过滤 430 | b2 := true 431 | // 若limitedToThis不为空,且limitedToThis中的一个串是url的一个子串,就不过滤,否则过滤掉url 432 | if len(this.limitedToThis) > 0 { 433 | for _, v := range this.limitedToThis { 434 | if strings.Contains(url, v) { 435 | b2 = false 436 | break 437 | } 438 | } 439 | } else { 440 | b2 = false 441 | } 442 | 443 | return b1 || b2 444 | } 445 | 446 | // 重定向处理, StatusCode == 302 447 | func doRedirect(req *http.Request, via []*http.Request) error { 448 | return errors.New(req.URL.String() + " was as an exception url to do.") 449 | } 450 | 451 | // 初始化 452 | func init() { 453 | setLogOutput() 454 | } 455 | 456 | // 读取配置文件 457 | func NewConfig() *Config { 458 | file, err := ioutil.ReadFile("./spider.conf") 459 | if err != nil { 460 | log.Fatal(err, "\r\n") 461 | } 462 | 463 | var conf Config 464 | err = json.Unmarshal(file, &conf) 465 | if err != nil { 466 | log.Fatal(err, "\r\n") 467 | } 468 | return &conf 469 | } 470 | 471 | // 设置log输出 472 | func setLogOutput() { 473 | // 为log添加短文件名,方便查看行数 474 | log.SetFlags(log.Lshortfile | log.LstdFlags) 475 | logfile, err := os.OpenFile("./spider.log", os.O_RDWR|os.O_CREATE|os.O_APPEND, os.ModePerm) 476 | // 
注意这里不能关闭logfile 477 | if err != nil { 478 | log.Printf("%v\r\n", err) 479 | } 480 | log.SetOutput(logfile) 481 | } 482 | 483 | // 修剪url, 把 # 后面的字符去掉 484 | func trimUrl(url string) string { 485 | p := strings.Index(url, "#") 486 | if p != -1 { 487 | url = url[:p] 488 | } 489 | return url 490 | } 491 | 492 | // 修剪之后的url,另外去掉最后的斜杠和scheme 如 md5(www.baidu.com) 493 | func getMd5FromUrl(url string) string { 494 | url = strings.TrimRight(url, "/") 495 | url = strings.TrimPrefix(url, "http://") 496 | url = strings.TrimPrefix(url, "https://") 497 | m := md5.New() 498 | io.WriteString(m, url) 499 | str := fmt.Sprintf("%x", m.Sum(nil)) // 将md5值格式化成字符串 500 | return str 501 | } 502 | 503 | // 从html页面中提取所有的url 504 | func getURLs(content []byte) (urls []string) { 505 | re := regexp.MustCompile("href\\s*=\\s*['\"]?\\s*(https?://[^'\"\\s]+)\\s*['\"]?") 506 | allsubmatch := re.FindAllSubmatch([]byte(content), -1) 507 | for _, v2 := range allsubmatch { 508 | for k, v := range v2 { 509 | // k == 0 是表示匹配的全部元素 510 | if k > 0 { 511 | urls = append(urls, string(v)) 512 | } 513 | } 514 | } 515 | return urls 516 | } 517 | 518 | func main() { 519 | log.Printf("%v\r\n", "start......") 520 | start := time.Now() 521 | sp := NewSpider() 522 | sp.Fetch() 523 | log.Printf("used time: %v", time.Since(start)) 524 | } 525 | --------------------------------------------------------------------------------
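// A standalone sketch (new code) of the URL handling that spider_main.go ends with: hrefs are
// pulled out of the HTML with a regular expression, the fragment after '#' is trimmed, and the
// deduplication key is the md5 of the url with its scheme and trailing slash removed, so
// http://example.com/ and https://example.com map to the same key.
package main

import (
	"crypto/md5"
	"fmt"
	"regexp"
	"strings"
)

var hrefRe = regexp.MustCompile(`href\s*=\s*['"]?\s*(https?://[^'"\s]+)\s*['"]?`)

func extractURLs(html string) []string {
	var urls []string
	for _, m := range hrefRe.FindAllStringSubmatch(html, -1) {
		urls = append(urls, m[1]) // m[0] is the whole match, m[1] the captured url
	}
	return urls
}

func urlKey(url string) string {
	if p := strings.Index(url, "#"); p != -1 {
		url = url[:p] // drop the fragment, like trimUrl above
	}
	url = strings.TrimRight(url, "/")
	url = strings.TrimPrefix(url, "http://")
	url = strings.TrimPrefix(url, "https://")
	return fmt.Sprintf("%x", md5.Sum([]byte(url)))
}

func main() {
	page := `<a href="http://example.com/a#top">a</a> <a href='https://example.com/b'>b</a>`
	for _, u := range extractURLs(page) {
		fmt.Println(u, urlKey(u))
	}
}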