├── .gitignore ├── .gitkeep ├── .travis.yml ├── Dockerfile ├── LICENSE ├── README.md ├── dist ├── .gitkeep └── commoncrawler ├── output └── .gitkeep ├── src ├── analyze.go ├── config.go ├── crawl.go ├── extract.go └── scan.go └── wet.paths /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | 10 | # Architecture specific extensions/prefixes 11 | *.[568vq] 12 | [568vq].out 13 | 14 | *.cgo1.go 15 | *.cgo2.c 16 | _cgo_defun.c 17 | _cgo_gotypes.go 18 | _cgo_export.* 19 | 20 | _testmain.go 21 | 22 | *.exe 23 | *.test 24 | *.prof 25 | 26 | output/crawl-data 27 | output/match-data 28 | -------------------------------------------------------------------------------- /.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChrisCates/CommonCrawler/b7a83b2518d8a26830dce9c2afa4a11976daa6c7/.gitkeep -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | 3 | matrix: 4 | include: 5 | - os: linux 6 | go: 1.12.4 7 | - os: osx 8 | go: 1.12.4 9 | - os: windows 10 | go: 1.12.4 11 | 12 | services: 13 | - docker 14 | 15 | before_install: 16 | - go get github.com/logrusorgru/aurora 17 | 18 | script: 19 | # Enable Mac OS Dockerization once Docker is supported 20 | - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then go run src/*.go; fi 21 | # Linux Docker build procedure 22 | - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then docker build -t commoncrawler .; fi 23 | # Enable Windows Dockerization once Docker has better support 24 | - if [[ "$TRAVIS_OS_NAME" == "windows" ]]; then go run src/*.go; fi 25 | -------------------------------------------------------------------------------- /Dockerfile: 
-------------------------------------------------------------------------------- 1 | # Start from golang v1.11 base image 2 | FROM golang 3 | 4 | ENV GO111MODULE=on 5 | 6 | # Maintainer info 7 | LABEL maintainer="Chris Cates , Onuwa Nnachi Isaac " 8 | 9 | # Set current working directory inside the container 10 | WORKDIR /app 11 | 12 | # Copy everything from the source directory to destination directory inside the container 13 | COPY . . 14 | 15 | # Download all dependencies 16 | RUN go get -d -v ./... 17 | 18 | # Install and build the package 19 | RUN go build -i -o ./dist/commoncrawler ./src/*.go 20 | 21 | # Run the binary 22 | CMD ["./dist/commoncrawler"] 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Chris Cates 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Common Crawler 2 | 3 | ## 🕸 A simple and easy way to extract data from Common Crawl with little or no hassle. 4 | 5 | ![Go Version](https://img.shields.io/badge/Go-v1.12.4-blue.svg) 6 | ![License](https://img.shields.io/badge/license-MIT-blue.svg) 7 | [![Build Status](https://travis-ci.org/ChrisCates/CommonCrawler.svg?branch=master)](https://travis-ci.org/ChrisCates/CommonCrawler) 8 | [![Go Report Card](https://goreportcard.com/badge/github.com/ChrisCates/CommonCrawler)](https://goreportcard.com/report/github.com/ChrisCates/CommonCrawler) 9 | 10 | ### Notice in regards to development 11 | 12 | Currently I do not have the capacity to hire full time, however, I do have the intention of hiring someone to help build infrastructure related to CommonCrawl. All Gitcoin bounties are currently on hold. When I do have time to further invest in this project, will discuss full time devops developer to work on said project. All payment will be done in DAI and resource allocation will be approximately 5k/mo. 13 | 14 | ## As a GUI 15 | 16 | An electron based interface that works with a Go server will be available. 
17 | 18 | ## As a library 19 | 20 | Install as a dependency: 21 | 22 | ```bash 23 | go get github.com/ChrisCates/CommonCrawler 24 | ``` 25 | 26 | Access the library functions by `import`ing it: 27 | 28 | ```golang 29 | import( 30 | cc "github.com/ChrisCates/CommonCrawler" 31 | ) 32 | 33 | func main() { 34 | cc.scan() 35 | cc.download() 36 | cc.extract() 37 | // And so forth 38 | } 39 | ``` 40 | 41 | ## As a command line tool 42 | 43 | Install from source: 44 | 45 | ```bash 46 | go install github.com/ChrisCates/CommonCrawler 47 | ``` 48 | 49 | Or you can curl from GitHub: 50 | 51 | ```bash 52 | curl -L https://github.com/ChrisCates/CommonCrawler/raw/master/dist/commoncrawler -o commoncrawler 53 | ``` 54 | 55 | Then run as a binary: 56 | 57 | ```bash 58 | # Output help 59 | commoncrawler --help 60 | 61 | # Specify configuration 62 | commoncrawler --base-uri https://commoncrawl.s3.amazonaws.com/ 63 | commoncrawler --wet-paths wet.paths 64 | commoncrawler --data-folder output/crawl-data 65 | commoncrawler --start 0 66 | commoncrawler --stop 5 # -1 will loop through all wet files from wet.paths 67 | 68 | # Start crawling the web 69 | commoncrawler start --stop -1 70 | ``` 71 | 72 | ## Compilation and Configuration 73 | 74 | ### Installing dependencies 75 | 76 | ```bash 77 | go get github.com/logrusorgru/aurora 78 | ``` 79 | 80 | ### Downloading data with the application 81 | 82 | First configure the type of data you want to extract.
83 | 84 | ```golang 85 | // Config is the preset variables for your extractor 86 | type Config struct { 87 | baseURI string 88 | wetPaths string 89 | dataFolder string 90 | matchFolder string 91 | start int 92 | stop int 93 | } 94 | 95 | //Defaults 96 | Config{ 97 | start: 0, 98 | stop: 1, 99 | baseURI: "https://commoncrawl.s3.amazonaws.com/", 100 | wetPaths: path.Join(cwd, "wet.paths"), 101 | dataFolder: path.Join(cwd, "/output/crawl-data"), 102 | matchFolder: path.Join(cwd, "/output/match-data"), 103 | } 104 | ``` 105 | 106 | ### With Docker 107 | 108 | ```bash 109 | docker build -t commoncrawler . 110 | docker run commoncrawler 111 | ``` 112 | 113 | ### Without Docker 114 | 115 | ```bash 116 | go build -i -o ./dist/commoncrawler ./src/*.go 117 | ./dist/commoncrawler 118 | ``` 119 | 120 | Or you can simply run it. 121 | 122 | ```bash 123 | go run src/*.go 124 | ``` 125 | 126 | ### Resources 127 | 128 | - MIT Licensed 129 | 130 | - If people are interested or need it, I can create a documentation and tutorial page at https://commoncrawl.chriscates.ca 131 | 132 | - You can post issues if they are valid, and I could potentially fund them based on priority.
133 | -------------------------------------------------------------------------------- /dist/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChrisCates/CommonCrawler/b7a83b2518d8a26830dce9c2afa4a11976daa6c7/dist/.gitkeep -------------------------------------------------------------------------------- /dist/commoncrawler: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChrisCates/CommonCrawler/b7a83b2518d8a26830dce9c2afa4a11976daa6c7/dist/commoncrawler -------------------------------------------------------------------------------- /output/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChrisCates/CommonCrawler/b7a83b2518d8a26830dce9c2afa4a11976daa6c7/output/.gitkeep -------------------------------------------------------------------------------- /src/analyze.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "io" 7 | "os" 8 | "strconv" 9 | "strings" 10 | "sync" 11 | 12 | aurora "github.com/logrusorgru/aurora" 13 | ) 14 | 15 | func analyzeFile(filePath string, path string) error { 16 | file, err := os.Open(filePath) 17 | if err != nil { 18 | return err 19 | } 20 | defer file.Close() 21 | 22 | data, err := os.Create(path) 23 | if err != nil { 24 | return err 25 | } 26 | defer data.Close() 27 | 28 | matched := make(chan matchedWarc) 29 | 30 | go analyze("ninja", file, matched) 31 | 32 | for m := range matched { 33 | fmt.Println(aurora.Blue("\t Found " + strconv.Itoa(m.matches) + " matches in this wet file...")) 34 | data.WriteString(fmt.Sprintf("%sMATCHES: %d\n\n", m.warcData, m.matches)) 35 | } 36 | 37 | return nil 38 | } 39 | 40 | type warcRecord struct { 41 | header string 42 | body string 43 | } 44 | 45 | //readWarcRecord reads one warc record from Reader 
46 | // warc-record = header CRLF 47 | // block CRLF CRLF 48 | func readWarcRecord(in *bufio.Reader) (warcRecord, error) { 49 | 50 | var ret warcRecord 51 | 52 | line, err := in.ReadBytes('\n') 53 | if err != nil { 54 | return ret, err 55 | } 56 | 57 | firstLine := string(line) 58 | 59 | //Warc record starts with version e.g. "WARC/1.0" 60 | if firstLine != "WARC/1.0\r\n" { 61 | return ret, fmt.Errorf("warc version expected '%s' found", firstLine) 62 | } 63 | var warcHeaderBuilder strings.Builder 64 | 65 | var contentLength = -1 66 | 67 | //read header till end (\n) 68 | for ; string(line) != "\r\n"; line, err = in.ReadBytes('\n') { 69 | 70 | if err != nil { 71 | return ret, err 72 | } 73 | 74 | //each header must contains Content-Length 75 | //alse named headers are case insensitive 76 | if strings.HasPrefix(strings.ToLower(string(line)), "content-length:") { 77 | 78 | if contentLength > 0 { 79 | return ret, fmt.Errorf("exactly one content-length should be present in a WARC header") 80 | } 81 | 82 | keyAndValue := strings.SplitN(string(line), ":", 2) 83 | if len(keyAndValue) != 2 { 84 | return ret, fmt.Errorf("Content-Length field must contains a value. '%s' found)", line) 85 | } 86 | //field value may be preceded by any amount of linear whitespace 87 | strValue := strings.TrimSpace(keyAndValue[1]) 88 | contentLength, err = strconv.Atoi(strValue) 89 | if err != nil { 90 | return ret, err 91 | } 92 | } 93 | 94 | warcHeaderBuilder.Write(line) 95 | } 96 | 97 | //content length sould be non-negative 98 | if contentLength < 0 { 99 | return ret, fmt.Errorf("exactly one content-length should be present in a WARC header. 
WARC header: %s", warcHeaderBuilder.String()) 100 | } 101 | 102 | //early return if body is empty 103 | if contentLength == 0 { 104 | return warcRecord{warcHeaderBuilder.String(), ""}, nil 105 | } 106 | 107 | //body buffer 108 | body := make([]byte, contentLength) 109 | 110 | n := 0 111 | //put reader date to body buffer 112 | for k, err := in.Read(body); n < contentLength; k, err = in.Read(body[n:]) { 113 | if err != nil && err != io.EOF { 114 | return ret, err 115 | } 116 | if err == io.EOF && (n+k) < contentLength { 117 | return ret, fmt.Errorf("WARC record finished unexpectedly. Content-Length : %d, got %d", contentLength, n) 118 | } 119 | n += k 120 | } 121 | 122 | return warcRecord{warcHeaderBuilder.String(), string(body)}, err 123 | } 124 | 125 | type matchedWarc struct { 126 | matches int 127 | warcData string 128 | } 129 | 130 | func analyze(searchFor string, in io.Reader, matched chan matchedWarc) { 131 | defer close(matched) 132 | bufin := bufio.NewReader(in) 133 | var wg sync.WaitGroup 134 | 135 | for record, err := readWarcRecord(bufin); err == nil; record, err = readWarcRecord(bufin) { 136 | wg.Add(1) 137 | go func(r warcRecord) { 138 | found := strings.Count(r.body, searchFor) 139 | 140 | if found > 0 { 141 | matched <- matchedWarc{found, r.header} 142 | } 143 | wg.Done() 144 | }(record) 145 | bufin.ReadBytes('\n') 146 | bufin.ReadBytes('\n') 147 | } 148 | 149 | wg.Wait() 150 | } 151 | -------------------------------------------------------------------------------- /src/config.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "os" 5 | "path" 6 | ) 7 | 8 | // Config is the preset variables for your extractor 9 | type Config struct { 10 | baseURI string 11 | wetPaths string 12 | dataFolder string 13 | matchFolder string 14 | start int 15 | stop int 16 | } 17 | 18 | func getConfiguration() Config { 19 | cwd, err := os.Getwd() 20 | 21 | if err != nil { 22 | panic(err) 23 | } 24 | 25 
| return Config{ 26 | start: 0, 27 | stop: 1, 28 | baseURI: "https://commoncrawl.s3.amazonaws.com/", 29 | wetPaths: path.Join(cwd, "wet.paths"), 30 | dataFolder: path.Join(cwd, "/output/crawl-data"), 31 | matchFolder: path.Join(cwd, "/output/match-data"), 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/crawl.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | 7 | aurora "github.com/logrusorgru/aurora" 8 | ) 9 | 10 | func main() { 11 | fmt.Println(aurora.Green("Getting configurations for Common Crawl Extractor...")) 12 | config := getConfiguration() 13 | 14 | fmt.Println(aurora.Blue(" Creating folder: " + config.dataFolder)) 15 | os.Mkdir(config.dataFolder, 0740) 16 | fmt.Println(aurora.Blue(" Creating folder: " + config.matchFolder)) 17 | os.Mkdir(config.matchFolder, 0740) 18 | 19 | fmt.Println(aurora.Green("Starting scanning...")) 20 | scan(config) 21 | } 22 | -------------------------------------------------------------------------------- /src/extract.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "compress/gzip" 5 | "fmt" 6 | "io" 7 | "net/http" 8 | "os" 9 | "path/filepath" 10 | ) 11 | 12 | func download(uri string, path string) error { 13 | //check if file exists 14 | if _, err := os.Stat(path); !os.IsNotExist(err) { 15 | return fmt.Errorf("file \"%s\" has already been downloaded", path) 16 | } 17 | //create output file 18 | out, err := os.Create(path) 19 | if err != nil { 20 | return err 21 | } 22 | defer out.Close() 23 | 24 | //make a GET to the specified URL 25 | resp, err := http.Get(uri) 26 | if err != nil { 27 | return err 28 | } 29 | defer resp.Body.Close() 30 | 31 | // check the server response 32 | if resp.StatusCode != http.StatusOK { 33 | return fmt.Errorf("bad status: %s", resp.Status) 34 | } 35 | 36 | //redirect get responce to 
file 37 | _, err = io.Copy(out, resp.Body) 38 | if err != nil { 39 | return err 40 | } 41 | 42 | return nil 43 | } 44 | 45 | func extract(path string) error { 46 | //get extracted file path 47 | _, fname := filepath.Split(path) 48 | ext := filepath.Ext(fname) 49 | extractedPath := path[:len(path)-len(ext)] 50 | //create extruction destination 51 | 52 | out, err := os.Create(extractedPath) 53 | if err != nil { 54 | return err 55 | } 56 | defer out.Close() 57 | 58 | //open gzip file 59 | fi, err := os.Open(path) 60 | if err != nil { 61 | return err 62 | } 63 | defer fi.Close() 64 | //create gz reader 65 | fz, err := gzip.NewReader(fi) 66 | if err != nil { 67 | return err 68 | } 69 | defer fz.Close() 70 | 71 | //write extracted to file 72 | _, err = io.Copy(out, fz) 73 | if err != nil { 74 | return err 75 | } 76 | 77 | return nil 78 | 79 | } 80 | -------------------------------------------------------------------------------- /src/scan.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "os" 7 | "path" 8 | "strconv" 9 | 10 | aurora "github.com/logrusorgru/aurora" 11 | ) 12 | 13 | func scan(config Config) { 14 | paths, err := os.Open(config.wetPaths) 15 | 16 | if err != nil { 17 | panic(err) 18 | } 19 | 20 | scanner := bufio.NewScanner(paths) 21 | index := 0 22 | 23 | for scanner.Scan() { 24 | uri := config.baseURI + scanner.Text() 25 | 26 | if index < config.start { 27 | continue 28 | } else if index >= config.stop { 29 | fmt.Println(aurora.Green("\nFinished scanning, you can review results in the output folders...\n")) 30 | break 31 | } 32 | 33 | index++ 34 | 35 | filePath := path.Join(config.dataFolder, "wetfile_"+strconv.Itoa(index)+".wet.gz") 36 | 37 | fmt.Printf("\n Download uri %s\n\t", uri) 38 | err := download(uri, filePath) 39 | if err != nil { 40 | fmt.Println(aurora.Red(fmt.Sprintf("\n Download was not successful: %s\n\t", err))) 41 | continue 42 | } 43 | 44 | 
fmt.Println(aurora.Green("\n Download was successful extracting:\n\t" + uri)) 45 | 46 | err = extract(filePath) 47 | if err != nil { 48 | fmt.Println(aurora.Red(fmt.Sprintf("\n Exctraction %s err: %s\n\t", filePath, err))) 49 | continue 50 | } 51 | 52 | fmt.Println(aurora.Green("\n Finished extracting:\n\t" + uri)) 53 | 54 | extractedPath := path.Join(config.dataFolder, "wetfile_"+strconv.Itoa(index)+".wet") 55 | scanPath := path.Join(config.matchFolder, "info."+strconv.Itoa(index)+".txt") 56 | 57 | err = analyzeFile(extractedPath, scanPath) 58 | 59 | if err != nil { 60 | fmt.Println(aurora.Red(fmt.Sprintf("\n There was a problem analyzing, make sure to look into this file:\n\t%s\n", extractedPath))) 61 | fmt.Println(aurora.Red(fmt.Sprintf("\t The error is: %s", err))) 62 | continue 63 | } 64 | 65 | fmt.Println(aurora.Green("\n Finished analyzing:\n\t" + extractedPath)) 66 | fmt.Println(aurora.Green(" Wrote results to" + scanPath)) 67 | 68 | } 69 | } 70 | --------------------------------------------------------------------------------