├── .gitignore ├── testfiles ├── sha1.txt └── go-logo-blue.svg ├── .github └── workflows │ └── tests.yml ├── LICENSE ├── hscan_test.go ├── README.md └── hscan.go /.gitignore: -------------------------------------------------------------------------------- 1 | hscan 2 | hscan.exe 3 | *.csv 4 | *.db 5 | *.db-journal 6 | -------------------------------------------------------------------------------- /testfiles/sha1.txt: -------------------------------------------------------------------------------- 1 | 64725786589f263f0ecc1da55c2bcac7eb18e681 2 | 0741e65ae292d5a68c7c167f04d0538254da8e8b 3 | 274FE3DC04269ECB6B5E2A3B659779B8DF4BBF07 4 | 5 | 12d81f50767d4e09aa7877da077ad9d1b915d75b 6 | 894b7cbc31d7647667b11eb9efe0526d55252711zzz 7 | 8 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Go-tests 2 | 3 | on: push 4 | 5 | jobs: 6 | go-test: 7 | name: Test app 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout code 11 | uses: actions/checkout@master 12 | - name: Install deps1 13 | run: go get github.com/gabriel-vasile/mimetype 14 | - name: Install deps2 15 | run: go get github.com/gammazero/workerpool 16 | - name: Install deps3 17 | run: go get github.com/saracen/walker 18 | - name: Install deps4 19 | run: go get github.com/mattn/go-sqlite3 20 | - name: Test 21 | run: go test 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 JeffProd 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or 
sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /testfiles/go-logo-blue.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hscan_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import "encoding/hex" 4 | import "testing" 5 | 6 | func TestSha1sum(t *testing.T) { 7 | s := sha1sum("testfiles/go-logo-blue.svg") 8 | got := hex.EncodeToString(s[:]) 9 | want := "274fe3dc04269ecb6b5e2a3b659779b8df4bbf07" 10 | if got != want { 11 | t.Errorf("got %q want %q", got, want) 12 | } 13 | } 14 | 15 | func TestGetMimeType(t *testing.T) { 16 | got, err := getMimeType("nope") 17 | want := "" 18 | if got != want { 19 | t.Errorf("got %q want %q", got, want) 20 | } 21 | if err == nil { 22 | t.Errorf("err should be : ERROR reading file nope") 23 | } 24 | 25 | got, err = getMimeType("testfiles/go-logo-blue.svg") 26 | want = "image/svg+xml" 27 | if err != nil { 28 | t.Errorf("err %v", err) 29 | } 30 | if got != want { 31 | t.Errorf("got %q want %q", got, want) 32 | } 33 | 34 | got, err = 
getMimeType("testfiles/sha1.txt") 35 | want = "text/plain; charset=utf-8" 36 | if err != nil { 37 | t.Errorf("err %v", err) 38 | } 39 | if got != want { 40 | t.Errorf("got %q want %q", got, want) 41 | } 42 | 43 | got, err = getMimeType("/etc/shadow") 44 | want = "" 45 | if got != want { 46 | t.Errorf("got %q want %q", got, want) 47 | } 48 | } 49 | 50 | func TestDirExists(t *testing.T) { 51 | got := dirExists("nope") 52 | want := false 53 | if got != want { 54 | t.Errorf("got %t want %t", got, want) 55 | } 56 | 57 | got = dirExists("testfiles/go-logo-blue.svg") 58 | want = false 59 | if got != want { 60 | t.Errorf("got %t want %t", got, want) 61 | } 62 | 63 | got = dirExists("testfiles/") 64 | want = true 65 | if got != want { 66 | t.Errorf("got %t want %t", got, want) 67 | } 68 | } 69 | 70 | func TestLoadChecksumFile(t *testing.T) { 71 | // sha1 72 | loadChecksumFile("testfiles/sha1.txt", 3) 73 | got := arrSha1 74 | if len(got) != 5 { 75 | t.Errorf("got length %d instead of 4", len(got)) 76 | } 77 | if got["64725786589f263f0ecc1da55c2bcac7eb18e681"] != 3 { 78 | t.Errorf("got %q want %q", got, "64725786589f263f0ecc1da55c2bcac7eb18e681") 79 | } 80 | if got["0741e65ae292d5a68c7c167f04d0538254da8e8b"] != 3 { 81 | t.Errorf("got %q want %q", got, "0741e65ae292d5a68c7c167f04d0538254da8e8b") 82 | } 83 | if got["274fe3dc04269ecb6b5e2a3b659779b8df4bbf07"] != 3 { 84 | t.Errorf("got %q want %q", got, "274fe3dc04269ecb6b5e2a3b659779b8df4bbf07") 85 | } 86 | if got["12d81f50767d4e09aa7877da077ad9d1b915d75b"] != 3 { 87 | t.Errorf("got %q want %q", got, "12d81f50767d4e09aa7877da077ad9d1b915d75b") 88 | } 89 | if got["894b7cbc31d7647667b11eb9efe0526d55252711"] != 3 { 90 | t.Errorf("got %q want %q", got, "894b7cbc31d7647667b11eb9efe0526d55252711") 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HSCAN 2 | 3 | Scans recursively a path to 
match given sha1 checksums. 4 | Useful to find duplicate files, or to find relevant/irrelevant/unknown files. 5 | 6 | ## USAGE 7 | 8 | ```bash 9 | hscan -d -db 10 | -d string 11 | Directory to scan recursively 12 | -db string 13 | Directory containing text files with sha1 to search (1 checksum by line) 14 | ``` 15 | 16 | ## EXAMPLE 17 | 18 | You have the file `dbpath/sha1.txt` : 19 | 20 | ``` 21 | fed5cdfb1c9b121ea6d042dd54842407df3b4a6b 22 | 64725786589f263f0ecc1da55c2bcac7eb18e681 23 | 12d81f50767d4e09aa7877da077ad9d1b915d75b 24 | ``` 25 | 26 | Searching for files having those checksums in the directory `test/` : 27 | 28 | ```bash 29 | hscan -d test -db dbpath 30 | 31 | # result : 32 | Loading database file "dbpath/sha1.txt"... 3 uniq checksum found in "46.975µs" 33 | 34 | Scanning path "test"... 35 | 1964 files - 0 unreadable files - 492 dirs - 0 unreadable dirs - 3 matches 36 | 37 | RESULT 38 | sha1.txt : 3 matches 39 | Total : 3 matches 40 | 41 | Done in 292.09673ms 42 | ``` 43 | 44 | Matching files, unknown files, and errors are written in real time into `result.csv` : 45 | 46 | ```csv 47 | # sha1,dbfile,filename,error 48 | dff8a1731f59ccad056b346102d1e1d014b843f3,nsrl_uniq.txt,/home/jeff/tmp/.vscode/settings.json, 49 | 0841f15b7436126cb2877b094d632dbc2707eda0,,/home/jeff/tmp/img_20190502_175115.jpg, 50 | 98fb7452234c1d7666a54a53eb7340e501d8c173,sha1test.txt,/home/jeff/tmp/602352874.jpg, 51 | ,,/home/jeff/tmp/mysqltmp/undo_001,open /home/jeff/tmp/mysqltmp/undo_001: permission denied 52 | ``` 53 | 54 | A SQLite3 database named `result.db` with the same data as the CSV is created at the end of the process. 
55 | 56 | ## INSTALL 57 | 58 | Get the [latest release](https://github.com/Tazeg/hscan/releases) or download and install from source : 59 | 60 | ```bash 61 | git config --global --add url."git@github.com:".insteadOf "https://github.com/" 62 | go get github.com/Tazeg/hscan 63 | cd ~/go/src/github.com/Tazeg/hscan 64 | 65 | # Linux 66 | env GOOS=linux GOARCH=amd64 go build hscan.go 67 | 68 | # Windows 69 | env GOOS=windows GOARCH=amd64 go build -o hscan.exe hscan.go 70 | 71 | # Raspberry Pi 72 | env GOARM=7 GOARCH=arm go build hscan.go 73 | 74 | go install 75 | ``` 76 | 77 | ## TEST 78 | 79 | ```bash 80 | go test 81 | ``` 82 | 83 | ## BENCHMARKS 84 | 85 | Tried on : 86 | 87 | - OS : Linux 88 | - HDD : 128 Gb SSD + 2 Tb HDD 89 | - CPU: Intel(R) Xeon(R) CPU E5-1660 v3 @ 3.00GHz 90 | - Memory: 32 Gb 91 | 92 | Loading a NIST/NSRL file of 1,2Gb containing 29,459,433 took 22.14s. 93 | Scanning 2Tb and 128 Gb of data took 1h32m34s. This depends on the data stored and the free space on the drive. Further tests will be done shortly. 94 | 95 | ```bash 96 | $> hscan -d / -db bases_hash/ 97 | Loading database file "bases_hash/nsrl_sha1_uniq.txt"... 29459433 uniq checksum found in "22.146464941s" 98 | 99 | Scanning path "/"... 
100 | 2012574 files - 12091 unreadable files - 274715 dirs - 2510 unreadable dirs - 287870 matches 101 | 102 | RESULT 103 | nsrl_sha1_uniq.txt : 287870 matches 104 | Total : 287870 matches 105 | 106 | Done in 1h32m34.505006098s 107 | ``` 108 | -------------------------------------------------------------------------------- /hscan.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | _ "github.com/mattn/go-sqlite3" 5 | "bufio" 6 | "crypto/sha1" 7 | "database/sql" 8 | "flag" 9 | "fmt" 10 | "github.com/gabriel-vasile/mimetype" 11 | "github.com/gammazero/workerpool" 12 | "github.com/saracen/walker" 13 | "io/ioutil" 14 | "os" 15 | "path" 16 | "strings" 17 | "sync" 18 | "time" 19 | ) 20 | 21 | //----------------------------------------------------------------------------- 22 | // global vars 23 | //----------------------------------------------------------------------------- 24 | 25 | // path to scan 26 | var argDir = flag.String("d", "", "Directory to scan recursively") 27 | var argDbSha1 = flag.String("db", "", "Directory containing text files with sha1 to search (1 checksum by line)") 28 | 29 | // stats 30 | var nbFiles = 0 31 | var nbDirs = 0 32 | var nbUnreadableDir = 0 33 | var nbUnreadableFile = 0 34 | var nbSha1Match = map[uint8]int{} // nbSha1Match[filename_index] = count matches 35 | var nbTotalSha1Match = 0 36 | 37 | // maps of relevant sha1 to look for 38 | var arrSha1 = map[string]uint8{} // arrSha1[strSha1] = db index filename 39 | 40 | // output csv file 41 | var csvFile *os.File 42 | // limited worker pool to calculate hash files to avoid "too many open files" 43 | var wp = workerpool.New(5) 44 | 45 | // const 46 | const strVersion = "1.1.0" 47 | 48 | // hash databases file names, i.e. 
checksumFilenames[0]="nsrl.txt" 49 | var checksumFilenames []string 50 | 51 | // concurrency to update map 52 | var l = sync.Mutex{} 53 | 54 | // sqlite3 55 | var stmt *sql.Stmt 56 | 57 | 58 | //----------------------------------------------------------------------------- 59 | // main 60 | //----------------------------------------------------------------------------- 61 | 62 | func main() { 63 | checkArgs() 64 | var err error 65 | 66 | // walk function called for every path found, see https://golang.org/pkg/os/#FileInfo 67 | walkFn := func(path string, info os.FileInfo) error { 68 | if info.IsDir() { 69 | nbDirs++ 70 | return nil 71 | } 72 | 73 | // skip symbolic links and 0 size files (i.e. /dev/dri/card0) 74 | if !(info.Mode() & os.ModeSymlink == os.ModeSymlink) && info.Size() > 0 { 75 | // fmt.Printf("path:%q name:%q size:%d\n", path, info.Name(), info.Size()) 76 | 77 | wp.Submit(func() { 78 | // arbitraty skip files > 238,41 Mb (250000000 b) 79 | if info.Size() > 250000000 { 80 | writeResult("", "", path, "skip file size > 238 Mb") 81 | return 82 | } 83 | workerPoolAction(path) 84 | }) 85 | 86 | nbFiles++ 87 | showInfos() 88 | } 89 | return nil 90 | } 91 | 92 | // error function called for every error encountered 93 | errorCallbackOption := walker.WithErrorCallback(func(path string, err error) error { 94 | writeResult("", "", path, fmt.Sprintf("%v", err)) 95 | nbUnreadableDir++ 96 | return nil 97 | }) 98 | 99 | // create csv file 100 | csvFile, err = os.OpenFile("result.csv", os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0600) 101 | if err != nil { 102 | panic(err) 103 | } 104 | defer csvFile.Close() 105 | 106 | // create db 107 | var db *sql.DB 108 | db, err = sql.Open("sqlite3", "./result.db") 109 | if err != nil { 110 | panic(err) 111 | } 112 | defer db.Close() 113 | dbExec(db, `CREATE TABLE IF NOT EXISTS "files" ( 114 | "sha1" TEXT, 115 | "dbfile" TEXT NOT NULL, 116 | "filename" TEXT NOT NULL, 117 | "error" TEXT 118 | )`) 119 | dbExec(db, "DELETE FROM files") 120 | 
var tx *sql.Tx 121 | tx, err = db.Begin() 122 | if err != nil { 123 | panic(err) 124 | } 125 | defer tx.Commit() 126 | stmt, err = tx.Prepare("INSERT INTO files VALUES (?,?,?,?)") 127 | if err != nil { 128 | panic(err) 129 | } 130 | defer stmt.Close() 131 | 132 | // load sha1 files 133 | loadDbFiles(*argDbSha1) 134 | 135 | // start scan 136 | startTime := time.Now() 137 | fmt.Println() 138 | fmt.Println("Scanning path \"" + *argDir + "\"...") 139 | walker.Walk(*argDir, walkFn, errorCallbackOption) 140 | 141 | wp.StopWait() 142 | endTime := time.Now() 143 | showInfos() // last refresh 144 | fmt.Println() 145 | fmt.Println() 146 | fmt.Println("RESULT") 147 | for k, v := range nbSha1Match { 148 | fmt.Printf(" %-40s : %d matches\n", checksumFilenames[k], v) 149 | } 150 | fmt.Printf(" %-40s : %d matches\n", "Total", nbTotalSha1Match) 151 | fmt.Println() 152 | fmt.Println("Done in", endTime.Sub(startTime)) 153 | } 154 | 155 | func checkArgs() { 156 | flag.Parse() 157 | if *argDir == "" { 158 | showUsage() 159 | os.Exit(0) 160 | } 161 | if *argDbSha1 == "" { 162 | showUsage() 163 | os.Exit(0) 164 | } 165 | if *argDbSha1 != "" && !dirExists(*argDbSha1) { 166 | fmt.Printf("ERROR loading databases: %q does not exists or is not a directory\n", *argDbSha1) 167 | os.Exit(0) 168 | } 169 | } 170 | 171 | // Returns the mime type of a file 172 | // @param {string} filename, full path of a file, ex: "/home/user/file.txt" 173 | // @returns {string} "text/plain" or "" if unknown 174 | func getMimeType(filename string) (string, error) { 175 | mime, err := mimetype.DetectFile(filename) 176 | if err != nil { 177 | return "", err 178 | } 179 | return mime.String(), nil 180 | } 181 | 182 | func showUsage() { 183 | fmt.Println("NAME") 184 | fmt.Println(" hscan") 185 | fmt.Println(" Look for files recursively matching a list of checksums (sha-1 20 bytes base 16)") 186 | fmt.Println() 187 | fmt.Println("VERSION") 188 | fmt.Printf(" v%s\n", strVersion) 189 | fmt.Println() 190 | 
fmt.Println("USAGE") 191 | fmt.Println(" hscan -d -db ") 192 | flag.PrintDefaults() 193 | fmt.Println(" Results are written in log files in current directory") 194 | fmt.Println() 195 | fmt.Println("EXAMPLES") 196 | fmt.Println(" hscan -db /home/user/sha1files/ -d /mnt/dir/") 197 | fmt.Println(" Loads text files containing sha1 checksums from the directory /home/user/sha1files.") 198 | fmt.Println(" Those files must have one checksum per line.") 199 | fmt.Println(" The path /mnt/dir is scanned recursively to look for matches.") 200 | fmt.Println() 201 | fmt.Println("AUTHOR") 202 | fmt.Println(" Written by Twitter:@JeffProd") 203 | fmt.Println() 204 | fmt.Println("LICENCE") 205 | fmt.Println(" MIT License - Copyright (c) 2020 JeffProd.com") 206 | } 207 | 208 | // load txt files from the given path 209 | // each line must contain 1 sha1sum 210 | // @param {string} rootpath "/home/user/dir" or relative path 211 | func loadDbFiles(rootpath string) { 212 | files, err := ioutil.ReadDir(rootpath) 213 | if err != nil { 214 | fmt.Printf("ERROR accessing path %q: %v\n", rootpath, err) 215 | os.Exit(1) 216 | } 217 | cpt := uint8(0) 218 | for _, file := range files { 219 | if file.IsDir() { continue } // skip dirs 220 | strFilename := path.Join(rootpath, file.Name()) 221 | strType, err := getMimeType(strFilename) 222 | if err != nil { 223 | fmt.Println(err) 224 | os.Exit(1) 225 | } 226 | if !strings.HasPrefix(strType, "text/plain") { continue } // skip non txt files 227 | checksumFilenames = append(checksumFilenames, file.Name()) 228 | loadChecksumFile(path.Join(rootpath, file.Name()), cpt) 229 | cpt++ 230 | } 231 | if cpt == 0 { 232 | fmt.Printf("No database text file found in %q\n", rootpath) 233 | os.Exit(0) 234 | } 235 | } // loadDbFiles 236 | 237 | // progress information 238 | func showInfos() { 239 | fmt.Printf("\r %d files - %d unreadable files - %d dirs - %d unreadable dirs - %d matches", nbFiles, nbUnreadableFile, nbDirs, nbUnreadableDir, nbTotalSha1Match) 240 | // 
if range nbSha1Match here: concurrent map iteration and map write, so we use nbTotalSha1Match 241 | } 242 | 243 | // Calculate SHA1 of a file 244 | // filename, ex: "/home/user/file.txt" 245 | func sha1sum(filename string) [20]byte { 246 | data, err := ioutil.ReadFile(filename) 247 | if err != nil { 248 | writeResult("", "", filename, fmt.Sprintf("%v", err)) 249 | nbUnreadableFile++ 250 | return [20]byte{} 251 | } 252 | return sha1.Sum(data) 253 | } 254 | 255 | // dirExists checks if a directory exists 256 | func dirExists(path string) bool { 257 | info, err := os.Stat(path) 258 | if os.IsNotExist(err) { 259 | return false 260 | } 261 | if err != nil { return false } 262 | return info.IsDir() 263 | } 264 | 265 | // loads in memory the sha1 checksums from a txt file 266 | // using a map for fastest check if hash key exists instead of parsing a []string 267 | // @params {string} filename "/home/user/toto.txt" 268 | // @params {uint8} index of the file in checksumFilenames 269 | func loadChecksumFile(filename string, idx uint8) { 270 | startTime := time.Now() 271 | fmt.Printf("Loading database file %q... 
", filename) 272 | 273 | file, err := os.Open(filename) 274 | if err != nil { 275 | fmt.Printf("\nError reading the SHA1 checksums file %q: %v\n", filename, err) 276 | os.Exit(0) 277 | } 278 | defer file.Close() 279 | 280 | l := "" 281 | cpt := 0 282 | previousLen := len(arrSha1) 283 | scanner := bufio.NewScanner(file) 284 | for scanner.Scan() { 285 | l = strings.TrimSpace(scanner.Text()) 286 | if len(l) > 40 { l = l[0:40] } 287 | l = strings.ToLower(l) 288 | if l == "" { continue } 289 | arrSha1[l] = idx 290 | cpt++ 291 | } 292 | 293 | // init hash match count for this db text file 294 | nbSha1Match[idx] = 0 295 | 296 | endTime := time.Now() 297 | fmt.Printf("%d lines, %d uniq checksum found in %q\n", cpt, len(arrSha1) - previousLen, endTime.Sub(startTime)) 298 | } 299 | 300 | func writeResult(sha1 string, dbfile string, filename string, errStr string) { 301 | _, err := stmt.Exec(sha1, dbfile, filename, errStr) 302 | if err != nil { 303 | panic(err) 304 | } 305 | if _, err1 := csvFile.WriteString(sha1 + "," + dbfile + "," + filename + "," + errStr + "\n"); err1 != nil { panic(err1) } 306 | } 307 | 308 | // Action on a file within the worker pool. 309 | // This is a file, not a directory. 310 | // @params {string} filename, i.e. "/home/user/file.txt" 311 | func workerPoolAction(filename string) { 312 | bSha1 := sha1sum(filename) // binary 313 | sSha1 := fmt.Sprintf("%x", bSha1) // string 314 | if sSha1 == "0000000000000000000000000000000000000000" { return } // we had an error already logged and we got empty sha1 315 | 316 | // sha1 exists in arrSha1 ? 
317 | // info : arrSha1[sSha1] is the db txt file name 318 | if _, ok := arrSha1[sSha1]; ok { 319 | // we have a sha1 match 320 | writeResult(sSha1, checksumFilenames[arrSha1[sSha1]], filename, "") 321 | 322 | // can't update a map concurrently otherwhite we get "fatal error: concurrent map writes" 323 | l.Lock() 324 | nbSha1Match[arrSha1[sSha1]]++ 325 | l.Unlock() 326 | 327 | nbTotalSha1Match++ 328 | showInfos() 329 | return 330 | } 331 | 332 | // log also unknown file 333 | writeResult(sSha1, "", filename, "") 334 | } 335 | 336 | func dbExec (db *sql.DB, sql string) { 337 | _, err := db.Exec(sql) 338 | if err != nil { 339 | panic(err) 340 | } 341 | } 342 | --------------------------------------------------------------------------------