├── .gitignore
├── testfiles
├── sha1.txt
└── go-logo-blue.svg
├── .github
└── workflows
│ └── tests.yml
├── LICENSE
├── hscan_test.go
├── README.md
└── hscan.go
/.gitignore:
--------------------------------------------------------------------------------
1 | hscan
2 | hscan.exe
3 | *.csv
4 | *.db
5 | *.db-journal
6 |
--------------------------------------------------------------------------------
/testfiles/sha1.txt:
--------------------------------------------------------------------------------
1 | 64725786589f263f0ecc1da55c2bcac7eb18e681
2 | 0741e65ae292d5a68c7c167f04d0538254da8e8b
3 | 274FE3DC04269ECB6B5E2A3B659779B8DF4BBF07
4 |
5 | 12d81f50767d4e09aa7877da077ad9d1b915d75b
6 | 894b7cbc31d7647667b11eb9efe0526d55252711zzz
7 |
8 |
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
# Continuous-integration workflow: runs the Go unit tests on every push.
name: Go-tests

on: push

jobs:
  go-test:
    name: Test app
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        # Pinned to a released major version: the "master" ref is a moving
        # target and can change behaviour between two runs.
        uses: actions/checkout@v4
      # The four steps below fetch the third-party packages imported by hscan.go.
      - name: Install deps1
        run: go get github.com/gabriel-vasile/mimetype
      - name: Install deps2
        run: go get github.com/gammazero/workerpool
      - name: Install deps3
        run: go get github.com/saracen/walker
      - name: Install deps4
        run: go get github.com/mattn/go-sqlite3
      - name: Test
        run: go test
22 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 JeffProd
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/testfiles/go-logo-blue.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/hscan_test.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import "encoding/hex"
4 | import "testing"
5 |
6 | func TestSha1sum(t *testing.T) {
7 | s := sha1sum("testfiles/go-logo-blue.svg")
8 | got := hex.EncodeToString(s[:])
9 | want := "274fe3dc04269ecb6b5e2a3b659779b8df4bbf07"
10 | if got != want {
11 | t.Errorf("got %q want %q", got, want)
12 | }
13 | }
14 |
15 | func TestGetMimeType(t *testing.T) {
16 | got, err := getMimeType("nope")
17 | want := ""
18 | if got != want {
19 | t.Errorf("got %q want %q", got, want)
20 | }
21 | if err == nil {
22 | t.Errorf("err should be : ERROR reading file nope")
23 | }
24 |
25 | got, err = getMimeType("testfiles/go-logo-blue.svg")
26 | want = "image/svg+xml"
27 | if err != nil {
28 | t.Errorf("err %v", err)
29 | }
30 | if got != want {
31 | t.Errorf("got %q want %q", got, want)
32 | }
33 |
34 | got, err = getMimeType("testfiles/sha1.txt")
35 | want = "text/plain; charset=utf-8"
36 | if err != nil {
37 | t.Errorf("err %v", err)
38 | }
39 | if got != want {
40 | t.Errorf("got %q want %q", got, want)
41 | }
42 |
43 | got, err = getMimeType("/etc/shadow")
44 | want = ""
45 | if got != want {
46 | t.Errorf("got %q want %q", got, want)
47 | }
48 | }
49 |
50 | func TestDirExists(t *testing.T) {
51 | got := dirExists("nope")
52 | want := false
53 | if got != want {
54 | t.Errorf("got %t want %t", got, want)
55 | }
56 |
57 | got = dirExists("testfiles/go-logo-blue.svg")
58 | want = false
59 | if got != want {
60 | t.Errorf("got %t want %t", got, want)
61 | }
62 |
63 | got = dirExists("testfiles/")
64 | want = true
65 | if got != want {
66 | t.Errorf("got %t want %t", got, want)
67 | }
68 | }
69 |
70 | func TestLoadChecksumFile(t *testing.T) {
71 | // sha1
72 | loadChecksumFile("testfiles/sha1.txt", 3)
73 | got := arrSha1
74 | if len(got) != 5 {
75 | t.Errorf("got length %d instead of 4", len(got))
76 | }
77 | if got["64725786589f263f0ecc1da55c2bcac7eb18e681"] != 3 {
78 | t.Errorf("got %q want %q", got, "64725786589f263f0ecc1da55c2bcac7eb18e681")
79 | }
80 | if got["0741e65ae292d5a68c7c167f04d0538254da8e8b"] != 3 {
81 | t.Errorf("got %q want %q", got, "0741e65ae292d5a68c7c167f04d0538254da8e8b")
82 | }
83 | if got["274fe3dc04269ecb6b5e2a3b659779b8df4bbf07"] != 3 {
84 | t.Errorf("got %q want %q", got, "274fe3dc04269ecb6b5e2a3b659779b8df4bbf07")
85 | }
86 | if got["12d81f50767d4e09aa7877da077ad9d1b915d75b"] != 3 {
87 | t.Errorf("got %q want %q", got, "12d81f50767d4e09aa7877da077ad9d1b915d75b")
88 | }
89 | if got["894b7cbc31d7647667b11eb9efe0526d55252711"] != 3 {
90 | t.Errorf("got %q want %q", got, "894b7cbc31d7647667b11eb9efe0526d55252711")
91 | }
92 | }
93 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # HSCAN
2 |
3 | Recursively scans a path to find files matching given sha1 checksums.
4 | Useful to find duplicate files, or to find relevant/irrelevant/unknown files.
5 |
6 | ## USAGE
7 |
8 | ```bash
9 | hscan -d -db
10 | -d string
11 | Directory to scan recursively
12 | -db string
13 | Directory containing text files with sha1 to search (1 checksum by line)
14 | ```
15 |
16 | ## EXAMPLE
17 |
18 | You have the file `dbpath/sha1.txt` :
19 |
20 | ```
21 | fed5cdfb1c9b121ea6d042dd54842407df3b4a6b
22 | 64725786589f263f0ecc1da55c2bcac7eb18e681
23 | 12d81f50767d4e09aa7877da077ad9d1b915d75b
24 | ```
25 |
26 | Searching for files having those checksums in the directory `tmp/` :
27 |
28 | ```bash
29 | hscan -d tmp -db dbpath
30 |
31 | # result :
32 | Loading database file "dbpath/sha1.txt"... 3 uniq checksum found in "46.975µs"
33 |
34 | Scanning path "tmp"...
35 | 1964 files - 0 unreadable files - 492 dirs - 0 unreadable dirs - 3 matches
36 |
37 | RESULT
38 | sha1.txt : 3 matches
39 | Total : 3 matches
40 |
41 | Done in 292.09673ms
42 | ```
43 |
44 | Matching files, unknown files, and errors are written in real time into `result.csv` :
45 |
46 | ```csv
47 | # sha1,dbfile,filename,error
48 | dff8a1731f59ccad056b346102d1e1d014b843f3,nsrl_uniq.txt,/home/jeff/tmp/.vscode/settings.json,
49 | 0841f15b7436126cb2877b094d632dbc2707eda0,,/home/jeff/tmp/img_20190502_175115.jpg,
50 | 98fb7452234c1d7666a54a53eb7340e501d8c173,sha1test.txt,/home/jeff/tmp/602352874.jpg,
51 | ,,/home/jeff/tmp/mysqltmp/undo_001,open /home/jeff/tmp/mysqltmp/undo_001: permission denied
52 | ```
53 |
54 | A SQLite3 database named `result.db` with the same data as the CSV is created at the end of the process.
55 |
56 | ## INSTALL
57 |
58 | Get the [latest release](https://github.com/Tazeg/hscan/releases) or download and install from source :
59 |
60 | ```bash
61 | git config --global --add url."git@github.com:".insteadOf "https://github.com/"
62 | go get github.com/Tazeg/hscan
63 | cd ~/go/src/github.com/Tazeg/hscan
64 |
65 | # Linux
66 | env GOOS=linux GOARCH=amd64 go build hscan.go
67 |
68 | # Windows
69 | env GOOS=windows GOARCH=amd64 go build -o hscan.exe hscan.go
70 |
71 | # Raspberry Pi
72 | env GOARM=7 GOARCH=arm go build hscan.go
73 |
74 | go install
75 | ```
76 |
77 | ## TEST
78 |
79 | ```bash
80 | go test
81 | ```
82 |
83 | ## BENCHMARKS
84 |
85 | Tried on :
86 |
87 | - OS : Linux
88 | - HDD : 128 Gb SSD + 2 Tb HDD
89 | - CPU: Intel(R) Xeon(R) CPU E5-1660 v3 @ 3.00GHz
90 | - Memory: 32 Gb
91 |
92 | Loading a NIST/NSRL file of 1.2 GB containing 29,459,433 checksums took 22.14s.
93 | Scanning 2Tb and 128 Gb of data took 1h32m34s. This depends on the data stored and the free space on the drive. Further tests will be done shortly.
94 |
95 | ```bash
96 | $> hscan -d / -db bases_hash/
97 | Loading database file "bases_hash/nsrl_sha1_uniq.txt"... 29459433 uniq checksum found in "22.146464941s"
98 |
99 | Scanning path "/"...
100 | 2012574 files - 12091 unreadable files - 274715 dirs - 2510 unreadable dirs - 287870 matches
101 |
102 | RESULT
103 | nsrl_sha1_uniq.txt : 287870 matches
104 | Total : 287870 matches
105 |
106 | Done in 1h32m34.505006098s
107 | ```
108 |
--------------------------------------------------------------------------------
/hscan.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | _ "github.com/mattn/go-sqlite3"
5 | "bufio"
6 | "crypto/sha1"
7 | "database/sql"
8 | "flag"
9 | "fmt"
10 | "github.com/gabriel-vasile/mimetype"
11 | "github.com/gammazero/workerpool"
12 | "github.com/saracen/walker"
13 | "io/ioutil"
14 | "os"
15 | "path"
16 | "strings"
17 | "sync"
18 | "time"
19 | )
20 |
21 | //-----------------------------------------------------------------------------
22 | // global vars
23 | //-----------------------------------------------------------------------------
24 |
25 | // path to scan
26 | var argDir = flag.String("d", "", "Directory to scan recursively")
27 | var argDbSha1 = flag.String("db", "", "Directory containing text files with sha1 to search (1 checksum by line)")
28 |
29 | // stats
30 | var nbFiles = 0
31 | var nbDirs = 0
32 | var nbUnreadableDir = 0
33 | var nbUnreadableFile = 0
34 | var nbSha1Match = map[uint8]int{} // nbSha1Match[filename_index] = count matches
35 | var nbTotalSha1Match = 0
36 |
37 | // maps of relevant sha1 to look for
38 | var arrSha1 = map[string]uint8{} // arrSha1[strSha1] = db index filename
39 |
40 | // output csv file
41 | var csvFile *os.File
42 | // limited worker pool to calculate hash files to avoid "too many open files"
43 | var wp = workerpool.New(5)
44 |
45 | // const
46 | const strVersion = "1.1.0"
47 |
48 | // hash databases file names, i.e. checksumFilenames[0]="nsrl.txt"
49 | var checksumFilenames []string
50 |
51 | // concurrency to update map
52 | var l = sync.Mutex{}
53 |
54 | // sqlite3
55 | var stmt *sql.Stmt
56 |
57 |
58 | //-----------------------------------------------------------------------------
59 | // main
60 | //-----------------------------------------------------------------------------
61 |
62 | func main() {
63 | checkArgs()
64 | var err error
65 |
66 | // walk function called for every path found, see https://golang.org/pkg/os/#FileInfo
67 | walkFn := func(path string, info os.FileInfo) error {
68 | if info.IsDir() {
69 | nbDirs++
70 | return nil
71 | }
72 |
73 | // skip symbolic links and 0 size files (i.e. /dev/dri/card0)
74 | if !(info.Mode() & os.ModeSymlink == os.ModeSymlink) && info.Size() > 0 {
75 | // fmt.Printf("path:%q name:%q size:%d\n", path, info.Name(), info.Size())
76 |
77 | wp.Submit(func() {
78 | // arbitraty skip files > 238,41 Mb (250000000 b)
79 | if info.Size() > 250000000 {
80 | writeResult("", "", path, "skip file size > 238 Mb")
81 | return
82 | }
83 | workerPoolAction(path)
84 | })
85 |
86 | nbFiles++
87 | showInfos()
88 | }
89 | return nil
90 | }
91 |
92 | // error function called for every error encountered
93 | errorCallbackOption := walker.WithErrorCallback(func(path string, err error) error {
94 | writeResult("", "", path, fmt.Sprintf("%v", err))
95 | nbUnreadableDir++
96 | return nil
97 | })
98 |
99 | // create csv file
100 | csvFile, err = os.OpenFile("result.csv", os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0600)
101 | if err != nil {
102 | panic(err)
103 | }
104 | defer csvFile.Close()
105 |
106 | // create db
107 | var db *sql.DB
108 | db, err = sql.Open("sqlite3", "./result.db")
109 | if err != nil {
110 | panic(err)
111 | }
112 | defer db.Close()
113 | dbExec(db, `CREATE TABLE IF NOT EXISTS "files" (
114 | "sha1" TEXT,
115 | "dbfile" TEXT NOT NULL,
116 | "filename" TEXT NOT NULL,
117 | "error" TEXT
118 | )`)
119 | dbExec(db, "DELETE FROM files")
120 | var tx *sql.Tx
121 | tx, err = db.Begin()
122 | if err != nil {
123 | panic(err)
124 | }
125 | defer tx.Commit()
126 | stmt, err = tx.Prepare("INSERT INTO files VALUES (?,?,?,?)")
127 | if err != nil {
128 | panic(err)
129 | }
130 | defer stmt.Close()
131 |
132 | // load sha1 files
133 | loadDbFiles(*argDbSha1)
134 |
135 | // start scan
136 | startTime := time.Now()
137 | fmt.Println()
138 | fmt.Println("Scanning path \"" + *argDir + "\"...")
139 | walker.Walk(*argDir, walkFn, errorCallbackOption)
140 |
141 | wp.StopWait()
142 | endTime := time.Now()
143 | showInfos() // last refresh
144 | fmt.Println()
145 | fmt.Println()
146 | fmt.Println("RESULT")
147 | for k, v := range nbSha1Match {
148 | fmt.Printf(" %-40s : %d matches\n", checksumFilenames[k], v)
149 | }
150 | fmt.Printf(" %-40s : %d matches\n", "Total", nbTotalSha1Match)
151 | fmt.Println()
152 | fmt.Println("Done in", endTime.Sub(startTime))
153 | }
154 |
155 | func checkArgs() {
156 | flag.Parse()
157 | if *argDir == "" {
158 | showUsage()
159 | os.Exit(0)
160 | }
161 | if *argDbSha1 == "" {
162 | showUsage()
163 | os.Exit(0)
164 | }
165 | if *argDbSha1 != "" && !dirExists(*argDbSha1) {
166 | fmt.Printf("ERROR loading databases: %q does not exists or is not a directory\n", *argDbSha1)
167 | os.Exit(0)
168 | }
169 | }
170 |
171 | // Returns the mime type of a file
172 | // @param {string} filename, full path of a file, ex: "/home/user/file.txt"
173 | // @returns {string} "text/plain" or "" if unknown
174 | func getMimeType(filename string) (string, error) {
175 | mime, err := mimetype.DetectFile(filename)
176 | if err != nil {
177 | return "", err
178 | }
179 | return mime.String(), nil
180 | }
181 |
182 | func showUsage() {
183 | fmt.Println("NAME")
184 | fmt.Println(" hscan")
185 | fmt.Println(" Look for files recursively matching a list of checksums (sha-1 20 bytes base 16)")
186 | fmt.Println()
187 | fmt.Println("VERSION")
188 | fmt.Printf(" v%s\n", strVersion)
189 | fmt.Println()
190 | fmt.Println("USAGE")
191 | fmt.Println(" hscan -d -db ")
192 | flag.PrintDefaults()
193 | fmt.Println(" Results are written in log files in current directory")
194 | fmt.Println()
195 | fmt.Println("EXAMPLES")
196 | fmt.Println(" hscan -db /home/user/sha1files/ -d /mnt/dir/")
197 | fmt.Println(" Loads text files containing sha1 checksums from the directory /home/user/sha1files.")
198 | fmt.Println(" Those files must have one checksum per line.")
199 | fmt.Println(" The path /mnt/dir is scanned recursively to look for matches.")
200 | fmt.Println()
201 | fmt.Println("AUTHOR")
202 | fmt.Println(" Written by Twitter:@JeffProd")
203 | fmt.Println()
204 | fmt.Println("LICENCE")
205 | fmt.Println(" MIT License - Copyright (c) 2020 JeffProd.com")
206 | }
207 |
208 | // load txt files from the given path
209 | // each line must contain 1 sha1sum
210 | // @param {string} rootpath "/home/user/dir" or relative path
211 | func loadDbFiles(rootpath string) {
212 | files, err := ioutil.ReadDir(rootpath)
213 | if err != nil {
214 | fmt.Printf("ERROR accessing path %q: %v\n", rootpath, err)
215 | os.Exit(1)
216 | }
217 | cpt := uint8(0)
218 | for _, file := range files {
219 | if file.IsDir() { continue } // skip dirs
220 | strFilename := path.Join(rootpath, file.Name())
221 | strType, err := getMimeType(strFilename)
222 | if err != nil {
223 | fmt.Println(err)
224 | os.Exit(1)
225 | }
226 | if !strings.HasPrefix(strType, "text/plain") { continue } // skip non txt files
227 | checksumFilenames = append(checksumFilenames, file.Name())
228 | loadChecksumFile(path.Join(rootpath, file.Name()), cpt)
229 | cpt++
230 | }
231 | if cpt == 0 {
232 | fmt.Printf("No database text file found in %q\n", rootpath)
233 | os.Exit(0)
234 | }
235 | } // loadDbFiles
236 |
237 | // progress information
238 | func showInfos() {
239 | fmt.Printf("\r %d files - %d unreadable files - %d dirs - %d unreadable dirs - %d matches", nbFiles, nbUnreadableFile, nbDirs, nbUnreadableDir, nbTotalSha1Match)
240 | // if range nbSha1Match here: concurrent map iteration and map write, so we use nbTotalSha1Match
241 | }
242 |
243 | // Calculate SHA1 of a file
244 | // filename, ex: "/home/user/file.txt"
245 | func sha1sum(filename string) [20]byte {
246 | data, err := ioutil.ReadFile(filename)
247 | if err != nil {
248 | writeResult("", "", filename, fmt.Sprintf("%v", err))
249 | nbUnreadableFile++
250 | return [20]byte{}
251 | }
252 | return sha1.Sum(data)
253 | }
254 |
// dirExists reports whether path exists and is a directory.
// Any error from os.Stat (missing path, permission problem, ...) yields false.
func dirExists(path string) bool {
	fi, statErr := os.Stat(path)
	return statErr == nil && fi.IsDir()
}
264 |
265 | // loads in memory the sha1 checksums from a txt file
266 | // using a map for fastest check if hash key exists instead of parsing a []string
267 | // @params {string} filename "/home/user/toto.txt"
268 | // @params {uint8} index of the file in checksumFilenames
269 | func loadChecksumFile(filename string, idx uint8) {
270 | startTime := time.Now()
271 | fmt.Printf("Loading database file %q... ", filename)
272 |
273 | file, err := os.Open(filename)
274 | if err != nil {
275 | fmt.Printf("\nError reading the SHA1 checksums file %q: %v\n", filename, err)
276 | os.Exit(0)
277 | }
278 | defer file.Close()
279 |
280 | l := ""
281 | cpt := 0
282 | previousLen := len(arrSha1)
283 | scanner := bufio.NewScanner(file)
284 | for scanner.Scan() {
285 | l = strings.TrimSpace(scanner.Text())
286 | if len(l) > 40 { l = l[0:40] }
287 | l = strings.ToLower(l)
288 | if l == "" { continue }
289 | arrSha1[l] = idx
290 | cpt++
291 | }
292 |
293 | // init hash match count for this db text file
294 | nbSha1Match[idx] = 0
295 |
296 | endTime := time.Now()
297 | fmt.Printf("%d lines, %d uniq checksum found in %q\n", cpt, len(arrSha1) - previousLen, endTime.Sub(startTime))
298 | }
299 |
300 | func writeResult(sha1 string, dbfile string, filename string, errStr string) {
301 | _, err := stmt.Exec(sha1, dbfile, filename, errStr)
302 | if err != nil {
303 | panic(err)
304 | }
305 | if _, err1 := csvFile.WriteString(sha1 + "," + dbfile + "," + filename + "," + errStr + "\n"); err1 != nil { panic(err1) }
306 | }
307 |
308 | // Action on a file within the worker pool.
309 | // This is a file, not a directory.
310 | // @params {string} filename, i.e. "/home/user/file.txt"
311 | func workerPoolAction(filename string) {
312 | bSha1 := sha1sum(filename) // binary
313 | sSha1 := fmt.Sprintf("%x", bSha1) // string
314 | if sSha1 == "0000000000000000000000000000000000000000" { return } // we had an error already logged and we got empty sha1
315 |
316 | // sha1 exists in arrSha1 ?
317 | // info : arrSha1[sSha1] is the db txt file name
318 | if _, ok := arrSha1[sSha1]; ok {
319 | // we have a sha1 match
320 | writeResult(sSha1, checksumFilenames[arrSha1[sSha1]], filename, "")
321 |
322 | // can't update a map concurrently otherwhite we get "fatal error: concurrent map writes"
323 | l.Lock()
324 | nbSha1Match[arrSha1[sSha1]]++
325 | l.Unlock()
326 |
327 | nbTotalSha1Match++
328 | showInfos()
329 | return
330 | }
331 |
332 | // log also unknown file
333 | writeResult(sSha1, "", filename, "")
334 | }
335 |
// dbExec runs a SQL statement that returns no rows and panics when the
// execution fails.
func dbExec(db *sql.DB, sql string) {
	if _, execErr := db.Exec(sql); execErr != nil {
		panic(execErr)
	}
}
342 |
--------------------------------------------------------------------------------