├── tests ├── data │ ├── simple │ │ ├── child │ │ │ ├── ignored.txt │ │ │ └── a.txt │ │ ├── .polyglot_code_scanner_ignore │ │ └── parent.clj │ ├── simple_linked │ │ ├── child │ │ ├── parent.clj │ │ └── .polyglot_code_scanner_ignore │ ├── languages │ │ ├── foo.unknown │ │ ├── non-utf8.properties │ │ └── pfunit_test.pf │ ├── zipped │ │ ├── git_sample.zip │ │ ├── rename_complex.zip │ │ └── rename_simple.zip │ └── builders │ │ ├── README.md │ │ └── renaming │ │ ├── build_rename_simple.sh │ │ └── build_rename_complex.sh ├── expected │ ├── simple_files.json │ ├── simple_files_with_indicators.json │ ├── integration_tests │ │ ├── loc_flare_test.json │ │ ├── git_flare_test.json │ │ └── git_detailed_flare_test.json │ └── git │ │ ├── git_sample_by_filename.json │ │ ├── git_sample.json │ │ └── git_sample_with_merges.json └── integration_tests.rs ├── .gitignore ├── release.toml ├── test_shared ├── Cargo.toml └── src │ └── lib.rs ├── TODO.md ├── src ├── toxicity_indicator_calculator.rs ├── postprocessing.rs ├── file_stats.rs ├── polyglot_data.rs ├── git_user_dictionary.rs ├── loc.rs ├── code_line_data.rs ├── lib.rs ├── indentation.rs ├── git_file_future.rs ├── file_walker.rs ├── main.rs ├── flare.rs ├── git_file_history.rs ├── git.rs └── git_logger.rs ├── LICENSE.txt ├── .github └── workflows │ ├── test-all.yml │ ├── macos-release.yml │ ├── windows-release.yml │ └── linux-release.yml ├── Cargo.toml ├── DesignDecisons.md ├── README.md └── CHANGELOG.md /tests/data/simple/child/ignored.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/data/simple_linked/child: -------------------------------------------------------------------------------- 1 | ../simple/child -------------------------------------------------------------------------------- /tests/data/simple/child/a.txt: -------------------------------------------------------------------------------- 1 | test with 2 | two lines -------------------------------------------------------------------------------- /tests/data/simple_linked/parent.clj: -------------------------------------------------------------------------------- 1 | ../simple/parent.clj -------------------------------------------------------------------------------- /tests/data/simple/.polyglot_code_scanner_ignore: -------------------------------------------------------------------------------- 1 | **/ignored.txt -------------------------------------------------------------------------------- /tests/data/simple/parent.clj: -------------------------------------------------------------------------------- 1 | (ns parent) 2 | 3 | (do 4 | (prn "wow")) -------------------------------------------------------------------------------- /tests/data/languages/foo.unknown: -------------------------------------------------------------------------------- 1 | Unknown files 2 | should be treated as code -------------------------------------------------------------------------------- /tests/data/simple_linked/.polyglot_code_scanner_ignore: -------------------------------------------------------------------------------- 1 | ../simple/.polyglot_code_scanner_ignore -------------------------------------------------------------------------------- /tests/data/languages/non-utf8.properties: -------------------------------------------------------------------------------- 1 | #test ISO 8859-1 2 | test-iso8859-1-chars=���������� 3 | 
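Aside on the two `.polyglot_code_scanner_ignore` files above: they use gitignore-style globs (here `**/ignored.txt`) to exclude files from scanning. The walker that honours them lives in `src/file_walker.rs`, which is not included in this dump, but `Cargo.toml` (below) declares the `ignore` crate, which supports exactly this kind of custom ignore file. A minimal, hypothetical sketch of the wiring - the function name and details are illustrative, not the scanner's actual code:

use ignore::WalkBuilder;
use std::path::Path;

// Hypothetical sketch only - the real logic is in src/file_walker.rs (not shown).
fn walk_with_scanner_ignores(root: &Path, follow_symlinks: bool) {
    let walker = WalkBuilder::new(root)
        .follow_links(follow_symlinks)
        // entries matching globs in this file (e.g. "**/ignored.txt") are skipped
        .add_custom_ignore_filename(".polyglot_code_scanner_ignore")
        .build();
    for entry in walker.flatten() {
        println!("would scan: {}", entry.path().display());
    }
}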
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | **/*.rs.bk 3 | .idea 4 | .vscode 5 | /.cargo 6 | /.rustc_info.json 7 | /debug 8 | -------------------------------------------------------------------------------- /tests/data/zipped/git_sample.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kornysietsma/polyglot-code-scanner/HEAD/tests/data/zipped/git_sample.zip -------------------------------------------------------------------------------- /tests/data/zipped/rename_complex.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kornysietsma/polyglot-code-scanner/HEAD/tests/data/zipped/rename_complex.zip -------------------------------------------------------------------------------- /tests/data/zipped/rename_simple.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kornysietsma/polyglot-code-scanner/HEAD/tests/data/zipped/rename_simple.zip -------------------------------------------------------------------------------- /tests/data/builders/README.md: -------------------------------------------------------------------------------- 1 | # Test data builders 2 | 3 | This directory contains scripts used to build test repositories. 4 | 5 | You shouldn't need to run anything here unless you are changing tests - the generated test data will all be checked in to git. 6 | -------------------------------------------------------------------------------- /release.toml: -------------------------------------------------------------------------------- 1 | pre-release-replacements = [ 2 | {file="CHANGELOG.md", search="Unreleased", replace="{{version}}"}, 3 | {file="CHANGELOG.md", search="ReleaseDate", replace="{{date}}"}, 4 | {file="CHANGELOG.md", search="<!-- next-header -->", replace="<!-- next-header -->\n## [Unreleased] - ReleaseDate"}, 5 | ] -------------------------------------------------------------------------------- /tests/data/languages/pfunit_test.pf: -------------------------------------------------------------------------------- 1 | module test_simple 2 | use funit 3 | 4 | contains 5 | 6 | !!! Note: no test annotation !!!
7 | subroutine not_a_test() 8 | print*,'this procedure should not be called' 9 | end subroutine not_a_test 10 | 11 | @test 12 | subroutine test_assert_true_and_false() 13 | @assertTrue(1 == 1) 14 | @assertFalse(1 == 2) 15 | end subroutine test_assert_true_and_false 16 | 17 | end module test_simple -------------------------------------------------------------------------------- /test_shared/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "test_shared" 3 | version = "0.0.1" 4 | authors = ["Korny Sietsma <korny@sietsma.com>"] 5 | description = "Shared test helpers for polyglot_code_scanner" 6 | edition = "2021" 7 | 8 | [dependencies] 9 | serde = { version = "1.0.144",features = ["derive"] } 10 | serde_json = "1.0.85" 11 | regex = "1.6.0" 12 | anyhow = "1.0.65" 13 | tempfile = "3.3.0" 14 | zip = "0.6.2" 15 | pretty_assertions = "1.3.0" 16 | log = "0.4.17" 17 | fern = "0.6.1" -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | # TODO 2 | 3 | See Trello for anything not short term (sorry for people looking at github, but 4 | I had to look at cross-repo plans and goals) 5 | 6 | Small / immediate things: 7 | 8 | - add test that checks binary files and unknown text files (e.g. erb) 9 | - refactoring - use Into<String> more ? "fn new<S: Into<String>>(name: S, is_file: bool)" allows the caller to decide... 10 | - Can we get rid of test_shared's duplication in cargo.toml ? 11 | - "-P" cli option is confusing - it's pretty printing _for logs_ ! 12 | - can we make the log default "warn"?? 13 | -------------------------------------------------------------------------------- /tests/expected/simple_files.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "test", 3 | "id": "test-id", 4 | "version": "1.0.4", 5 | "metadata": {}, 6 | "features": { 7 | "coupling": false, 8 | "git": false, 9 | "git_details": false, 10 | "file_stats": false 11 | }, 12 | "tree": { 13 | "name": "", 14 | "children": [ 15 | { 16 | "name": "child", 17 | 18 | "children": [ 19 | { 20 | "name": "a.txt" 21 | } 22 | ] 23 | }, 24 | { 25 | "name": "parent.clj" 26 | } 27 | ] 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /tests/expected/simple_files_with_indicators.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "test", 3 | "id": "test-id", 4 | "version": "1.0.4", 5 | "metadata": {}, 6 | "features": { 7 | "coupling": false, 8 | "git": false, 9 | "git_details": false, 10 | "file_stats": false 11 | }, 12 | "tree": { 13 | "name": "", 14 | "children": [ 15 | { 16 | "name": "child", 17 | "children": [ 18 | { 19 | "name": "a.txt!?" 20 | } 21 | ] 22 | }, 23 | { 24 | "name": "parent.clj!?"
25 | } 26 | ] 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/toxicity_indicator_calculator.rs: -------------------------------------------------------------------------------- 1 | #![warn(clippy::all)] 2 | 3 | use anyhow::Error; 4 | use std::path::Path; 5 | 6 | use crate::{flare::FlareTreeNode, polyglot_data::IndicatorMetadata}; 7 | 8 | /// Wrapper for the logic that calculates toxicity indicators 9 | pub trait ToxicityIndicatorCalculator: std::fmt::Debug { 10 | fn name(&self) -> String; 11 | fn visit_node(&mut self, node: &mut FlareTreeNode, path: &Path) -> Result<(), Error>; 12 | /// root-level metadata - output after all files added 13 | fn apply_metadata(&self, metadata: &mut IndicatorMetadata) -> Result<(), Error>; 14 | } 15 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2019 Kornelis Sietsma 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /src/postprocessing.rs: -------------------------------------------------------------------------------- 1 | use crate::{flare::FlareTreeNode, git::GitNodeData, ScannerConfig}; 2 | use anyhow::Error; 3 | 4 | fn remove_details(node: &mut FlareTreeNode, config: &ScannerConfig) -> Result<(), Error> { 5 | if let Some(GitNodeData::File { data }) = &mut node.indicators_mut().git { 6 | if !config.features.git_details { 7 | data.details = Vec::new(); 8 | } 9 | data.activity = Vec::new(); 10 | } 11 | for child in node.get_children_mut() { 12 | remove_details(child, config)?; 13 | } 14 | Ok(()) 15 | } 16 | 17 | pub fn postprocess_tree(tree: &mut FlareTreeNode, config: &ScannerConfig) -> Result<(), Error> { 18 | info!("Postprocessing tree before persisting"); 19 | remove_details(tree, config)?; 20 | Ok(()) 21 | } 22 | -------------------------------------------------------------------------------- /.github/workflows/test-all.yml: -------------------------------------------------------------------------------- 1 | name: test 2 | on: 3 | pull_request: 4 | branches: 5 | - "*" 6 | push: 7 | branches: 8 | - master 9 | tags-ignore: 10 | - "*" 11 | 12 | jobs: 13 | test: 14 | if: | 15 | !contains(github.event.commits[0].message, '[ci skip]') && 16 | !contains(github.event.commits[0].message, '(cargo-release)') 17 | env: 18 | RUST_BACKTRACE: "full" 19 | runs-on: ${{ matrix.os }} 20 | strategy: 21 | fail-fast: true 22 | matrix: 23 | os: [macos-latest, ubuntu-20.04, ubuntu-22.04] # removed windows as tests broken atm 24 | 25 | steps: 26 | - uses: actions/checkout@v2 27 | - name: Install Rust stable 28 | uses: actions-rs/toolchain@v1 29 | with: 30 | profile: minimal 31 | toolchain: stable 32 | components: rustfmt, clippy 33 | - uses: Swatinem/rust-cache@v1 34 | with: 35 | key: ${{ matrix.os }} 36 | - name: Test 37 | run: | 38 | cargo fmt -- --check 39 | cargo clippy --release 
40 | cargo test 41 | -------------------------------------------------------------------------------- /tests/expected/integration_tests/loc_flare_test.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "test", 3 | "id": "test-id", 4 | "version": "1.0.4", 5 | "metadata": {}, 6 | "features": { 7 | "coupling": false, 8 | "git": false, 9 | "git_details": false, 10 | "file_stats": false 11 | }, 12 | "tree": { 13 | "name": "", 14 | "children": [ 15 | { 16 | "name": "child", 17 | "children": [ 18 | { 19 | "name": "a.txt", 20 | "data": { 21 | "loc": { 22 | "language": "Plain Text", 23 | "binary": false, 24 | "blanks": 0, 25 | "code": 0, 26 | "comments": 2, 27 | "lines": 2, 28 | "bytes": 19 29 | } 30 | } 31 | } 32 | ] 33 | }, 34 | { 35 | "name": "parent.clj", 36 | "data": { 37 | "loc": { 38 | "language": "Clojure", 39 | "binary": false, 40 | "blanks": 1, 41 | "code": 3, 42 | "comments": 0, 43 | "lines": 4, 44 | "bytes": 31 45 | } 46 | } 47 | } 48 | ] 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "polyglot_code_scanner" 3 | version = "0.4.5-alpha.0" 4 | publish = false 5 | authors = ["Korny Sietsma <korny@sietsma.com>"] 6 | description = "Polyglot Code Scanner - scans source code and generates tree-structured JSON files for d3 visualisation" 7 | edition = "2021" 8 | 9 | [profile.release] 10 | debug = true 11 | 12 | [dependencies] 13 | tokei = { git = "https://github.com/kornysietsma/tokei", tag = "PolyglotV1.0.1" } 14 | ignore = "0.4.18" 15 | serde = { version = "1.0.144",features = ["derive","rc"] } 16 | erased-serde = "0.3.23" 17 | serde_json = "1.0.85" 18 | regex = "1.6.0" 19 | clap = { version = "3.2.22", features = ["derive"] } 20 | log = "0.4.17" 21 | fern = "0.6.1" 22 | clap-verbosity-flag = "1.0.1" 23 | lazy_static = "1.4.0" 24 | git2 = "0.15.0" 25 | derive_builder = "0.11.2" 26 | derive-getters = "0.2.0" 27 | content_inspector = "0.2.4" 28 | encoding_rs_io = "0.1.7" 29 | grep-searcher = "0.1.10" 30 | hdrhistogram = "7.5.2" 31 | indicatif = "0.17.1" 32 | chrono = "0.4.22" 33 | openssl = { version = "0.10.42", features=["vendored"] } 34 | path-slash = "0.2.1" 35 | uuid = { version = "1.1.2", features = ["v4"] } 36 | anyhow = "1.0.65" 37 | filetime = "0.2.17" 38 | 39 | [dev-dependencies] 40 | test_shared = { path = "test_shared" } 41 | tempfile = "3.3.0" 42 | zip = "0.6.2" 43 | pretty_assertions = "1.3.0" 44 | -------------------------------------------------------------------------------- /tests/data/builders/renaming/build_rename_simple.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -ex 2 | 3 | if [[ -d "rename_simple" ]]; then 4 | rm -r rename_simple 5 | fi 6 | 7 | mkdir rename_simple 8 | cd rename_simple 9 | 10 | git_dates() { 11 | # really simple - sets the hour only, so dates are ordered 12 | if [ -z "$1" ]; then 13 | echo "needs a param" 14 | exit 1 15 | fi 16 | export GIT_AUTHOR_DATE="2020-09-13T$1:00:00" 17 | export GIT_COMMITTER_DATE="2020-09-13T$1:00:00" 18 | } 19 | 20 | export GIT_AUTHOR_NAME="Kate Smith" 21 | export GIT_AUTHOR_EMAIL="kate@smith.com" 22 | export GIT_COMMITTER_NAME="Jay" 23 | export GIT_COMMITTER_EMAIL="Jay@smith.com" 24 | git_dates "01" 25 | 26 | git init 27 | 28 | cat <<EOF >a.txt 29 | a 30 | a 31 | a 32 | a 33 | EOF 34 | 35 | git add .
36 | 37 | git commit -m "initial commit" 38 | 39 | git_dates "02" 40 | 41 | cat <<EOF >b.txt 42 | b 43 | EOF 44 | 45 | git add . 46 | 47 | git commit -m "unrelated commit" 48 | 49 | git_dates "03" 50 | 51 | git mv a.txt c.txt 52 | 53 | git add . 54 | 55 | git commit -m "moving a to c" 56 | 57 | git_dates "04" 58 | 59 | git mv c.txt d.txt 60 | 61 | echo "d" >>d.txt 62 | 63 | git add . 64 | 65 | git commit -m "moving and renaming" 66 | 67 | cd .. 68 | 69 | if [[ -f "rename_simple.zip" ]]; then 70 | rm rename_simple.zip 71 | fi 72 | 73 | zip -r rename_simple.zip rename_simple 74 | -------------------------------------------------------------------------------- /.github/workflows/macos-release.yml: -------------------------------------------------------------------------------- 1 | name: macos-release 2 | 3 | on: 4 | push: 5 | tags: 6 | - "v*" 7 | 8 | jobs: 9 | build: 10 | name: Build on macOS 11 | runs-on: macos-latest 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Install Rust stable 15 | uses: dtolnay/rust-toolchain@master 16 | with: 17 | toolchain: stable 18 | - uses: Swatinem/rust-cache@v1 19 | - name: Build 20 | run: | 21 | cargo build --release --locked 22 | - name: Upload build artifact 23 | uses: actions/upload-artifact@v3 24 | with: 25 | name: binary 26 | path: target/release/polyglot_code_scanner 27 | test: 28 | name: Test on macOS 29 | runs-on: macos-latest 30 | steps: 31 | - uses: actions/checkout@v2 32 | - name: Install Rust stable 33 | uses: actions-rs/toolchain@v1 34 | with: 35 | profile: minimal 36 | toolchain: stable 37 | - uses: Swatinem/rust-cache@v1 38 | - name: Test 39 | run: | 40 | cargo fmt -- --check 41 | cargo clippy --release 42 | cargo test --release --locked 43 | release: 44 | runs-on: macos-latest 45 | needs: [build, test] 46 | steps: 47 | - name: Set the release tag 48 | id: set_tag 49 | run: echo ::set-output name=RELEASE_TAG::${GITHUB_REF/refs\/tags\/v/} 50 | shell: bash 51 | - name: Restore artifact from previous job 52 | uses: actions/download-artifact@v3 53 | with: 54 | name: binary 55 | - name: Upload binaries to release 56 | uses: svenstaro/upload-release-action@v1-release 57 | with: 58 | repo_token: ${{ secrets.GITHUB_TOKEN }} 59 | file: polyglot_code_scanner 60 | asset_name: polyglot-code-scanner-x86_64-macos 61 | tag: ${{ github.ref }} 62 | overwrite: true 63 | -------------------------------------------------------------------------------- /.github/workflows/windows-release.yml: -------------------------------------------------------------------------------- 1 | name: windows-release 2 | 3 | on: 4 | push: 5 | tags: 6 | - "*" 7 | 8 | jobs: 9 | build: 10 | name: Build on Windows 11 | runs-on: windows-latest 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Install Rust stable 15 | uses: dtolnay/rust-toolchain@master 16 | with: 17 | toolchain: stable 18 | - uses: Swatinem/rust-cache@v1 19 | - name: Build 20 | run: | 21 | cargo build --release --locked 22 | - name: Upload build artifact 23 | uses: actions/upload-artifact@v3 24 | with: 25 | name: binary 26 | path: target/release/polyglot_code_scanner.exe 27 | test: 28 | name: Test on Windows 29 | if: ${{ false }} # disabled as windows tests have issues with file sizes at the moment 30 | runs-on: windows-latest 31 | steps: 32 | - uses: actions/checkout@v2 33 | - name: Install Rust stable 34 | uses: actions-rs/toolchain@v1 35 | with: 36 | profile: minimal 37 | toolchain: stable 38 | - uses: Swatinem/rust-cache@v1 39 | - name: Test 40 | run: | 41 | cargo fmt -- --check 42 | cargo clippy --release
43 | cargo test --release --locked 44 | release: 45 | runs-on: windows-latest 46 | needs: [build] 47 | steps: 48 | - name: Set the release tag 49 | id: set_tag 50 | run: echo ::set-output name=RELEASE_TAG::${GITHUB_REF/refs\/tags\/v/} 51 | shell: bash 52 | - uses: actions/checkout@v2 53 | - name: Restore artifact from previous job 54 | uses: actions/download-artifact@v3 55 | with: 56 | name: binary 57 | - name: Upload binaries to release 58 | uses: svenstaro/upload-release-action@v1-release 59 | with: 60 | repo_token: ${{ secrets.GITHUB_TOKEN }} 61 | file: polyglot_code_scanner.exe 62 | asset_name: polyglot-code-scanner-x86_64-windows.exe 63 | tag: ${{ github.ref }} 64 | overwrite: true 65 | -------------------------------------------------------------------------------- /tests/expected/integration_tests/git_flare_test.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "test", 3 | "id": "test-id", 4 | "version": "1.0.4", 5 | "features": { 6 | "coupling": false, 7 | "git": true, 8 | "git_details": false, 9 | "file_stats": false 10 | }, 11 | "tree": { 12 | "name": "", 13 | "children": [ 14 | { 15 | "name": "simple", 16 | "children": [ 17 | { 18 | "name": "child", 19 | "children": [ 20 | { 21 | "name": "a_renamed.txt", 22 | "data": { 23 | "git": { 24 | "age_in_days": 0, 25 | "creation_date": 1558521386, 26 | "last_update": 1558533240, 27 | "user_count": 2, 28 | "users": [0, 1], 29 | "activity": [], 30 | "details": [] 31 | } 32 | } 33 | } 34 | ] 35 | }, 36 | { 37 | "name": "parent.clj", 38 | "data": { 39 | "git": { 40 | "age_in_days": 0, 41 | "creation_date": 1558521386, 42 | "last_update": 1558524371, 43 | "user_count": 2, 44 | "users": [0, 1], 45 | "activity": [], 46 | "details": [] 47 | } 48 | } 49 | } 50 | ] 51 | } 52 | ], 53 | "data": { 54 | "git": { 55 | "head": "93ae0c7c7cd93b3c4ea1bf103bde4deafef798ad", 56 | "remote_url": null 57 | } 58 | } 59 | }, 60 | "metadata": { 61 | "git": { 62 | "users": [ 63 | { 64 | "id": 0, 65 | "user": { 66 | "email": "korny@sietsma.com", 67 | "name": "Korny Sietsma" 68 | } 69 | }, 70 | { 71 | "id": 1, 72 | "user": { 73 | "email": "hgranger@durmstrang.de", 74 | "name": "hermoine" 75 | } 76 | } 77 | ] 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /.github/workflows/linux-release.yml: -------------------------------------------------------------------------------- 1 | name: linux-release 2 | 3 | on: 4 | push: 5 | tags: 6 | - "v*" 7 | 8 | jobs: 9 | build: 10 | name: Build on Linux 11 | container: node:alpine 12 | runs-on: ubuntu-20.04 13 | env: 14 | RUST_BACKTRACE: "full" 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Install dependencies 18 | run: | 19 | apk add --no-cache bash curl build-base openssl-dev perl tar 20 | - name: Install Rust stable 21 | uses: dtolnay/rust-toolchain@master 22 | with: 23 | toolchain: stable 24 | - uses: Swatinem/rust-cache@v1 25 | with: 26 | key: ubuntu-latest 27 | - name: Build 28 | env: 29 | RUSTFLAGS: "-C link-arg=-s" 30 | run: | 31 | cargo build --release --locked 32 | - name: Upload build artifact 33 | uses: actions/upload-artifact@v3 34 | with: 35 | name: binary 36 | path: target/release/polyglot_code_scanner 37 | test: 38 | name: Test on Linux 39 | runs-on: ubuntu-latest 40 | steps: 41 | - uses: actions/checkout@v2 42 | - name: Install Rust stable 43 | uses: actions-rs/toolchain@v1 44 | with: 45 | profile: minimal 46 | toolchain: stable 47 | components: rustfmt, clippy 48 | - uses: 
Swatinem/rust-cache@v1 49 | with: 50 | key: ubuntu-latest 51 | - name: Test 52 | run: | 53 | cargo fmt -- --check 54 | cargo clippy --release 55 | cargo test --release --locked 56 | release: 57 | runs-on: ubuntu-latest 58 | needs: [build, test] 59 | steps: 60 | - name: Restore artifact from previous job 61 | uses: actions/download-artifact@v3 62 | with: 63 | name: binary 64 | - name: Upload binaries to release 65 | uses: svenstaro/upload-release-action@v1-release 66 | with: 67 | repo_token: ${{ secrets.GITHUB_TOKEN }} 68 | file: polyglot_code_scanner 69 | asset_name: polyglot-code-scanner-x86_64-linux 70 | tag: ${{ github.ref }} 71 | overwrite: true 72 | - uses: actions/checkout@v2 73 | -------------------------------------------------------------------------------- /DesignDecisons.md: -------------------------------------------------------------------------------- 1 | # Software design decisions 2 | 3 | This file arose as I wanted somewhere for notes on _why_ I make changes - a bit like Architecture Decision Records, but it's a bit grand to call them Architecture :) 4 | 5 | Mostly because right now (Sep 2022) I'm reversing an original decision, and without having a pair to talk to, making notes here is useful for me! 6 | 7 | ## Sep 2022 - stopping using Value for Toxicity Calculators 8 | 9 | Originally I built this scanner a bit too generically. You'd think after decades of preaching "YAGNI - You Ain't Gonna Need It" I'd have learned better, but no... 10 | 11 | So the scanner used to have these fairly generic `ToxicityIndicatorCalculator` structs, which have two methods: 12 | 13 | * `calculate`, which is the heart of the calculator: a pure-ish function that returns a JSON `Value` for the calculator - e.g. for the Lines of Code one it returns a set of code line metrics. This is called for each file/dir scanned, and the returned `Value` is added to the `data` for each file/dir 14 | * `metadata`, which is called at the end, to store any metadata that the calculator generates and needs to be saved. This also used a `Value` 15 | 16 | This seemed like a good idea at the time - nice to have side-effect-free functions, and the `Value` returns meant no coupling between the calculators and the rest of the app. 17 | 18 | But, once I moved the Explorer to TypeScript, I had to re-build the types used in the Scanner in TypeScript - and I realised that really the use of `Value` meant I was bypassing the type system. And I only have 3 Indicators! So why all this effort for generic behaviour? There's no point making the Rust flexible when the TypeScript isn't! 19 | 20 | (I think when I started I had no idea how many indicators I would want, and I could see it being some kind of place I could plug in language-specific tools... like I said, YAGNI should have applied) 21 | 22 | So, I want to go to Value-less code. I can see two options: 23 | 24 | 1. Instead of `calculate` returning a `Value` I make it generic, so `calculate` returns a `T` 25 | 2. Make it a visitor instead: `calculate` takes a mutable `FlareTreeNode` parameter and changes the data it needs to. 26 | 27 | I'm going for option `2` - it feels a lot simpler. (See the sketch below for the rough shape.) 28 | 29 | The only downside here is that this made some unit tests harder. The more-generic code could be tested by throwing fake `Value` objects around for tests - the new code only accepts 'real' types. This is probably good overall, as it means the tests are closer to reality. But some things aren't well tested, except in end-to-end tests.
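To make the decision above concrete, here is a minimal sketch of option 2, using simplified stand-ins for the real types (the actual trait, with its extra `name` and `apply_metadata` methods, appears in `src/toxicity_indicator_calculator.rs` earlier in this dump). The point is that the visitor writes a concrete, typed field onto the node, where the old design returned an untyped `serde_json::Value`:

use anyhow::Error;
use std::path::Path;

// Simplified stand-ins for FlareTreeNode and its indicator data.
#[derive(Default)]
struct Indicators {
    line_count: Option<usize>, // a typed slot, where the old design stored a serde_json::Value
}

#[derive(Default)]
struct TreeNode {
    indicators: Indicators,
}

// Option 2: the calculator is a visitor that mutates the node directly.
trait Visitor {
    fn visit_node(&mut self, node: &mut TreeNode, path: &Path) -> Result<(), Error>;
}

struct LineCountCalculator;

impl Visitor for LineCountCalculator {
    fn visit_node(&mut self, node: &mut TreeNode, path: &Path) -> Result<(), Error> {
        // write a concrete type straight onto the node - no Value in sight
        node.indicators.line_count = Some(std::fs::read_to_string(path)?.lines().count());
        Ok(())
    }
}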
-------------------------------------------------------------------------------- /tests/integration_tests.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Error; 2 | use polyglot_code_scanner::ScannerConfig; 3 | use serde_json::Value; 4 | use std::io::Cursor; 5 | use std::path::PathBuf; 6 | use tempfile::tempdir; 7 | use test_shared::*; 8 | 9 | fn test_scanner_config(with_git: bool) -> ScannerConfig { 10 | let mut config = ScannerConfig::default("test"); 11 | config.data_id = Some("test-id".to_string()); 12 | config.features.git = with_git; 13 | config 14 | } 15 | 16 | #[test] 17 | fn it_calculates_lines_of_code() -> Result<(), Error> { 18 | let root = PathBuf::from("./tests/data/simple/"); 19 | 20 | let mut buffer: Vec<u8> = Vec::new(); 21 | let out = Cursor::new(&mut buffer); 22 | 23 | let result = 24 | polyglot_code_scanner::run(&root, &test_scanner_config(false), None, &["loc"], out); 25 | 26 | assert!(result.is_ok()); 27 | 28 | let parsed_result: Value = serde_json::from_reader(buffer.as_slice())?; 29 | 30 | assert_eq_json_file( 31 | &parsed_result, 32 | "./tests/expected/integration_tests/loc_flare_test.json", 33 | ); 34 | 35 | Ok(()) 36 | } 37 | 38 | #[test] 39 | fn it_calculates_git_stats() -> Result<(), Error> { 40 | let gitdir = tempdir()?; 41 | let git_root = unzip_test_sample("git_sample", gitdir.path())?; 42 | 43 | let mut buffer: Vec<u8> = Vec::new(); 44 | let out = Cursor::new(&mut buffer); 45 | 46 | let result = 47 | polyglot_code_scanner::run(&git_root, &test_scanner_config(true), None, &["git"], out); 48 | 49 | assert!(result.is_ok()); 50 | 51 | let parsed_result: Value = serde_json::from_reader(buffer.as_slice())?; 52 | 53 | assert_eq_json_file( 54 | &parsed_result, 55 | "./tests/expected/integration_tests/git_flare_test.json", 56 | ); 57 | 58 | Ok(()) 59 | } 60 | 61 | #[test] 62 | fn it_calculates_detailed_git_stats() -> Result<(), Error> { 63 | let gitdir = tempdir()?; 64 | let git_root = unzip_test_sample("git_sample", gitdir.path())?; 65 | 66 | let mut buffer: Vec<u8> = Vec::new(); 67 | let out = Cursor::new(&mut buffer); 68 | 69 | let mut config = test_scanner_config(true); 70 | config.features.git_details = true; 71 | 72 | let result = polyglot_code_scanner::run(&git_root, &config, None, &["git"], out); 73 | 74 | assert!(result.is_ok()); 75 | 76 | let parsed_result: Value = serde_json::from_reader(buffer.as_slice())?; 77 | 78 | assert_eq_json_file( 79 | &parsed_result, 80 | "./tests/expected/integration_tests/git_detailed_flare_test.json", 81 | ); 82 | 83 | Ok(()) 84 | } 85 | 86 | // TODO: add a coupling e2e test! Needs a lot of setup 87 | -------------------------------------------------------------------------------- /src/file_stats.rs: -------------------------------------------------------------------------------- 1 | use std::{fs, path::Path}; 2 | 3 | use anyhow::Error; 4 | use filetime::FileTime; 5 | use serde::Serialize; 6 | 7 | use crate::{ 8 | flare::FlareTreeNode, polyglot_data::IndicatorMetadata, 9 | toxicity_indicator_calculator::ToxicityIndicatorCalculator, 10 | }; 11 | 12 | /// File creation and modification times, in seconds since unix epoch 13 | /// using the filetime crate so Windows times are converted to unix times!
14 | #[derive(Debug, PartialEq, Eq, Clone, Serialize, Default)] 15 | pub struct FileStats { 16 | created: i64, 17 | modified: i64, 18 | } 19 | 20 | impl FileStats { 21 | fn new(path: &Path) -> Result<Self, Error> { 22 | let metadata = fs::metadata(path)?; 23 | let ctime = FileTime::from_creation_time(&metadata); 24 | let mtime = FileTime::from_last_modification_time(&metadata); 25 | match (ctime, mtime) { 26 | (Some(ctime), mtime) => Ok(FileStats { 27 | created: ctime.unix_seconds(), 28 | modified: mtime.unix_seconds(), 29 | }), 30 | (None, mtime) => { 31 | warn!("File has no ctime - using mtime"); 32 | Ok(FileStats { 33 | created: mtime.unix_seconds(), 34 | modified: mtime.unix_seconds(), 35 | }) 36 | } 37 | } 38 | } 39 | } 40 | #[derive(Debug)] 41 | pub struct FileStatsCalculator {} 42 | 43 | impl ToxicityIndicatorCalculator for FileStatsCalculator { 44 | fn name(&self) -> String { 45 | "file_stats".to_string() 46 | } 47 | 48 | fn visit_node(&mut self, node: &mut FlareTreeNode, path: &Path) -> Result<(), Error> { 49 | let file_stats = FileStats::new(path)?; 50 | node.indicators_mut().file_stats = Some(file_stats); 51 | 52 | Ok(()) 53 | } 54 | 55 | fn apply_metadata(&self, _metadata: &mut IndicatorMetadata) -> Result<(), Error> { 56 | Ok(()) 57 | } 58 | } 59 | 60 | #[cfg(test)] 61 | mod test { 62 | use std::time::SystemTime; 63 | 64 | use super::*; 65 | use std::time::UNIX_EPOCH; 66 | use tempfile::{NamedTempFile, TempDir}; 67 | 68 | #[test] 69 | fn can_get_stats_for_a_file() -> Result<(), Error> { 70 | let newfile = NamedTempFile::new()?; 71 | 72 | let stats = FileStats::new(newfile.path())?; 73 | let now: i64 = SystemTime::now() 74 | .duration_since(UNIX_EPOCH)? 75 | .as_secs() 76 | .try_into()?; 77 | 78 | assert!(stats.created > now - 1 && stats.created < now + 1); 79 | assert!(stats.modified > now - 1 && stats.modified < now + 1); 80 | 81 | Ok(()) 82 | } 83 | #[test] 84 | fn can_get_stats_for_a_dir() -> Result<(), Error> { 85 | let newdir = TempDir::new()?; 86 | 87 | let stats = FileStats::new(newdir.path())?; 88 | let now: i64 = SystemTime::now() 89 | .duration_since(UNIX_EPOCH)?
90 | .as_secs() 91 | .try_into()?; 92 | 93 | assert!(stats.created > now - 1 && stats.created < now + 1); 94 | assert!(stats.modified > now - 1 && stats.modified < now + 1); 95 | 96 | Ok(()) 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /test_shared/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![forbid(unsafe_code)] 2 | #![warn(clippy::all)] 3 | #![warn(clippy::pedantic)] 4 | #![warn(rust_2018_idioms)] 5 | 6 | use anyhow::Error; 7 | use pretty_assertions::assert_eq; 8 | use serde::Serialize; 9 | use serde_json::Value; 10 | use std::fs::File; 11 | use std::path::{Path, PathBuf}; 12 | use zip::ZipArchive; 13 | 14 | /// adapted from https://github.com/mvdnes/zip-rs/blob/master/examples/extract.rs 15 | /// Note zip files only store modification time; this sets ctime to the mtime for tests 16 | pub fn unzip_to_dir(dest: &Path, zipname: &str) -> Result<(), Error> { 17 | let fname = std::path::Path::new(zipname); 18 | let file = File::open(&fname)?; 19 | 20 | let mut archive = ZipArchive::new(file)?; 21 | 22 | for i in 0..archive.len() { 23 | let mut file = archive.by_index(i)?; 24 | let outpath = PathBuf::from(dest).join(file.mangled_name()); 25 | 26 | if (&*file.name()).ends_with('/') { 27 | std::fs::create_dir_all(&outpath)?; 28 | } else { 29 | if let Some(p) = outpath.parent() { 30 | if !p.exists() { 31 | std::fs::create_dir_all(&p)?; 32 | } 33 | } 34 | let mut outfile = std::fs::File::create(&outpath)?; 35 | std::io::copy(&mut file, &mut outfile)?; 36 | } 37 | } 38 | Ok(()) 39 | } 40 | 41 | /// unzip a zip file - assumes the name shortname.zip and contains a shortname directory in the file 42 | /// returns the working directory in the unzipped data 43 | pub fn unzip_test_sample(shortname: &str, workdir: &Path) -> Result<PathBuf, Error> { 44 | let zip_name = "tests/data/zipped/".to_owned() + shortname + ".zip"; 45 | unzip_to_dir(workdir, &zip_name)?; 46 | Ok(PathBuf::from(workdir).join(shortname)) 47 | } 48 | 49 | pub fn assert_eq_json_file<T>(actual: &T, expected_file: &str) 50 | where 51 | T: Serialize, 52 | { 53 | let expected = std::fs::read_to_string(Path::new(expected_file)).unwrap(); 54 | 55 | assert_eq_json_str(&actual, &expected) 56 | } 57 | 58 | pub fn assert_eq_json_str<T>(actual_serializable: &T, expected_json: &str) 59 | where 60 | T: Serialize, 61 | { 62 | let actual = serde_json::to_value(&actual_serializable).unwrap(); 63 | 64 | let expected: Value = serde_json::from_str(expected_json).unwrap(); 65 | assert_eq!(&actual, &expected) 66 | } 67 | 68 | pub fn assert_eq_json_value<T>(actual_serializable: &T, expected_json: &Value) 69 | where 70 | T: Serialize, 71 | { 72 | let actual = serde_json::to_value(&actual_serializable).unwrap(); 73 | 74 | assert_eq!(&actual, expected_json) 75 | } 76 | 77 | pub fn assert_eq_json(left: &str, right: &str) { 78 | let left: Value = serde_json::from_str(left).unwrap(); 79 | let right: Value = serde_json::from_str(right).unwrap(); 80 | assert_eq!(left, right); 81 | } 82 | 83 | /// install a test logger - call this in tests where you want to see log output! 84 | pub fn install_test_logger() { 85 | // This'll fail if called twice; don't worry. 86 | let _ = fern::Dispatch::new() 87 | // ...
88 | .level(log::LevelFilter::Debug) 89 | .chain(fern::Output::call(|record| println!("{}", record.args()))) 90 | .apply(); 91 | } 92 | -------------------------------------------------------------------------------- /tests/expected/integration_tests/git_detailed_flare_test.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "test", 3 | "id": "test-id", 4 | "version": "1.0.4", 5 | "features": { 6 | "coupling": false, 7 | "git": true, 8 | "git_details": true, 9 | "file_stats": false 10 | }, 11 | "tree": { 12 | "name": "", 13 | "children": [ 14 | { 15 | "name": "simple", 16 | "children": [ 17 | { 18 | "name": "child", 19 | "children": [ 20 | { 21 | "name": "a_renamed.txt", 22 | "data": { 23 | "git": { 24 | "age_in_days": 0, 25 | "creation_date": 1558521386, 26 | "last_update": 1558533240, 27 | "user_count": 2, 28 | "users": [0, 1], 29 | "activity": [], 30 | "details": [ 31 | { 32 | "commit_day": 1558483200, 33 | "commits": 5, 34 | "lines_added": 7, 35 | "lines_deleted": 3, 36 | "users": [0] 37 | }, 38 | { 39 | "commit_day": 1558483200, 40 | "commits": 1, 41 | "lines_added": 1, 42 | "lines_deleted": 1, 43 | "users": [0, 1] 44 | } 45 | ] 46 | } 47 | } 48 | } 49 | ] 50 | }, 51 | { 52 | "name": "parent.clj", 53 | "data": { 54 | "git": { 55 | "age_in_days": 0, 56 | "creation_date": 1558521386, 57 | "last_update": 1558524371, 58 | "user_count": 2, 59 | "users": [0, 1], 60 | "activity": [], 61 | "details": [ 62 | { 63 | "commit_day": 1558483200, 64 | "commits": 3, 65 | "lines_added": 8, 66 | "lines_deleted": 1, 67 | "users": [0] 68 | }, 69 | { 70 | "commit_day": 1558483200, 71 | "commits": 1, 72 | "lines_added": 3, 73 | "lines_deleted": 1, 74 | "users": [0, 1] 75 | } 76 | ] 77 | } 78 | } 79 | } 80 | ] 81 | } 82 | ], 83 | "data": { 84 | "git": { 85 | "head": "93ae0c7c7cd93b3c4ea1bf103bde4deafef798ad", 86 | "remote_url": null 87 | } 88 | } 89 | }, 90 | "metadata": { 91 | "git": { 92 | "users": [ 93 | { 94 | "id": 0, 95 | "user": { "email": "korny@sietsma.com", "name": "Korny Sietsma" } 96 | }, 97 | { 98 | "id": 1, 99 | "user": { 100 | "email": "hgranger@durmstrang.de", 101 | "name": "hermoine" 102 | } 103 | } 104 | ] 105 | } 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /tests/data/builders/renaming/build_rename_complex.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | if [[ -d "rename_complex" ]]; then 4 | rm -r rename_complex 5 | fi 6 | 7 | mkdir rename_complex 8 | cd rename_complex 9 | 10 | author_kate() { 11 | export GIT_AUTHOR_NAME="Kate Smith" 12 | export GIT_AUTHOR_EMAIL="kate@smith.com" 13 | } 14 | 15 | author_jay() { 16 | export GIT_AUTHOR_NAME="Jay" 17 | export GIT_AUTHOR_EMAIL="Jay@smith.com" 18 | } 19 | 20 | committer_jay() { 21 | export GIT_COMMITTER_NAME="Jay" 22 | export GIT_COMMITTER_EMAIL="Jay@smith.com" 23 | } 24 | 25 | author_dave() { 26 | export GIT_AUTHOR_NAME="Dave Smith" 27 | export GIT_AUTHOR_EMAIL="dave@smith.com" 28 | } 29 | 30 | git_dates() { 31 | # really simple - sets the hour only, so dates are ordered 32 | if [ -z "$1" ]; then 33 | echo "needs a param" 34 | exit 1 35 | fi 36 | export GIT_AUTHOR_DATE="2020-09-13T$1:00:00" 37 | export GIT_COMMITTER_DATE="2020-09-13T$1:00:00" 38 | } 39 | 40 | git init 41 | 42 | author_kate 43 | committer_jay 44 | 45 | git_dates "01" 46 | 47 | cat <<EOF >a.txt 48 | a 49 | a 50 | a 51 | a 52 | EOF 53 | 54 | cat <<EOF >z.txt 55 | z 56 | z 57 | z 58 | z 59 | EOF 60 | 61 | git add .
62 | git commit -am "initial commit" 63 | 64 | git_dates "02" 65 | 66 | git mv a.txt a1.txt 67 | git commit -am "rename a to a1" 68 | 69 | git_dates "03" 70 | 71 | author_dave 72 | committer_jay 73 | 74 | git checkout -b "dave_work" 75 | git mv a1.txt a2.txt 76 | echo "junk" >>a2.txt 77 | 78 | cat <<EOF >bb.txt 79 | b 80 | b 81 | b 82 | b 83 | EOF 84 | 85 | git rm z.txt 86 | 87 | git add . 88 | git commit -am "rename a1 to a2, add bb, kill z" 89 | 90 | git_dates "05" 91 | 92 | git mv bb.txt b.txt 93 | git mv a2.txt a.txt 94 | 95 | git commit -am "rename bb to b, a2 back to a" 96 | 97 | git checkout master 98 | 99 | git_dates "04" 100 | author_jay 101 | committer_jay 102 | 103 | git checkout -b "jay_work" 104 | 105 | git mv a1.txt aa.txt 106 | echo "junk!" >>aa.txt 107 | 108 | cat <<EOF >bee.txt 109 | B 110 | B 111 | B 112 | EOF 113 | 114 | git add . 115 | git commit -am "rename a1 to aa, add bee" 116 | 117 | git_dates "06" 118 | 119 | git mv bee.txt b.txt 120 | git mv aa.txt a.txt 121 | git add . 122 | git commit -m "rename bee to b, aa back to a" 123 | 124 | git checkout master 125 | 126 | git_dates "07" 127 | author_kate 128 | committer_jay 129 | 130 | git mv a1.txt a.txt 131 | git commit -m "rename a1 back to a prep merging" 132 | 133 | git merge jay_work -m "merging jay work" 134 | 135 | git_dates "08" 136 | 137 | git merge dave_work -m "merging dave work" || true # will fail! 138 | 139 | echo "fixing" 140 | 141 | git_dates "09" 142 | 143 | cat <<EOF >a.txt 144 | a 145 | a 146 | a 147 | a 148 | fixed 149 | EOF 150 | 151 | cat <<EOF >b.txt 152 | b 153 | b 154 | b 155 | b 156 | fixed 157 | EOF 158 | 159 | git commit -am "merging dave work with fixes" 160 | 161 | git_dates "10" 162 | 163 | cat <<EOF >z.txt 164 | z 165 | z 166 | z 167 | z 168 | fixed 169 | EOF 170 | 171 | git add z.txt 172 | 173 | git commit -m "restoring deleted z" 174 | 175 | cd .. 176 | 177 | if [[ -f "rename_complex.zip" ]]; then 178 | rm rename_complex.zip 179 | fi 180 | 181 | zip -r rename_complex.zip rename_complex 182 | -------------------------------------------------------------------------------- /src/polyglot_data.rs: -------------------------------------------------------------------------------- 1 | #![warn(clippy::all)] 2 | //! Data formats for JSON output from the scanner 3 | //! 4 | //! Data format should now follow semantic versioning - a major version change is incompatible, a minor version change is backward compatible, a patch version is mostly around bug fixes.
5 | 6 | use serde::Serialize; 7 | use uuid::Uuid; 8 | 9 | use crate::{ 10 | coupling::CouplingMetadata, flare::FlareTreeNode, git_user_dictionary::GitUserDictionary, 11 | FeatureFlags, 12 | }; 13 | 14 | pub static DATA_FILE_VERSION: &str = "1.0.4"; 15 | 16 | #[derive(Debug, Serialize)] 17 | pub struct GitMetadata { 18 | pub users: GitUserDictionary, 19 | } 20 | #[derive(Debug, Serialize, Default)] 21 | pub struct IndicatorMetadata { 22 | #[serde(skip_serializing_if = "Option::is_none")] 23 | pub git: Option<GitMetadata>, 24 | #[serde(skip_serializing_if = "Option::is_none")] 25 | pub coupling: Option<CouplingMetadata>, 26 | } 27 | 28 | #[derive(Debug, Serialize)] 29 | pub struct PolyglotData { 30 | version: String, 31 | name: String, 32 | id: String, 33 | tree: FlareTreeNode, 34 | metadata: IndicatorMetadata, 35 | features: FeatureFlags, 36 | } 37 | 38 | impl PolyglotData { 39 | pub fn new(name: &str, id: Option<&str>, tree: FlareTreeNode, features: FeatureFlags) -> Self { 40 | let id = id.map_or_else( 41 | || Uuid::new_v4().as_hyphenated().to_string(), 42 | std::string::ToString::to_string, 43 | ); 44 | PolyglotData { 45 | version: DATA_FILE_VERSION.to_string(), 46 | name: name.to_string(), 47 | id, 48 | tree, 49 | metadata: IndicatorMetadata::default(), 50 | features, 51 | } 52 | } 53 | pub fn tree(&self) -> &FlareTreeNode { 54 | &self.tree 55 | } 56 | pub fn tree_mut(&mut self) -> &mut FlareTreeNode { 57 | &mut self.tree 58 | } 59 | 60 | pub fn metadata(&mut self) -> &mut IndicatorMetadata { 61 | &mut self.metadata 62 | } 63 | } 64 | 65 | #[cfg(test)] 66 | mod test { 67 | use super::*; 68 | use pretty_assertions::assert_eq; 69 | #[test] 70 | fn can_build_data_tree() { 71 | let root = FlareTreeNode::dir("root"); 72 | let tree: PolyglotData = PolyglotData::new( 73 | "test", 74 | Some("test-id"), 75 | root.clone(), 76 | FeatureFlags::default(), 77 | ); 78 | 79 | let expected = PolyglotData { 80 | name: "test".to_string(), 81 | id: "test-id".to_string(), 82 | version: DATA_FILE_VERSION.to_string(), 83 | tree: root, 84 | metadata: IndicatorMetadata::default(), 85 | features: FeatureFlags::default(), 86 | }; 87 | 88 | assert_eq!(tree.name, expected.name); 89 | assert_eq!(tree.tree, expected.tree); 90 | } 91 | 92 | #[test] 93 | fn data_without_id_has_uuid() { 94 | let root = FlareTreeNode::dir("root"); 95 | let tree1: PolyglotData = 96 | PolyglotData::new("test", None, root.clone(), FeatureFlags::default()); 97 | let tree2: PolyglotData = PolyglotData::new("test", None, root, FeatureFlags::default()); 98 | // really just asserting IDs are different! 99 | assert_ne!(tree1.id, tree2.id); 100 | } 101 | 102 | // TODO: removed serializing metadata test as it no longer made sense. Do we depend on just e2e tests?
103 | } 104 | -------------------------------------------------------------------------------- /src/git_user_dictionary.rs: -------------------------------------------------------------------------------- 1 | #![warn(clippy::all)] 2 | use crate::git_logger::User; 3 | use serde::ser::SerializeSeq; 4 | use serde::{Serialize, Serializer}; 5 | use std::collections::HashMap; 6 | 7 | #[derive(Debug, Clone, Default)] 8 | pub struct GitUserDictionary { 9 | next_id: usize, 10 | lower_users: HashMap<User, usize>, 11 | users: Vec<User>, 12 | } 13 | 14 | impl GitUserDictionary { 15 | pub fn register(&mut self, user: &User) -> usize { 16 | let lower_user = user.as_lower_case(); 17 | match self.lower_users.get(&lower_user) { 18 | Some(id) => *id, 19 | None => { 20 | let result = self.next_id; 21 | self.lower_users.insert(lower_user, result); 22 | self.users.push(user.clone()); 23 | self.next_id += 1; 24 | result 25 | } 26 | } 27 | } 28 | #[cfg(test)] 29 | pub fn user_by_id(&self, user_id: usize) -> User { 30 | self.users 31 | .get(user_id) 32 | .expect("No user found matching ID!") 33 | .clone() 34 | } 35 | #[cfg(test)] 36 | pub fn user_count(&self) -> usize { 37 | self.next_id 38 | } 39 | #[cfg(test)] 40 | pub fn user_id(&self, user: &User) -> Option<&usize> { 41 | self.lower_users.get(&user.as_lower_case()) 42 | } 43 | } 44 | 45 | /// We store, rather redundantly, the user ID in the JSON, even though users are output as an array. 46 | /// This makes it easier for humans to correlate users with data without counting from 0 47 | /// It also will make it easier later to alias users to other users. 48 | #[derive(Debug, PartialEq, Serialize)] 49 | struct UserKey<'a> { 50 | id: usize, 51 | user: &'a User, 52 | } 53 | 54 | impl Serialize for GitUserDictionary { 55 | fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> 56 | where 57 | S: Serializer, 58 | { 59 | let mut seq = serializer.serialize_seq(Some(self.users.len()))?; 60 | for (id, user) in self.users.iter().enumerate() { 61 | seq.serialize_element(&UserKey { id, user })?; 62 | } 63 | seq.end() 64 | } 65 | } 66 | 67 | #[cfg(test)] 68 | mod test { 69 | use super::*; 70 | #[cfg(test)] 71 | use pretty_assertions::assert_eq; 72 | 73 | // use test_shared::*; 74 | 75 | #[test] 76 | fn users_receive_sequential_ids() { 77 | let mut dict = GitUserDictionary::default(); 78 | 79 | let jane = User::new(Some("Jane"), Some("JaneDoe@gmail.com")); 80 | let user0 = dict.register(&jane); 81 | assert_eq!(user0, 0); 82 | assert_eq!(dict.user_by_id(user0), jane); 83 | 84 | let user1 = dict.register(&User::new(Some("Jane"), None)); 85 | assert_eq!(user1, 1); 86 | let user0again = dict.register(&User::new(Some("Jane"), Some("JaneDoe@gmail.com"))); 87 | assert_eq!(user0again, 0); 88 | } 89 | 90 | #[test] 91 | fn user_checks_are_case_insensitive_and_return_first_seen_user() { 92 | let mut dict = GitUserDictionary::default(); 93 | 94 | let jane = User::new(Some("Jane"), Some("JaneDoe@gmail.com")); 95 | let lower_jane = User::new(Some("jane"), Some("janeDoe@gmail.com")); 96 | let user0 = dict.register(&jane); 97 | assert_eq!(user0, 0); 98 | // there is only one user!
99 | assert_eq!(dict.user_count(), 1); 100 | 101 | let user1 = dict.register(&lower_jane); 102 | assert_eq!(user1, 0); 103 | assert_eq!(dict.user_by_id(0), jane); 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /tests/expected/git/git_sample_by_filename.json: -------------------------------------------------------------------------------- 1 | { 2 | "simple/parent.clj": [ 3 | { 4 | "id": "0dbd54d4c524ecc776f381e660cce9b2dd92162c", 5 | "committer": { 6 | "name": "Korny Sietsma", 7 | "email": "korny@sietsma.com" 8 | }, 9 | "commit_time": 1558524371, 10 | "author": { 11 | "name": "Korny Sietsma", 12 | "email": "korny@sietsma.com" 13 | }, 14 | "author_time": 1558524371, 15 | "co_authors": [], 16 | "change": "Modify", 17 | "lines_added": 1, 18 | "lines_deleted": 0 19 | }, 20 | { 21 | "id": "a0ae9997cfdf49fd0cbf54dacc72c778af337519", 22 | "committer": { 23 | "name": "Korny Sietsma", 24 | "email": "korny@sietsma.com" 25 | }, 26 | "commit_time": 1558521550, 27 | "author": { 28 | "name": "Korny Sietsma", 29 | "email": "korny@sietsma.com" 30 | }, 31 | "author_time": 1558521550, 32 | "co_authors": [ 33 | { 34 | "name": "hermoine", 35 | "email": "hgranger@durmstrang.de" 36 | } 37 | ], 38 | "change": "Modify", 39 | "lines_added": 3, 40 | "lines_deleted": 1 41 | }, 42 | { 43 | "id": "ca239efb9b26db57ac9e2ec3e2df1c42578a46f8", 44 | "committer": { 45 | "name": "Korny Sietsma", 46 | "email": "korny@sietsma.com" 47 | }, 48 | "commit_time": 1558521386, 49 | "author": { 50 | "name": "Korny Sietsma", 51 | "email": "korny@sietsma.com" 52 | }, 53 | "author_time": 1558521386, 54 | "co_authors": [], 55 | "change": "Add", 56 | "lines_added": 4, 57 | "lines_deleted": 0 58 | } 59 | ], 60 | "simple/child/a_renamed.txt": [ 61 | { 62 | "id": "93ae0c7c7cd93b3c4ea1bf103bde4deafef798ad", 63 | "committer": { 64 | "name": "Korny Sietsma", 65 | "email": "korny@sietsma.com" 66 | }, 67 | "commit_time": 1558533240, 68 | "author": { 69 | "name": "Korny Sietsma", 70 | "email": "korny@sietsma.com" 71 | }, 72 | "author_time": 1558533240, 73 | "co_authors": [], 74 | "change": "Rename", 75 | "lines_added": 0, 76 | "lines_deleted": 0 77 | }, 78 | { 79 | "id": "a0ae9997cfdf49fd0cbf54dacc72c778af337519", 80 | "committer": { 81 | "name": "Korny Sietsma", 82 | "email": "korny@sietsma.com" 83 | }, 84 | "commit_time": 1558521550, 85 | "author": { 86 | "name": "Korny Sietsma", 87 | "email": "korny@sietsma.com" 88 | }, 89 | "author_time": 1558521550, 90 | "co_authors": [ 91 | { 92 | "name": "hermoine", 93 | "email": "hgranger@durmstrang.de" 94 | } 95 | ], 96 | "change": "Modify", 97 | "lines_added": 1, 98 | "lines_deleted": 1 99 | }, 100 | { 101 | "id": "cdf8709362c267198d04d47e55e66071fdd5f52b", 102 | "committer": { 103 | "name": "Korny Sietsma", 104 | "email": "korny@sietsma.com" 105 | }, 106 | "commit_time": 1558521648, 107 | "author": { 108 | "name": "Korny Sietsma", 109 | "email": "korny@sietsma.com" 110 | }, 111 | "author_time": 1558521648, 112 | "co_authors": [], 113 | "change": "Modify", 114 | "lines_added": 1, 115 | "lines_deleted": 2 116 | }, 117 | { 118 | "id": "ca239efb9b26db57ac9e2ec3e2df1c42578a46f8", 119 | "committer": { 120 | "name": "Korny Sietsma", 121 | "email": "korny@sietsma.com" 122 | }, 123 | "commit_time": 1558521386, 124 | "author": { 125 | "name": "Korny Sietsma", 126 | "email": "korny@sietsma.com" 127 | }, 128 | "author_time": 1558521386, 129 | "co_authors": [], 130 | "change": "Add", 131 | "lines_added": 2, 132 | "lines_deleted": 0 133 | } 134 | ] 135 | } 136 | 
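A quick illustration of the `GitUserDictionary` serializer above: ids are handed out in registration order, and the custom `Serialize` impl emits an array of `{id, user}` pairs - the shape visible under `metadata.git.users` in the expected-output fixtures. A sketch of an extra unit test along those lines (it would sit in the `test` module of `src/git_user_dictionary.rs`, which can see the crate-private types; the test name is hypothetical):

#[test]
fn serializes_users_as_id_user_pairs() {
    let mut dict = GitUserDictionary::default();
    dict.register(&User::new(Some("Korny Sietsma"), Some("korny@sietsma.com")));
    dict.register(&User::new(Some("hermoine"), Some("hgranger@durmstrang.de")));
    let json = serde_json::to_value(&dict).unwrap();
    // matches the "users" arrays in tests/expected/integration_tests/git_flare_test.json
    assert_eq!(json[0]["id"], 0);
    assert_eq!(json[1]["user"]["name"], "hermoine");
}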
-------------------------------------------------------------------------------- /src/loc.rs: -------------------------------------------------------------------------------- 1 | #![warn(clippy::all)] 2 | use crate::polyglot_data::IndicatorMetadata; 3 | 4 | use super::toxicity_indicator_calculator::ToxicityIndicatorCalculator; 5 | use anyhow::Error; 6 | use serde::Serialize; 7 | 8 | use content_inspector::{inspect, ContentType}; 9 | 10 | use std::fs::File; 11 | use std::io::Read; 12 | use std::path::{Path, PathBuf}; 13 | 14 | use tokei::{Config, LanguageType}; 15 | 16 | /// a struct representing tokei language data - based on `tokei::Stats` and `tokei::Languages::name` 17 | #[derive(Debug, PartialEq, Eq, Serialize, Clone)] 18 | pub struct LanguageLocData { 19 | /// Canonical language name 20 | pub language: String, 21 | /// binary files only have bytes not lines! 22 | pub binary: bool, 23 | /// Number of blank lines within the file. 24 | pub blanks: usize, 25 | /// Number of lines of code within the file. 26 | pub code: usize, 27 | /// Number of comments within the file. (_includes both multi line, and 28 | /// single line comments_) 29 | pub comments: usize, 30 | /// Total number of lines within the file. 31 | pub lines: usize, 32 | /// File size in bytes 33 | pub bytes: u64, 34 | } 35 | 36 | fn safe_extension(filename: &Path) -> String { 37 | match filename.extension() { 38 | Some(ext) => ext.to_string_lossy().to_string(), 39 | None => "no_extension".to_owned(), 40 | } 41 | } 42 | 43 | fn file_size(filename: &Path) -> Result<u64, Error> { 44 | Ok(filename.metadata()?.len()) 45 | } 46 | //TODO: should binary data have 'lines:0' or should it be 47 | // an explicit special case? 48 | impl LanguageLocData { 49 | fn from_binary(language_name: String, filename: &Path) -> Result<Self, Error> { 50 | Ok(LanguageLocData { 51 | language: language_name, 52 | binary: true, 53 | blanks: 0, 54 | code: 0, 55 | comments: 0, 56 | lines: 0, 57 | bytes: file_size(filename)?, 58 | }) 59 | } 60 | } 61 | 62 | const MAX_PEEK_SIZE: usize = 1024; 63 | 64 | fn file_content_type(filename: &Path) -> Result<ContentType, Error> { 65 | let file = File::open(filename)?; 66 | let mut buffer: Vec<u8> = vec![]; 67 | 68 | file.take(MAX_PEEK_SIZE as u64).read_to_end(&mut buffer)?; 69 | Ok(inspect(&buffer)) 70 | } 71 | 72 | fn parse_file(filename: &Path) -> Result<LanguageLocData, Error> { 73 | let config = Config::default(); 74 | let mut language_name = None; 75 | let language = match LanguageType::from_path(filename, &config) { 76 | Some(language) => language, 77 | None => { 78 | language_name = Some(safe_extension(filename)); 79 | if file_content_type(filename)?
== ContentType::BINARY { 80 | return LanguageLocData::from_binary(language_name.unwrap(), filename); 81 | } 82 | LanguageType::Text 83 | } 84 | }; 85 | let language_name = language_name.unwrap_or_else(|| language.name().to_string()); 86 | let report = language.parse(PathBuf::from(filename), &config); 87 | 88 | match report { 89 | Ok(report) => Ok(LanguageLocData { 90 | binary: false, 91 | blanks: report.stats.blanks, 92 | code: report.stats.code, 93 | comments: report.stats.comments, 94 | lines: report.stats.lines(), 95 | language: language_name, 96 | bytes: file_size(filename)?, 97 | }), 98 | Err((error, _pathbuf)) => Err(Error::from(error)), 99 | } 100 | } 101 | 102 | #[derive(Debug)] 103 | pub struct LocCalculator {} 104 | 105 | impl ToxicityIndicatorCalculator for LocCalculator { 106 | fn name(&self) -> String { 107 | "loc".to_string() 108 | } 109 | 110 | fn visit_node( 111 | &mut self, 112 | node: &mut crate::flare::FlareTreeNode, 113 | path: &Path, 114 | ) -> Result<(), Error> { 115 | if path.is_file() { 116 | let stats = parse_file(path)?; 117 | node.indicators_mut().loc = Some(stats); 118 | } 119 | Ok(()) 120 | } 121 | 122 | fn apply_metadata(&self, _metadata: &mut IndicatorMetadata) -> Result<(), Error> { 123 | Ok(()) 124 | } 125 | } 126 | 127 | #[cfg(test)] 128 | mod test { 129 | use super::*; 130 | 131 | #[test] 132 | fn can_get_loc_data_for_a_file() { 133 | let stats = parse_file(Path::new("./tests/data/simple/parent.clj")).unwrap(); 134 | assert_eq!(stats.code, 3); 135 | assert_eq!(stats.language, "Clojure"); 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /tests/expected/git/git_sample.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": "93ae0c7c7cd93b3c4ea1bf103bde4deafef798ad", 4 | "summary": "renaming", 5 | "parents": ["0dbd54d4c524ecc776f381e660cce9b2dd92162c"], 6 | "committer": { 7 | "name": "Korny Sietsma", 8 | "email": "korny@sietsma.com" 9 | }, 10 | "commit_time": 1558533240, 11 | "author": { 12 | "name": "Korny Sietsma", 13 | "email": "korny@sietsma.com" 14 | }, 15 | "author_time": 1558533240, 16 | "co_authors": [], 17 | "file_changes": [ 18 | { 19 | "file": "simple/child/a_renamed.txt", 20 | "old_file": "simple/child/a.txt", 21 | "change": "Rename", 22 | "lines_added": 0, 23 | "lines_deleted": 0 24 | } 25 | ] 26 | }, 27 | { 28 | "id": "0dbd54d4c524ecc776f381e660cce9b2dd92162c", 29 | "summary": "just changed parent.clj", 30 | "parents": ["1ef4f2a8301699964d7bb397d3f2e86d8d9776b3"], 31 | "committer": { 32 | "name": "Korny Sietsma", 33 | "email": "korny@sietsma.com" 34 | }, 35 | "commit_time": 1558524371, 36 | "author": { 37 | "name": "Korny Sietsma", 38 | "email": "korny@sietsma.com" 39 | }, 40 | "author_time": 1558524371, 41 | "co_authors": [], 42 | "file_changes": [ 43 | { 44 | "file": "simple/parent.clj", 45 | "old_file": null, 46 | "change": "Modify", 47 | "lines_added": 1, 48 | "lines_deleted": 0 49 | } 50 | ] 51 | }, 52 | { 53 | "id": "1ef4f2a8301699964d7bb397d3f2e86d8d9776b3", 54 | "summary": "Merge branch 'fiddling'", 55 | "parents": [ 56 | "cdf8709362c267198d04d47e55e66071fdd5f52b", 57 | "a0ae9997cfdf49fd0cbf54dacc72c778af337519" 58 | ], 59 | "committer": { 60 | "name": "Korny Sietsma", 61 | "email": "korny@sietsma.com" 62 | }, 63 | "commit_time": 1558521695, 64 | "author": { 65 | "name": "Korny Sietsma", 66 | "email": "korny@sietsma.com" 67 | }, 68 | "author_time": 1558521695, 69 | "co_authors": [], 70 | "file_changes": [] 71 | }, 72 | { 73 | "id": 
"a0ae9997cfdf49fd0cbf54dacc72c778af337519", 74 | "summary": "made some changes with a bigger comment", 75 | "parents": ["ca239efb9b26db57ac9e2ec3e2df1c42578a46f8"], 76 | "committer": { 77 | "name": "Korny Sietsma", 78 | "email": "korny@sietsma.com" 79 | }, 80 | "commit_time": 1558521550, 81 | "author": { 82 | "name": "Korny Sietsma", 83 | "email": "korny@sietsma.com" 84 | }, 85 | "author_time": 1558521550, 86 | "co_authors": [ 87 | { 88 | "name": "hermoine", 89 | "email": "hgranger@durmstrang.de" 90 | } 91 | ], 92 | "file_changes": [ 93 | { 94 | "file": "simple/child/a.txt", 95 | "old_file": null, 96 | "change": "Modify", 97 | "lines_added": 1, 98 | "lines_deleted": 1 99 | }, 100 | { 101 | "file": "simple/parent.clj", 102 | "old_file": null, 103 | "change": "Modify", 104 | "lines_added": 3, 105 | "lines_deleted": 1 106 | } 107 | ] 108 | }, 109 | { 110 | "id": "cdf8709362c267198d04d47e55e66071fdd5f52b", 111 | "summary": "removed excess line", 112 | "parents": ["ca239efb9b26db57ac9e2ec3e2df1c42578a46f8"], 113 | "committer": { 114 | "name": "Korny Sietsma", 115 | "email": "korny@sietsma.com" 116 | }, 117 | "commit_time": 1558521648, 118 | "author": { 119 | "name": "Korny Sietsma", 120 | "email": "korny@sietsma.com" 121 | }, 122 | "author_time": 1558521648, 123 | "co_authors": [], 124 | "file_changes": [ 125 | { 126 | "file": "simple/child/a.txt", 127 | "old_file": null, 128 | "change": "Modify", 129 | "lines_added": 1, 130 | "lines_deleted": 2 131 | } 132 | ] 133 | }, 134 | { 135 | "id": "ca239efb9b26db57ac9e2ec3e2df1c42578a46f8", 136 | "summary": "first commit", 137 | "parents": [], 138 | "committer": { 139 | "name": "Korny Sietsma", 140 | "email": "korny@sietsma.com" 141 | }, 142 | "commit_time": 1558521386, 143 | "author": { 144 | "name": "Korny Sietsma", 145 | "email": "korny@sietsma.com" 146 | }, 147 | "author_time": 1558521386, 148 | "co_authors": [], 149 | "file_changes": [ 150 | { 151 | "file": "simple/child/a.txt", 152 | "old_file": null, 153 | "change": "Add", 154 | "lines_added": 2, 155 | "lines_deleted": 0 156 | }, 157 | { 158 | "file": "simple/parent.clj", 159 | "old_file": null, 160 | "change": "Add", 161 | "lines_added": 4, 162 | "lines_deleted": 0 163 | } 164 | ] 165 | } 166 | ] 167 | -------------------------------------------------------------------------------- /src/code_line_data.rs: -------------------------------------------------------------------------------- 1 | use std::{fs::File, io::Read, path::PathBuf}; 2 | 3 | use anyhow::Error; 4 | use encoding_rs_io::DecodeReaderBytesBuilder; 5 | use tokei::CodeStats; 6 | 7 | #[derive(Clone, Debug, PartialEq, Eq)] 8 | pub struct CodeLineData { 9 | pub spaces: u32, 10 | pub tabs: u32, 11 | pub text: u32, 12 | } 13 | 14 | impl CodeLineData { 15 | fn new(line: &[u8]) -> Self { 16 | let mut spaces: u32 = 0; 17 | let mut tabs: u32 = 0; 18 | let mut text: Option = None; 19 | for ix in 0..line.len() { 20 | let c = line[ix]; 21 | if c == b' ' { 22 | spaces += 1; 23 | } else if c == b'\t' { 24 | tabs += 1; 25 | } else { 26 | text = Some( 27 | String::from_utf8_lossy(&line[ix..line.len()]) 28 | .trim() 29 | .chars() 30 | .count(), 31 | ); 32 | break; 33 | } 34 | } 35 | 36 | CodeLineData { 37 | spaces, 38 | tabs, 39 | text: text.unwrap_or(0) as u32, 40 | } 41 | } 42 | } 43 | 44 | #[derive(Clone, Debug, PartialEq, Eq)] 45 | pub struct CodeLines { 46 | pub lines: Vec, 47 | } 48 | 49 | impl CodeLines { 50 | pub fn from_stats(stats: &CodeStats) -> Self { 51 | CodeLines { 52 | lines: stats 53 | .code_lines 54 | .iter() 55 | .map(|line| 
CodeLineData::new(line)) 56 | .collect(), 57 | } 58 | } 59 | pub fn new(path: &PathBuf) -> Result { 60 | let text: Vec> = { 61 | let f = match File::open(path) { 62 | Ok(f) => f, 63 | Err(e) => return Err(anyhow!("error opening file {:?} - {}", &path, e)), 64 | }; 65 | let mut s = Vec::new(); 66 | let mut reader = DecodeReaderBytesBuilder::new().build(f); 67 | reader.read_to_end(&mut s)?; 68 | 69 | s.split(|b| *b == b'\n').map(Vec::from).collect() 70 | }; 71 | Ok(CodeLines { 72 | lines: text.iter().map(|line| CodeLineData::new(line)).collect(), 73 | }) 74 | } 75 | } 76 | 77 | #[cfg(test)] 78 | mod tests { 79 | use super::*; 80 | use tokei::{Config, LanguageType}; 81 | 82 | #[test] 83 | pub fn can_process_tabs_and_spaces() { 84 | let data = CodeLineData::new(" \t \t foo".as_bytes()); 85 | assert_eq!( 86 | data, 87 | CodeLineData { 88 | spaces: 3, 89 | tabs: 2, 90 | text: 3 91 | } 92 | ); 93 | } 94 | 95 | #[test] 96 | pub fn can_process_unicode() { 97 | let data = CodeLineData::new("①②③④⑤⑥⑦⑧⑨⑩".as_bytes()); 98 | assert_eq!( 99 | data, 100 | CodeLineData { 101 | spaces: 0, 102 | tabs: 0, 103 | text: 10 104 | } 105 | ); 106 | } 107 | 108 | #[test] 109 | pub fn can_parse_source_code() { 110 | let code = r#"function foo☃() { 111 | 112 | blah; 113 | 114 | // comment 115 | } 116 | /* longer comment 117 | with blanks 118 | 119 | yow 120 | */ 121 | foo();"#; 122 | let stats: CodeStats = LanguageType::JavaScript.parse_from_str(code, &Config::default()); 123 | 124 | // eprintln!("Stats: {:?}", stats); 125 | // let printable_lines: Vec<_> = stats 126 | // .code_lines 127 | // .iter() 128 | // .map(|l| String::from_utf8_lossy(l)) 129 | // .collect(); 130 | // eprintln!("Code lines: {:?}", printable_lines); 131 | 132 | let result: CodeLines = CodeLines::from_stats(&stats); 133 | 134 | let mut expected = vec![ 135 | CodeLineData { 136 | spaces: 0, 137 | tabs: 0, 138 | text: 17, 139 | }, 140 | CodeLineData { 141 | spaces: 4, 142 | tabs: 0, 143 | text: 5, 144 | }, 145 | CodeLineData { 146 | spaces: 0, 147 | tabs: 0, 148 | text: 1, 149 | }, 150 | CodeLineData { 151 | spaces: 0, 152 | tabs: 0, 153 | text: 6, 154 | }, 155 | CodeLineData { 156 | spaces: 0, 157 | tabs: 0, 158 | text: 0, 159 | }, 160 | CodeLineData { 161 | spaces: 0, 162 | tabs: 0, 163 | text: 0, 164 | }, 165 | ]; 166 | expected.sort_by(|a, b| a.text.partial_cmp(&b.text).unwrap()); 167 | 168 | let mut actual = result.lines; 169 | 170 | actual.sort_by(|a, b| a.text.partial_cmp(&b.text).unwrap()); 171 | assert_eq!(actual, expected); 172 | } 173 | } 174 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![forbid(unsafe_code)] 2 | #![warn(clippy::all)] 3 | #![warn(rust_2018_idioms)] 4 | #![warn(clippy::pedantic)] 5 | // pedantic is just a bit too keen for me! But still useful. 
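The tests above pin down how `CodeLineData::new` classifies a raw line; as an illustrative, standalone sketch of the same counting rule (a restatement for clarity, not the module itself):

~~~rust
// Illustrative sketch: a standalone restatement of the counting rule used by
// CodeLineData::new above - leading spaces and tabs are tallied, and the rest
// of the line is measured in trimmed Unicode characters, not bytes.
fn classify(line: &[u8]) -> (u32, u32, u32) {
    let mut spaces = 0;
    let mut tabs = 0;
    for (ix, c) in line.iter().enumerate() {
        match *c {
            b' ' => spaces += 1,
            b'\t' => tabs += 1,
            _ => {
                let text = String::from_utf8_lossy(&line[ix..]).trim().chars().count() as u32;
                return (spaces, tabs, text);
            }
        }
    }
    (spaces, tabs, 0) // whitespace-only line: no text
}

fn main() {
    // mirrors the `can_process_tabs_and_spaces` test above
    assert_eq!(classify(" \t \t foo".as_bytes()), (3, 2, 3));
}
~~~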
6 | #![allow(clippy::module_name_repetitions)] 7 | #![allow(clippy::cast_possible_truncation)] 8 | #![allow(clippy::cast_precision_loss)] 9 | #![allow(clippy::cast_sign_loss)] 10 | #![allow(clippy::cast_lossless)] 11 | #![allow(clippy::missing_errors_doc)] 12 | #![allow(clippy::similar_names)] 13 | #![allow(clippy::cast_possible_wrap)] 14 | #![allow(clippy::redundant_else)] 15 | #![allow(clippy::single_match_else)] 16 | 17 | #[macro_use] 18 | extern crate anyhow; 19 | #[macro_use] 20 | extern crate log; 21 | #[macro_use] 22 | extern crate lazy_static; 23 | #[macro_use] 24 | extern crate derive_builder; 25 | #[macro_use] 26 | extern crate derive_getters; 27 | 28 | use anyhow::{Context, Error}; 29 | use file_stats::FileStatsCalculator; 30 | use postprocessing::postprocess_tree; 31 | use serde::Serialize; 32 | use std::io; 33 | use std::path::Path; 34 | 35 | mod code_line_data; 36 | // pub mod coupling; 37 | mod file_walker; 38 | // public so main.rs can access structures TODO: can this be done better? expose here just what main needs? 39 | pub mod coupling; 40 | mod file_stats; 41 | mod flare; 42 | mod git; 43 | mod git_file_future; 44 | mod git_user_dictionary; 45 | mod indentation; 46 | mod loc; 47 | mod polyglot_data; 48 | mod postprocessing; 49 | mod toxicity_indicator_calculator; 50 | 51 | mod git_file_history; 52 | mod git_logger; 53 | 54 | use crate::coupling::CouplingConfig; 55 | use git::GitCalculator; 56 | use git_logger::GitLogConfig; 57 | use indentation::IndentationCalculator; 58 | use loc::LocCalculator; 59 | use toxicity_indicator_calculator::ToxicityIndicatorCalculator; 60 | 61 | #[allow(clippy::struct_excessive_bools)] 62 | #[derive(Debug, Default, Clone, Serialize)] 63 | pub struct FeatureFlags { 64 | pub git: bool, 65 | pub coupling: bool, 66 | pub git_details: bool, 67 | pub file_stats: bool, 68 | } 69 | 70 | // general config for the scanner and calculators - could be split if it grows too far 71 | pub struct ScannerConfig { 72 | pub git_years: Option<u64>, 73 | pub follow_symlinks: bool, 74 | pub name: String, 75 | pub data_id: Option<String>, 76 | pub features: FeatureFlags, 77 | } 78 | 79 | impl ScannerConfig { 80 | #[must_use] 81 | pub fn default(name: &str) -> Self { 82 | ScannerConfig { 83 | git_years: None, 84 | follow_symlinks: false, 85 | name: name.to_owned(), 86 | data_id: None, 87 | features: FeatureFlags::default(), 88 | } 89 | } 90 | } 91 | 92 | #[must_use] 93 | pub fn named_toxicity_indicator_calculator( 94 | name: &str, 95 | config: &ScannerConfig, 96 | ) -> Option<Box<dyn ToxicityIndicatorCalculator>> { 97 | match name { 98 | "loc" => Some(Box::new(LocCalculator {})), 99 | "git" => Some(Box::new(GitCalculator::new( 100 | GitLogConfig::default() 101 | .include_merges(true) 102 | .since_years(config.git_years), 103 | ))), 104 | "indentation" => Some(Box::new(IndentationCalculator {})), 105 | "file_stats" => Some(Box::new(FileStatsCalculator {})), 106 | _ => None, 107 | } 108 | } 109 | 110 | pub fn run<W>( 111 | root: &Path, 112 | config: &ScannerConfig, 113 | coupling_config: Option<CouplingConfig>, 114 | toxicity_indicator_calculator_names: &[&str], 115 | out: W, 116 | ) -> Result<(), Error> 117 | where 118 | W: io::Write, 119 | { 120 | if toxicity_indicator_calculator_names.contains(&"git") && !config.features.git { 121 | bail!("Logic error - using git calculator when git is disabled!"); 122 | } 123 | if toxicity_indicator_calculator_names.contains(&"file_stats") && !config.features.file_stats { 124 | bail!("Logic error - using file_stats calculator when file_stats is disabled!"); 125 | } 126 | let maybe_tics: Option<Vec<Box<dyn ToxicityIndicatorCalculator>>> = 
toxicity_indicator_calculator_names 127 | .iter() 128 | .map(|name| named_toxicity_indicator_calculator(name, config)) 129 | .collect(); 130 | 131 | let mut tics = maybe_tics.expect("Some toxicity indicator calculator names don't exist!"); 132 | 133 | info!("Walking directory tree"); 134 | let mut polyglot_data = file_walker::walk_directory( 135 | root, 136 | &config.name, 137 | config.data_id.as_deref(), 138 | config.follow_symlinks, 139 | &mut tics, 140 | &config.features, 141 | )?; 142 | 143 | info!("adding metadata"); 144 | for tic in tics { 145 | tic.apply_metadata(polyglot_data.metadata()) 146 | .with_context(|| format!("applying metadata for {}", tic.name()))?; 147 | } 148 | 149 | if let Some(cc) = coupling_config { 150 | // TODO: fix this to take the data 151 | info!("gathering coupling"); 152 | coupling::gather_coupling(&mut polyglot_data, cc)?; 153 | } 154 | 155 | info!("postprocessing tree"); 156 | // TODO: fix this to take the data 157 | postprocess_tree(polyglot_data.tree_mut(), config)?; 158 | 159 | info!("saving as JSON"); 160 | serde_json::to_writer(out, &polyglot_data)?; 161 | Ok(()) 162 | } 163 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Polyglot Code Scanner 2 | 3 | This is part of my Polyglot Code tools - for the main documentation, see <https://polyglot.korny.info> 4 | 5 | ## A note about releases 6 | 7 | Binary releases are working again - see <https://github.com/kornysietsma/polyglot-code-scanner/releases> for binary releases. 8 | 9 | However, for M1 Macs this won't work - GitHub Actions doesn't yet support M1 Macs for free, so you'll have to build binaries yourself for now. 10 | 11 | For Macs you also need to run `xattr -d com.apple.quarantine polyglot-code-scanner-x86_64-macos` to remove the quarantine that OSX adds to all downloaded binaries. 12 | 13 | ## Intro 14 | 15 | This application scans source code directories, identifying a range of code metrics and other data, and storing the results in a JSON file for later visualisation by the [Polyglot Code Explorer](https://polyglot.korny.info/tools/explorer/description/) 16 | 17 | ## Installation and running 18 | 19 | See also <https://polyglot.korny.info> for more detailed instructions for building binary releases, and running the scanner. 20 | 21 | To compile and run from source, you'll need [to install Rust and Cargo](https://www.rust-lang.org/tools/install) and then from a copy of this project, you can build a binary package with: 22 | 23 | ~~~sh 24 | cargo build --release 25 | ~~~ 26 | 27 | The binary will be built in the `target/release` directory. 28 | 29 | ### Running from source 30 | 31 | You can also just run it from the source directory with `cargo run -- (other command line arguments)` - this will be slower as it runs un-optimised code with more debug information. But it's a lot faster for development. 32 | 33 | ### Getting help 34 | 35 | See <https://polyglot.korny.info> for the main documentation for this project. 36 | 37 | You can get up-to-date command-line help by running 38 | 39 | ~~~sh 40 | polyglot_code_scanner -h 41 | ~~~ 42 | 43 | ## Ignoring files 44 | 45 | Git ignored files in `.gitignore` are not scanned. 
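As a hedged illustration (hypothetical patterns, not from this repository), an ignore file for the scanner might contain:

~~~text
# gitignore-style patterns
**/generated/
**/*.min.js
~~~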
46 | 47 | You can also manually add `.polyglot_code_scanner_ignore` files anywhere in the codebase, to list extra files to be ignored - the syntax is [the same as .gitignore's](https://git-scm.com/docs/gitignore) 48 | 49 | ## Usage 50 | 51 | Run `polyglot_code_scanner -h` for full options; these are just the main ones: 52 | 53 | ~~~text 54 | USAGE: 55 | polyglot_code_scanner [OPTIONS] --name <NAME> [ROOT] 56 | 57 | ARGS: 58 | <ROOT> Root directory, current dir if not present 59 | 60 | OPTIONS: 61 | -h, --help 62 | Print help information 63 | 64 | -n, --name <NAME> 65 | project name - identifies the selected data for display and state storage 66 | 67 | --id <ID> 68 | data file ID - used to identify unique data files for browser storage, generates a UUID 69 | if not specified 70 | 71 | -o, --output <OUTPUT> 72 | Output file, stdout if not present, or not used if sending to web server 73 | 74 | --no-git 75 | Do not scan for git repositories 76 | 77 | --years <GIT_YEARS> 78 | how many years of git history to parse - default only scan the last 3 years (from now, 79 | not git head) [default: 3] 80 | 81 | -c, --coupling 82 | include temporal coupling data 83 | 84 | -V, --version 85 | Print version information 86 | 87 | ~~~ 88 | 89 | ## Development notes 90 | 91 | See also the `DesignDecisions.md` file. 92 | 93 | ### Running tests 94 | 95 | To run a single named test from the command-line: 96 | 97 | ~~~sh 98 | cargo test -- --nocapture renames_and_deletes_applied_across_history 99 | ~~~ 100 | 101 | The `--nocapture` flag tells Rust not to capture stdout/stderr - so you can add `println!` and `eprintln!` statements to help you. 102 | 103 | To remove some extra noise and blank lines, pipe the output through grep: 104 | 105 | ~~~sh 106 | cargo test -- --nocapture renames_and_deletes_applied_across_history | grep -v "running 0 tests" | grep -v "0 passed" | grep -v -e '^\s*$' 107 | ~~~ 108 | 109 | ### Showing logs 110 | 111 | Rust tests don't install a logger - normally you explicitly install loggers in your `main`, which tests don't use. 112 | 113 | To install a logger using the `fern` crate, add the following to tests: 114 | 115 | ~~~rust 116 | use test_shared::*; 117 | ~~~ 118 | 119 | then 120 | 121 | ~~~rust 122 | install_test_logger(); 123 | ~~~ 124 | 125 | This sets up a simple logger which sends logs to stdout - make sure you also use the `--nocapture` parameter mentioned earlier. 
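Putting the two snippets together, a minimal test might look like this sketch (assuming the `log` crate's macros, as used elsewhere in this codebase):

~~~rust
use test_shared::*;

#[test]
fn demonstrates_logging() {
    install_test_logger();
    log::info!("this only appears when running with --nocapture");
}
~~~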
126 | 127 | ### Pretty test output 128 | 129 | If you want better assertions, your tests need to explicitly use the `pretty_assertions` crate: 130 | 131 | ~~~rust 132 | use pretty_assertions::assert_eq; 133 | ~~~ 134 | 135 | ## Releasing new versions 136 | 137 | Releasing uses [cargo-release](https://crates.io/crates/cargo-release) 138 | 139 | The basic process is: 140 | 141 | * update the top CHANGELOG.md entry (under 'unreleased') 142 | * commit and push changes 143 | * release 144 | 145 | ~~~sh 146 | cargo release --dry-run 147 | ~~~ 148 | 149 | or for a minor change 0.1.3 to 0.2.0 : 150 | 151 | ~~~sh 152 | cargo release minor --dry-run 153 | ~~~ 154 | 155 | ## License 156 | 157 | Copyright © 2019-2022 Kornelis Sietsma 158 | 159 | Licensed under the Apache License, Version 2.0 - see LICENSE.txt for details 160 | -------------------------------------------------------------------------------- /tests/expected/git/git_sample_with_merges.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": "93ae0c7c7cd93b3c4ea1bf103bde4deafef798ad", 4 | "summary": "renaming", 5 | "parents": ["0dbd54d4c524ecc776f381e660cce9b2dd92162c"], 6 | "committer": { 7 | "name": "Korny Sietsma", 8 | "email": "korny@sietsma.com" 9 | }, 10 | "commit_time": 1558533240, 11 | "author": { 12 | "name": "Korny Sietsma", 13 | "email": "korny@sietsma.com" 14 | }, 15 | "author_time": 1558533240, 16 | "co_authors": [], 17 | "file_changes": [ 18 | { 19 | "file": "simple/child/a_renamed.txt", 20 | "old_file": "simple/child/a.txt", 21 | "change": "Rename", 22 | "lines_added": 0, 23 | "lines_deleted": 0 24 | } 25 | ] 26 | }, 27 | { 28 | "id": "0dbd54d4c524ecc776f381e660cce9b2dd92162c", 29 | "summary": "just changed parent.clj", 30 | "parents": ["1ef4f2a8301699964d7bb397d3f2e86d8d9776b3"], 31 | "committer": { 32 | "name": "Korny Sietsma", 33 | "email": "korny@sietsma.com" 34 | }, 35 | "commit_time": 1558524371, 36 | "author": { 37 | "name": "Korny Sietsma", 38 | "email": "korny@sietsma.com" 39 | }, 40 | "author_time": 1558524371, 41 | "co_authors": [], 42 | "file_changes": [ 43 | { 44 | "file": "simple/parent.clj", 45 | "old_file": null, 46 | "change": "Modify", 47 | "lines_added": 1, 48 | "lines_deleted": 0 49 | } 50 | ] 51 | }, 52 | { 53 | "id": "1ef4f2a8301699964d7bb397d3f2e86d8d9776b3", 54 | "summary": "Merge branch 'fiddling'", 55 | "parents": [ 56 | "cdf8709362c267198d04d47e55e66071fdd5f52b", 57 | "a0ae9997cfdf49fd0cbf54dacc72c778af337519" 58 | ], 59 | "committer": { 60 | "name": "Korny Sietsma", 61 | "email": "korny@sietsma.com" 62 | }, 63 | "commit_time": 1558521695, 64 | "author": { 65 | "name": "Korny Sietsma", 66 | "email": "korny@sietsma.com" 67 | }, 68 | "author_time": 1558521695, 69 | "co_authors": [], 70 | "file_changes": [ 71 | { 72 | "file": "simple/child/a.txt", 73 | "old_file": null, 74 | "change": "Modify", 75 | "lines_added": 3, 76 | "lines_deleted": 1 77 | }, 78 | { 79 | "file": "simple/parent.clj", 80 | "old_file": null, 81 | "change": "Modify", 82 | "lines_added": 3, 83 | "lines_deleted": 1 84 | }, 85 | { 86 | "file": "simple/child/a.txt", 87 | "old_file": null, 88 | "change": "Modify", 89 | "lines_added": 1, 90 | "lines_deleted": 0 91 | } 92 | ] 93 | }, 94 | { 95 | "id": "a0ae9997cfdf49fd0cbf54dacc72c778af337519", 96 | "summary": "made some changes with a bigger comment", 97 | "parents": ["ca239efb9b26db57ac9e2ec3e2df1c42578a46f8"], 98 | "committer": { 99 | "name": "Korny Sietsma", 100 | "email": "korny@sietsma.com" 101 | }, 102 | "commit_time": 1558521550, 
103 | "author": { 104 | "name": "Korny Sietsma", 105 | "email": "korny@sietsma.com" 106 | }, 107 | "author_time": 1558521550, 108 | "co_authors": [ 109 | { 110 | "name": "hermoine", 111 | "email": "hgranger@durmstrang.de" 112 | } 113 | ], 114 | "file_changes": [ 115 | { 116 | "file": "simple/child/a.txt", 117 | "old_file": null, 118 | "change": "Modify", 119 | "lines_added": 1, 120 | "lines_deleted": 1 121 | }, 122 | { 123 | "file": "simple/parent.clj", 124 | "old_file": null, 125 | "change": "Modify", 126 | "lines_added": 3, 127 | "lines_deleted": 1 128 | } 129 | ] 130 | }, 131 | 132 | { 133 | "id": "cdf8709362c267198d04d47e55e66071fdd5f52b", 134 | "summary": "removed excess line", 135 | "parents": ["ca239efb9b26db57ac9e2ec3e2df1c42578a46f8"], 136 | "committer": { 137 | "name": "Korny Sietsma", 138 | "email": "korny@sietsma.com" 139 | }, 140 | "commit_time": 1558521648, 141 | "author": { 142 | "name": "Korny Sietsma", 143 | "email": "korny@sietsma.com" 144 | }, 145 | "author_time": 1558521648, 146 | "co_authors": [], 147 | "file_changes": [ 148 | { 149 | "file": "simple/child/a.txt", 150 | "old_file": null, 151 | "change": "Modify", 152 | "lines_added": 1, 153 | "lines_deleted": 2 154 | } 155 | ] 156 | }, 157 | { 158 | "id": "ca239efb9b26db57ac9e2ec3e2df1c42578a46f8", 159 | "summary": "first commit", 160 | "parents": [], 161 | "committer": { 162 | "name": "Korny Sietsma", 163 | "email": "korny@sietsma.com" 164 | }, 165 | "commit_time": 1558521386, 166 | "author": { 167 | "name": "Korny Sietsma", 168 | "email": "korny@sietsma.com" 169 | }, 170 | "author_time": 1558521386, 171 | "co_authors": [], 172 | "file_changes": [ 173 | { 174 | "file": "simple/child/a.txt", 175 | "old_file": null, 176 | "change": "Add", 177 | "lines_added": 2, 178 | "lines_deleted": 0 179 | }, 180 | { 181 | "file": "simple/parent.clj", 182 | "old_file": null, 183 | "change": "Add", 184 | "lines_added": 4, 185 | "lines_deleted": 0 186 | } 187 | ] 188 | } 189 | ] 190 | -------------------------------------------------------------------------------- /src/indentation.rs: -------------------------------------------------------------------------------- 1 | use crate::flare::FlareTreeNode; 2 | use crate::polyglot_data::IndicatorMetadata; 3 | 4 | use super::toxicity_indicator_calculator::ToxicityIndicatorCalculator; 5 | use anyhow::{Context, Error}; 6 | use serde::Serialize; 7 | 8 | use content_inspector::{inspect, ContentType}; 9 | 10 | use std::fs::File; 11 | use std::io::Read; 12 | use std::path::{Path, PathBuf}; 13 | 14 | use tokei::{Config, LanguageType}; 15 | 16 | use super::code_line_data::CodeLines; 17 | 18 | use hdrhistogram::Histogram; 19 | 20 | /// a struct representing file indentation data 21 | #[derive(Debug, PartialEq, Serialize, Clone)] 22 | pub struct IndentationData { 23 | pub lines: u64, 24 | pub minimum: u64, 25 | pub maximum: u64, 26 | pub median: u64, 27 | pub stddev: f64, 28 | pub p75: u64, 29 | pub p90: u64, 30 | pub p99: u64, 31 | /// the sum of indentations - probably best measure according to [HGH08] 32 | pub sum: u64, 33 | } 34 | 35 | impl IndentationData { 36 | fn new(code_lines: CodeLines) -> Option { 37 | // we used to have this - reinstate if creating histogram for every file is too slow. But who knows, file I/O might be much bigger. 38 | // lazy_static! 
{ 39 | // static ref HISTOGRAM: Mutex<Histogram<u64>> = 40 | // Mutex::new(Histogram::<u64>::new(3).unwrap()); 41 | // } 42 | let mut histogram = Histogram::<u64>::new(3).expect("Can't create histogram"); 43 | let mut sum: u64 = 0; 44 | for line in code_lines.lines { 45 | if line.text > 0 { 46 | let indentation = line.spaces + line.tabs * 4; 47 | histogram 48 | .record(indentation as u64) 49 | .expect("Invalid histogram value!"); 50 | sum += indentation as u64; 51 | } 52 | } 53 | if histogram.is_empty() { 54 | None 55 | } else { 56 | Some(IndentationData { 57 | lines: histogram.len(), 58 | minimum: histogram.low(), 59 | maximum: histogram.high(), 60 | median: histogram.value_at_quantile(0.5), 61 | stddev: histogram.stdev(), 62 | p75: histogram.value_at_quantile(0.75), 63 | p90: histogram.value_at_quantile(0.90), 64 | p99: histogram.value_at_quantile(0.99), 65 | sum, 66 | }) 67 | } 68 | } 69 | } 70 | 71 | // TODO: remove duplication with loc.rs 72 | const MAX_PEEK_SIZE: usize = 1024; 73 | 74 | fn file_content_type(filename: &Path) -> Result<ContentType, Error> { 75 | let file = File::open(filename)?; 76 | let mut buffer: Vec<u8> = vec![]; 77 | 78 | file.take(MAX_PEEK_SIZE as u64).read_to_end(&mut buffer)?; 79 | Ok(inspect(&buffer)) 80 | } 81 | 82 | fn parse_file(filename: &Path) -> Result<Option<IndentationData>, Error> { 83 | let config = Config::default(); 84 | let code_lines = match LanguageType::from_path(filename, &config) { 85 | Some(language) => { 86 | let report = language 87 | .parse(PathBuf::from(filename), &config) 88 | .map_err(|(error, _pathbuf)| error); 89 | CodeLines::from_stats(&report?.stats) 90 | } 91 | None => { 92 | if file_content_type(filename)? == ContentType::BINARY { 93 | return Ok(None); 94 | } 95 | debug!("Unknown language in {:?} - treating as text", filename); 96 | CodeLines::new(&PathBuf::from(filename))? 
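// Worked example (illustrative comment, not part of the original source): for a line
// "\t  foo", `CodeLineData` reports tabs = 1, spaces = 2 and text = 3, so the code in
// `IndentationData::new` above records an indentation of 2 + 1 * 4 = 6 for that line;
// blank lines (text == 0) are skipped, and `sum` accumulates these per-line values.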
97 | } 98 | }; 99 | Ok(IndentationData::new(code_lines)) 100 | } 101 | 102 | #[derive(Debug)] 103 | pub struct IndentationCalculator {} 104 | 105 | impl ToxicityIndicatorCalculator for IndentationCalculator { 106 | fn name(&self) -> String { 107 | "indentation".to_string() 108 | } 109 | 110 | fn visit_node(&mut self, node: &mut FlareTreeNode, path: &Path) -> Result<(), Error> { 111 | if path.is_file() { 112 | let indentation = 113 | parse_file(path).with_context(|| format!("parsing indentation for {:?}", path))?; 114 | node.indicators_mut().indentation = indentation; 115 | } 116 | Ok(()) 117 | } 118 | 119 | fn apply_metadata(&self, _metadata: &mut IndicatorMetadata) -> Result<(), Error> { 120 | Ok(()) 121 | } 122 | } 123 | 124 | #[cfg(test)] 125 | mod test { 126 | use super::*; 127 | 128 | #[test] 129 | fn can_get_indentation_data_for_a_file() { 130 | let indentation = parse_file(Path::new("./tests/data/simple/parent.clj")) 131 | .unwrap() 132 | .unwrap(); 133 | assert_eq!(indentation.lines, 3); 134 | assert_eq!(indentation.p99, 2); 135 | assert_eq!(indentation.sum, 2); 136 | } 137 | 138 | #[test] 139 | fn unknown_files_are_treated_as_code() { 140 | let indentation = parse_file(Path::new("./tests/data/languages/foo.unknown")) 141 | .unwrap() 142 | .unwrap(); 143 | assert_eq!(indentation.lines, 2); 144 | assert_eq!(indentation.p99, 2); 145 | assert_eq!(indentation.sum, 2); 146 | } 147 | 148 | #[test] 149 | fn pf_files_are_fortran_unit_tests() { 150 | let indentation = parse_file(Path::new("./tests/data/languages/pfunit_test.pf")) 151 | .unwrap() 152 | .unwrap(); 153 | assert_eq!(indentation.lines, 13); 154 | assert_eq!(indentation.p99, 6); 155 | assert_eq!(indentation.sum, 39); 156 | } 157 | 158 | #[test] 159 | fn non_utf8_text_files_are_parsed() { 160 | let indentation = parse_file(Path::new("./tests/data/languages/non-utf8.properties")) 161 | .unwrap() 162 | .unwrap(); 163 | assert_eq!(indentation.lines, 2); 164 | assert_eq!(indentation.p99, 0); 165 | assert_eq!(indentation.sum, 0); 166 | } 167 | } 168 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](http://keepachangelog.com/) 6 | and this project adheres to [Semantic Versioning](http://semver.org/). 7 | 8 | 9 | ## [Unreleased] - ReleaseDate 10 | ## [0.4.4] - 2022-11-21 11 | 12 | * fixed error with non-UTF8 files in the recent unknown languages logic 13 | * added error context in a few places to make diagnosing errors easier 14 | 15 | ## [0.4.3] - 2022-11-16 16 | 17 | * tidying up 18 | 19 | ## [0.4.2] - 2022-11-16 20 | 21 | * Unknown languages now treated as pure code - previously they were 'Text' which was treated by tokei as all comments, so ignored in stats! 22 | * Added support for 'pf' fortran unit test files (via changes to my tokei fork) 23 | 24 | ## [0.4.1] - 2022-11-09 25 | 26 | * Fixed the [tokei fork](https://github.com/kornysietsma/tokei/tree/accumulate-lines) to properly get indentation for COBOL and other languages with simple per-line comment checks 27 | * Updated `Cargo.toml` to check for a particular tag in the Tokei fork, not just use the branch name 28 | 29 | ## [0.4.0] - 2022-10-13 30 | 31 | * New release - it feels that a lot has changed, really 0.3.2 should have been 0.4.0! Anyway better late than never. 
32 | * Major changes since 0.3.1: 33 | * Output file format reworked 34 | * Project files have names and IDs 35 | * Disabling git scanning 36 | * Feature flags 37 | * various bug fixes 38 | 39 | ## [0.3.15] - 2022-10-06 40 | 41 | * Bug fix for some co-authored-by lines 42 | * sometimes commit messages are terminated by `\r\n` characters, but Rust assumed they were `\n`-terminated - and my co-author regular expression didn't ignore the `\r` whitespace! 43 | * finally a fix for Linux binaries 44 | * deleted releases 0.3.5 to 0.3.14 - debugging GitHub Actions took a lot of work, and a lot of release-fix-release cycles! 45 | 46 | ## [0.3.4] - 2022-10-06 47 | 48 | * Point release mostly to test fixes to the release process! 49 | 50 | ## [0.3.3] - 2022-09-28 51 | 52 | * Somewhat breaking release - the output file schema doesn't change, but the logic does - so the data format is now 1.0.1, as this is only sort-of compatible: 53 | * Previously all changes for a day were combined into a single GitDetails entry; now a new GitDetails entry is generated for each unique set of users. 54 | * This is because previously, if Jane made 1 change and Joe made 100, all 101 would show up as changes by "Jane and Joe" 55 | * This will make output files a bit more verbose (hopefully not too much), but the new user and team info was being distorted by the old behaviour - for the UI to show Jane and Joe separately, we need to track them separately, unless they are co-authors on a commit. 56 | * Added a DesignDecisons.md document to discuss the next change: 57 | * Removed the way the code used to use generic `Value` types for indicator data - everything is concrete types now. See `DesignDecisions.md` for discussion 58 | * Added feature flags, including a new 'disable git' option, and flags in JSON output (data format v1.0.2) 59 | * Added file creation and modification times, so the explorer can use them when git is disabled 60 | 61 | ## [0.3.2] - 2022-09-20 62 | 63 | * Backward-incompatible release - changing the output file format for a few reasons: 64 | * I want a unique ID that the front end can use by default for BrowserStorage - this can be specified or random 65 | * actually now split into 'name', which is descriptive, and 'id' for storage etc. 66 | * I want a semantic version number in the data file, so the front-end can tell if it knows the data format 67 | * I'm moving the front-end to TypeScript, which means I'd like to keep types a bit cleaner, rather than just dumping data in the 'root' directory node 68 | * Really the old 'flare' file format hasn't been meaningful for a while, so I might as well dump irrelevant bits (like the 'value' on each node - redundant and confusing) 69 | * username / emails are now de-duplicated by case - so if you have "Jane smith" and "Jane Smith" as git user names, they will get the same user id (and the case of whichever example was seen first) - this was needed as, especially with `co-authored-by` tags, the same user could show up several times with only case differences. 70 | 71 | ## [0.3.1] - 2022-07-13 72 | 73 | ### Changed 74 | 75 | * Added an option to follow symlinks to fix issue #1 76 | 77 | ## [0.3.0] - 2021-04-05 78 | 79 | ### Changed 80 | 81 | * Major change - new coupling logic, fine-grained based on timestamps instead of aggregating into daily buckets. This will need a lot of documenting, which will probably be on the main website not here. 82 | * updating tokei to latest code - this was ugly as tokei is now multithreaded and more complex. 
Modified tokei fork at to accumulate lines of code - but note they aren't ordered so this works for stats but not much else 83 | * Updated all other dependencies to latest stable bits 84 | 85 | ## [0.2.1] - 2020-10-16 86 | 87 | ### Changed 88 | 89 | * fixing build on Windows 90 | 91 | ## [0.2.0] - 2020-09-16 92 | 93 | ### Added 94 | 95 | * git log logic follows renames - a fair bit of work, as it requires splitting the git log processing into two passes, one to aggregate all rename/deletes and parent/child relationships, and one that uses that data to find what files end up being named in the final revision. 96 | 97 | ### Changed 98 | 99 | * Git logging may be slower and use more memory, as interim git log data is stored in memory. 100 | 101 | ## [0.1.2] - 2020-08-25 102 | 103 | ## [0.1.1] - 2020-08-24 104 | 105 | ### Changed 106 | 107 | * Trying to get Travis to publish binaries 108 | 109 | ## [0.1.0] - 2020-08-24 110 | 111 | ### Added 112 | 113 | * Tagged with version 0.1.0 114 | * Added this changelog, following [cargo-release suggestions](https://github.com/sunng87/cargo-release/blob/master/docs/faq.md#maintaining-changelog) and 115 | * First release with binary files 116 | -------------------------------------------------------------------------------- /src/git_file_future.rs: -------------------------------------------------------------------------------- 1 | #![warn(clippy::all)] 2 | use git2::Oid; 3 | use std::collections::HashMap; 4 | use std::path::{Path, PathBuf}; 5 | 6 | /// Track file changes for a file - renames and deletes 7 | #[derive(Debug, Clone)] 8 | pub struct GitFileFutureRegistry { 9 | rev_changes: HashMap, 10 | } 11 | 12 | #[derive(Debug, Clone)] 13 | struct RevChange { 14 | files: HashMap, 15 | /// first child is generally used only, it is the main branch - don't divert into other branches! 16 | children: Vec, 17 | } 18 | 19 | #[derive(Debug, Clone)] 20 | pub enum FileNameChange { 21 | Renamed(PathBuf), 22 | Deleted(), 23 | } 24 | 25 | impl RevChange { 26 | pub fn new() -> Self { 27 | RevChange { 28 | files: HashMap::new(), 29 | children: Vec::new(), 30 | } 31 | } 32 | } 33 | 34 | impl GitFileFutureRegistry { 35 | pub fn new() -> Self { 36 | GitFileFutureRegistry { 37 | rev_changes: HashMap::new(), 38 | } 39 | } 40 | 41 | pub fn register( 42 | &mut self, 43 | id: &Oid, 44 | parent_ids: &[Oid], 45 | file_changes: &[(PathBuf, FileNameChange)], 46 | ) { 47 | let entry = self.rev_changes.entry(*id).or_insert_with(RevChange::new); 48 | entry.files.extend(file_changes.iter().cloned()); 49 | for parent_id in parent_ids { 50 | let pentry = self 51 | .rev_changes 52 | .entry(*parent_id) 53 | .or_insert_with(RevChange::new); 54 | pentry.children.push(*id); 55 | } 56 | } 57 | 58 | /// what is this called in the final revision? 
59 | /// returns None if it is deleted, or Some(final name) 60 | pub fn final_name(&self, ref_id: &Oid, file: &Path) -> Option<PathBuf> { 61 | let mut current_name: &PathBuf = &file.to_path_buf(); 62 | let mut current_ref: Oid = *ref_id; 63 | loop { 64 | let current_change = self.rev_changes.get(&current_ref).unwrap(); 65 | match current_change.files.get(current_name) { 66 | Some(FileNameChange::Renamed(new_name)) => { 67 | current_name = new_name; 68 | } 69 | Some(FileNameChange::Deleted()) => return None, 70 | None => (), 71 | } 72 | if let Some(first_child) = current_change.children.get(0) { 73 | current_ref = *first_child; 74 | // and loop will continue 75 | } else { 76 | // no children, so finished looking into the future 77 | return Some(current_name.clone()); 78 | } 79 | } 80 | } 81 | } 82 | 83 | #[cfg(test)] 84 | mod test { 85 | use super::*; 86 | use anyhow::Error; 87 | use pretty_assertions::assert_eq; 88 | 89 | fn pb(name: &str) -> PathBuf { 90 | PathBuf::from(name) 91 | } 92 | 93 | #[test] 94 | fn trivial_repo_returns_original_name() -> Result<(), Error> { 95 | let mut registry = GitFileFutureRegistry::new(); 96 | let my_id = Oid::from_str("01")?; 97 | registry.register(&my_id, &[], &[]); 98 | assert_eq!( 99 | registry.final_name(&my_id, &pb("foo.txt")), 100 | Some(pb("foo.txt")) 101 | ); 102 | Ok(()) 103 | } 104 | 105 | #[test] 106 | fn simple_rename_returns_old_name() -> Result<(), Error> { 107 | let mut registry = GitFileFutureRegistry::new(); 108 | let my_id = Oid::from_str("01")?; 109 | 110 | registry.register( 111 | &my_id, 112 | &[], 113 | &[(pb("foo.txt"), FileNameChange::Renamed(pb("bar.txt")))], 114 | ); 115 | assert_eq!( 116 | registry.final_name(&my_id, &pb("foo.txt")), 117 | Some(pb("bar.txt")) 118 | ); 119 | Ok(()) 120 | } 121 | 122 | #[test] 123 | fn renames_and_deletes_applied_across_history() -> Result<(), Error> { 124 | // my bad - this should be a few isolated tests not one big test-all test. 125 | // classic how my standards slip for side projects! 126 | let mut registry = GitFileFutureRegistry::new(); 127 | /* 128 | +-----+ 129 | |01 | 130 | |add a| 131 | |add z| 132 | +--+--+ 133 | | 134 | +------v------+ 135 | |02 | 136 | |rename a to b| 137 | |delete z | 138 | +-------------+ 139 | | | 140 | +------v------+ +----v--------+ 141 | |04 | |05 | 142 | |rename b to c| |rename b to d| 143 | +--------------+--------------+ 144 | | 145 | +--------v---------+ 146 | |06 merge | 147 | |rename c to afinal| 148 | |create new z | 149 | +------------------+ 150 | */ 151 | let id_1 = Oid::from_str("01")?; 152 | let id_2 = Oid::from_str("02")?; 153 | let id_4 = Oid::from_str("04")?; 154 | let id_5 = Oid::from_str("05")?; 155 | let id_6 = Oid::from_str("06")?; 156 | 157 | registry.register( 158 | &id_6, 159 | &[id_4, id_5], 160 | &[(pb("c"), FileNameChange::Renamed(pb("afinal")))], 161 | ); 162 | // NOTE: topological order should (I think?) 
register rev 4 before rev 5 as it's first 163 | registry.register( 164 | &id_4, 165 | &[id_2], 166 | &[(pb("b"), FileNameChange::Renamed(pb("c")))], 167 | ); 168 | registry.register( 169 | &id_5, 170 | &[id_2], 171 | &[(pb("b"), FileNameChange::Renamed(pb("d")))], 172 | ); 173 | registry.register( 174 | &id_2, 175 | &[id_1], 176 | &[ 177 | (pb("a"), FileNameChange::Renamed(pb("b"))), 178 | (pb("z"), FileNameChange::Deleted()), 179 | ], 180 | ); 181 | registry.register(&id_1, &[], &[]); 182 | 183 | // original a is afinal 184 | // original z is gone 185 | assert_eq!(registry.final_name(&id_1, &pb("a")), Some(pb("afinal"))); 186 | assert_eq!(registry.final_name(&id_1, &pb("z")), None); 187 | // from the perspective of the filesystem after node 2, we know nothing of a any more, only b 188 | assert_eq!(registry.final_name(&id_2, &pb("b")), Some(pb("afinal"))); 189 | 190 | Ok(()) 191 | } 192 | } 193 | -------------------------------------------------------------------------------- /src/file_walker.rs: -------------------------------------------------------------------------------- 1 | #![warn(clippy::all)] 2 | 3 | use crate::{polyglot_data::PolyglotData, FeatureFlags}; 4 | 5 | use super::flare; 6 | use super::flare::FlareTreeNode; 7 | use super::toxicity_indicator_calculator::ToxicityIndicatorCalculator; 8 | use anyhow::{Context, Error}; 9 | use ignore::{Walk, WalkBuilder}; 10 | #[allow(unused_imports)] 11 | use path_slash::PathExt; 12 | use std::{path::Path, time::Instant}; 13 | 14 | fn apply_calculators_to_node( 15 | node: &mut FlareTreeNode, 16 | path: &Path, 17 | toxicity_indicator_calculators: &mut [Box], 18 | ) -> Result<(), Error> { 19 | for tic in toxicity_indicator_calculators.iter_mut() { 20 | tic.visit_node(node, path) 21 | .with_context(|| format!("applying calcluator {} to {:?}", tic.name(), path))?; 22 | } 23 | Ok(()) 24 | } 25 | 26 | const LOG_INTERVAL_SECS: u64 = 60 * 5; 27 | 28 | fn walk_tree_walker( 29 | walker: Walk, 30 | prefix: &Path, 31 | name: &str, 32 | id: Option<&str>, 33 | toxicity_indicator_calculators: &mut [Box], 34 | features: &FeatureFlags, // features just for JSON output 35 | ) -> Result { 36 | let mut tree = FlareTreeNode::new(flare::ROOT_NAME, false); 37 | 38 | apply_calculators_to_node(&mut tree, prefix, toxicity_indicator_calculators)?; 39 | 40 | let mut last_log = Instant::now(); 41 | info!("Walking file tree"); 42 | 43 | for result in walker.map(|r| r.expect("File error!")).skip(1) { 44 | let p = result.path(); 45 | let relative = p.strip_prefix(prefix)?; 46 | let elapsed_since_log = last_log.elapsed(); 47 | if elapsed_since_log.as_secs() > LOG_INTERVAL_SECS { 48 | info!("Walking progress: {:?}", relative); 49 | last_log = Instant::now(); 50 | } 51 | 52 | let new_child = if p.is_dir() || p.is_file() { 53 | let mut f = FlareTreeNode::new(p.file_name().unwrap(), p.is_file()); 54 | apply_calculators_to_node(&mut f, p, toxicity_indicator_calculators)?; 55 | Some(f) 56 | } else { 57 | warn!("Not a file or dir: {:?} - skipping", p); 58 | None 59 | }; 60 | 61 | if let Some(new_child) = new_child { 62 | match relative.parent() { 63 | Some(new_parent) => { 64 | let parent = tree 65 | .get_in_mut(&mut new_parent.components()) 66 | .expect("no parent found!"); 67 | parent.append_child(new_child); 68 | } 69 | None => { 70 | tree.append_child(new_child); 71 | } 72 | } 73 | } 74 | } 75 | info!("finished walking file tree"); 76 | Ok(PolyglotData::new(name, id, tree, features.clone())) 77 | } 78 | 79 | pub fn walk_directory( 80 | root: &Path, 81 | name: &str, 82 | id: 
Option<&str>, 83 | follow_symlinks: bool, 84 | toxicity_indicator_calculators: &mut [Box], 85 | features: &FeatureFlags, // features just for JSON output 86 | ) -> Result { 87 | walk_tree_walker( 88 | WalkBuilder::new(root) 89 | .add_custom_ignore_filename(".polyglot_code_scanner_ignore") 90 | .follow_links(follow_symlinks) 91 | .sort_by_file_name(std::cmp::Ord::cmp) 92 | .build(), 93 | root, 94 | name, 95 | id, 96 | toxicity_indicator_calculators, 97 | features, 98 | ) 99 | } 100 | 101 | #[cfg(test)] 102 | mod test { 103 | use crate::polyglot_data::IndicatorMetadata; 104 | 105 | use super::*; 106 | use test_shared::assert_eq_json_file; 107 | 108 | #[test] 109 | fn scanning_a_filesystem_builds_a_tree() { 110 | let root = Path::new("./tests/data/simple/"); 111 | let tree = walk_directory( 112 | root, 113 | "test", 114 | Some("test-id"), 115 | false, 116 | &mut Vec::new(), 117 | &FeatureFlags::default(), 118 | ) 119 | .unwrap(); 120 | 121 | assert_eq_json_file(&tree, "./tests/expected/simple_files.json"); 122 | } 123 | 124 | #[test] 125 | fn scanning_a_filesystem_can_follow_symlinks() { 126 | let root = Path::new("./tests/data/simple_linked/"); 127 | let tree = walk_directory( 128 | root, 129 | "test", 130 | Some("test-id"), 131 | true, 132 | &mut Vec::new(), 133 | &FeatureFlags::default(), 134 | ) 135 | .unwrap(); 136 | 137 | assert_eq_json_file(&tree, "./tests/expected/simple_files.json"); 138 | } 139 | 140 | #[derive(Debug)] 141 | struct FirstTIC {} 142 | 143 | impl ToxicityIndicatorCalculator for FirstTIC { 144 | fn name(&self) -> String { 145 | "foo".to_string() 146 | } 147 | fn visit_node(&mut self, node: &mut FlareTreeNode, path: &Path) -> Result<(), Error> { 148 | if path.is_file() { 149 | // only mutate files! If we rename dirs, the parent relationship breaks 150 | let mut name = node.name().clone(); 151 | name.push("!"); 152 | node.set_name(&name); 153 | } 154 | Ok(()) 155 | } 156 | fn apply_metadata(&self, _metadata: &mut IndicatorMetadata) -> Result<(), Error> { 157 | unimplemented!() 158 | } 159 | } 160 | 161 | #[derive(Debug)] 162 | struct SecondTIC {} 163 | 164 | impl ToxicityIndicatorCalculator for SecondTIC { 165 | fn name(&self) -> String { 166 | "filename".to_string() 167 | } 168 | fn visit_node(&mut self, node: &mut FlareTreeNode, path: &Path) -> Result<(), Error> { 169 | if path.is_file() { 170 | // only mutate files! If we rename dirs, the parent relationship breaks 171 | let mut name = node.name().clone(); 172 | name.push("?"); 173 | node.set_name(&name); 174 | } 175 | Ok(()) 176 | } 177 | 178 | fn apply_metadata(&self, _metadata: &mut IndicatorMetadata) -> Result<(), Error> { 179 | unimplemented!() 180 | } 181 | } 182 | 183 | #[test] 184 | fn scanning_merges_data_from_mutators() { 185 | let root = Path::new("./tests/data/simple/"); 186 | let first = FirstTIC {}; 187 | let second = SecondTIC {}; 188 | let calculators: &mut Vec> = 189 | &mut vec![Box::new(first), Box::new(second)]; 190 | 191 | let tree = walk_directory( 192 | root, 193 | "test", 194 | Some("test-id"), 195 | false, 196 | calculators, 197 | &FeatureFlags::default(), 198 | ) 199 | .unwrap(); 200 | 201 | assert_eq_json_file(&tree, "./tests/expected/simple_files_with_indicators.json"); 202 | } 203 | 204 | // TODO: we have no unit test for new metadata - should we? 
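    // Illustrative sketch (not part of the original source): a third calculator
    // following the same pattern as FirstTIC and SecondTIC above - it implements
    // the trait but only counts files, writing nothing to the tree.
    #[derive(Debug, Default)]
    struct CountingTIC {
        files_seen: usize,
    }

    impl ToxicityIndicatorCalculator for CountingTIC {
        fn name(&self) -> String {
            "counting".to_string()
        }
        fn visit_node(&mut self, _node: &mut FlareTreeNode, path: &Path) -> Result<(), Error> {
            if path.is_file() {
                self.files_seen += 1;
            }
            Ok(())
        }
        fn apply_metadata(&self, _metadata: &mut IndicatorMetadata) -> Result<(), Error> {
            Ok(())
        }
    }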
205 | } 206 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | #![forbid(unsafe_code)] 2 | #![warn(clippy::all)] 3 | #![warn(clippy::pedantic)] 4 | #![warn(rust_2018_idioms)] 5 | 6 | use anyhow::Error; 7 | use clap::{CommandFactory, ErrorKind, Parser}; 8 | use polyglot_code_scanner::coupling::CouplingConfig; 9 | use polyglot_code_scanner::{FeatureFlags, ScannerConfig}; 10 | use std::fs::File; 11 | use std::io; 12 | use std::path::PathBuf; 13 | 14 | #[allow(clippy::struct_excessive_bools)] 15 | #[derive(Debug, Parser)] 16 | #[clap(author, version)] 17 | /// Polyglot Code Scanner 18 | /// 19 | /// Scans source code and generates indicators that may (or may not) show toxic code. 20 | /// Ignores files specified by `.gitignore` or `.polyglot_code_scanner_ignore` files 21 | /// See for details 22 | struct Cli { 23 | #[clap( 24 | short = 'v', 25 | long = "verbose", 26 | action = clap::ArgAction::Count 27 | )] 28 | /// Logging verbosity, v = error, vv = warn, vvv = info (default), vvvv = debug, vvvvv = trace 29 | verbose: u8, 30 | /// Output file, stdout if not present, or not used if sending to web server 31 | #[clap(short = 'o', long = "output", parse(from_os_str))] 32 | output: Option, 33 | /// project name - identifies the selected data for display and state storage 34 | #[clap(value_parser, short = 'n', long = "name")] 35 | name: String, 36 | 37 | /// data file ID - used to identify unique data files for browser storage, generates a UUID if not specified 38 | #[clap(value_parser, long = "id")] 39 | id: Option, 40 | /// Root directory, current dir if not present 41 | #[clap(parse(from_os_str))] 42 | root: Option, 43 | 44 | // global indicator flags 45 | #[clap(value_parser, long = "no-git")] 46 | /// Do not scan for git repositories 47 | no_git: bool, 48 | #[clap(value_parser, short = 'c', long = "coupling")] 49 | /// include temporal coupling data 50 | coupling: bool, 51 | #[clap(value_parser, long = "no-detailed-git")] 52 | /// Don't include detailed git information - output may be big! 53 | no_detailed_git: bool, 54 | #[clap(value_parser, long = "no-file-stats")] 55 | /// Do not scan for file stats - mainly an option as this is very hard to unit test 56 | no_file_stats: bool, 57 | 58 | #[clap(value_parser, long = "years", default_value = "3")] 59 | /// how many years of git history to parse - default only scan the last 3 years (from now, not git head) 60 | git_years: u64, 61 | #[clap(value_parser, long = "follow-symlinks")] 62 | /// Follow symbolic links when traversing directories 63 | follow_symlinks: bool, 64 | #[clap(value_parser, long = "coupling-bucket-days", default_value = "91")] 65 | /// Number of days in a single "bucket" of coupling activity 66 | bucket_days: u64, 67 | #[clap(value_parser, long = "coupling-min-bursts", default_value = "10")] 68 | /// If a file has fewer bursts of change than this in a bucket, don't measure coupling from it 69 | min_activity_bursts: u64, 70 | #[clap(value_parser, long = "coupling-min-ratio", default_value = "0.8")] 71 | /// The minimum ratio of (other file changes)/(this file changes) to include a file in coupling stats 72 | min_coupling_ratio: f64, 73 | #[clap( 74 | value_parser, 75 | long = "coupling-min-activity-gap-minutes", 76 | default_value = "60" 77 | )] 78 | /// what is the minimum gap between activities in a burst? 
a sequence of commits with no gaps this long is treated as one burst 79 | min_activity_gap_minutes: u64, 80 | #[clap( 81 | value_parser, 82 | long = "coupling-time-overlap-minutes", 83 | default_value = "60" 84 | )] 85 | /// how far before/after an activity burst is included for coupling? e.g. if I commit Foo.c at 1am, and Bar.c at 2am, they are coupled if an overlap of 60 minutes or longer is specified 86 | min_overlap_minutes: u64, 87 | #[clap(value_parser, long = "coupling-min-distance", default_value = "3")] 88 | /// The minimum distance between nodes to include in coupling 89 | /// 0 is all, 1 is siblings, 2 is cousins and so on. 90 | /// so if you set this to 3, cousins "foo/src/a.rs" and "foo/test/a_test.rs" won't be counted as their distance is 2 91 | coupling_min_distance: usize, 92 | #[clap(value_parser, long = "coupling-max-common-roots")] 93 | /// The maximum number of common ancestors to include in coupling 94 | /// e.g. "foo/src/controller/a.c" and "foo/src/service/b.c" have two common ancestors, if you 95 | /// set this value to 3 they won't show as coupled. 96 | coupling_max_common_roots: Option, 97 | } 98 | 99 | // very basic logging - just so I can have a nice default, and hide verbose tokei logs 100 | fn setup_logging(verbosity: u8) -> Result<(), fern::InitError> { 101 | let mut base_config = fern::Dispatch::new(); 102 | 103 | base_config = match verbosity { 104 | 0 | 3 => base_config.level(log::LevelFilter::Info), 105 | 1 => base_config.level(log::LevelFilter::Error), 106 | 2 => base_config.level(log::LevelFilter::Warn), 107 | 4 => base_config.level(log::LevelFilter::Debug), 108 | _5_or_more => base_config.level(log::LevelFilter::Trace), 109 | }; 110 | 111 | // Tokei warns whenever we scan a language type we don't know - but I catch that error! 112 | base_config = base_config.level_for("tokei::language::language_type", log::LevelFilter::Error); 113 | 114 | let stdout_config = fern::Dispatch::new() 115 | .format(|out, message, record| { 116 | out.finish(format_args!( 117 | "[{}][{}][{}] {}", 118 | chrono::Local::now().format("%H:%M"), 119 | record.target(), 120 | record.level(), 121 | message 122 | )); 123 | }) 124 | .chain(io::stderr()); 125 | 126 | base_config.chain(stdout_config).apply()?; 127 | 128 | Ok(()) 129 | } 130 | 131 | fn custom_validation_conflict(message: &str) { 132 | let mut cmd = Cli::command(); 133 | cmd.error(ErrorKind::ArgumentConflict, message).exit() 134 | } 135 | 136 | fn main() -> Result<(), Error> { 137 | let args = Cli::from_args(); 138 | 139 | // custom validation - easier than trying to wrangle clap to do this! 
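// For example (illustrative invocations, using the flags defined above):
//   `polyglot_code_scanner --name demo --no-git --coupling .` is rejected below,
//   while `polyglot_code_scanner --name demo --no-git .` is accepted.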
140 | if args.no_git { 141 | if args.coupling { 142 | custom_validation_conflict("Can't enable coupling when git is disabled!"); 143 | } 144 | if args.no_detailed_git { 145 | custom_validation_conflict("Can't specify no_detailed_git when git is disabled!"); 146 | } 147 | } 148 | 149 | setup_logging(args.verbose)?; 150 | 151 | let root = args.root.unwrap_or_else(|| PathBuf::from(".")); 152 | 153 | let features = FeatureFlags { 154 | git: !args.no_git, 155 | coupling: args.coupling, 156 | git_details: !(args.no_detailed_git || args.no_git), 157 | file_stats: !args.no_file_stats, 158 | }; 159 | 160 | let scanner_config = ScannerConfig { 161 | git_years: Some(args.git_years), 162 | data_id: args.id, 163 | name: args.name, 164 | follow_symlinks: args.follow_symlinks, 165 | features, 166 | }; 167 | 168 | let coupling_config = if args.coupling { 169 | Some(CouplingConfig::new( 170 | args.bucket_days, 171 | args.min_activity_bursts, 172 | args.min_coupling_ratio, 173 | args.min_activity_gap_minutes * 60, 174 | args.min_overlap_minutes * 60, 175 | args.coupling_min_distance, 176 | args.coupling_max_common_roots, 177 | )) 178 | } else { 179 | None 180 | }; 181 | 182 | let mut out: Box = if let Some(output) = args.output { 183 | Box::new(File::create(output)?) 184 | } else { 185 | Box::new(io::stdout()) 186 | }; 187 | 188 | let mut calculator_names: Vec<&str> = vec!["loc", "indentation"]; 189 | if !args.no_git { 190 | calculator_names.push("git"); 191 | } 192 | if !args.no_file_stats { 193 | calculator_names.push("file_stats"); 194 | } 195 | 196 | polyglot_code_scanner::run( 197 | &root, 198 | &scanner_config, 199 | coupling_config, 200 | &calculator_names, 201 | &mut out, 202 | )?; 203 | 204 | Ok(()) 205 | } 206 | -------------------------------------------------------------------------------- /src/flare.rs: -------------------------------------------------------------------------------- 1 | #![warn(clippy::all)] 2 | //! This is named 'Flare' as historically, the D3 hierarchical data files 3 | //! were called 'flare.json' and there was an implied data format. 4 | //! 5 | //! As of version 1.0.0 (when I started versioning!) of the data format, 6 | //! 
the syntax differs from D3 flare files, but I haven't renamed the module (yet) 7 | 8 | use serde::ser::SerializeStruct; 9 | use serde::{Serialize, Serializer}; 10 | use std::ffi::{OsStr, OsString}; 11 | 12 | use crate::coupling::SerializableCouplingData; 13 | use crate::file_stats::FileStats; 14 | use crate::git::GitNodeData; 15 | use crate::indentation::IndentationData; 16 | use crate::loc::LanguageLocData; 17 | 18 | pub static ROOT_NAME: &str = ""; 19 | 20 | #[derive(Debug, PartialEq, Clone, Default, Serialize)] 21 | pub struct IndicatorData { 22 | #[serde(skip_serializing_if = "Option::is_none")] 23 | pub git: Option<GitNodeData>, 24 | #[serde(skip_serializing_if = "Option::is_none")] 25 | pub indentation: Option<IndentationData>, 26 | #[serde(skip_serializing_if = "Option::is_none")] 27 | pub loc: Option<LanguageLocData>, 28 | #[serde(skip_serializing_if = "Option::is_none")] 29 | pub coupling: Option<SerializableCouplingData>, 30 | #[serde(skip_serializing_if = "Option::is_none")] 31 | pub file_stats: Option<FileStats>, 32 | } 33 | 34 | impl IndicatorData { 35 | fn is_empty(&self) -> bool { 36 | self.git.is_none() 37 | && self.indentation.is_none() 38 | && self.loc.is_none() 39 | && self.coupling.is_none() 40 | && self.file_stats.is_none() 41 | } 42 | } 43 | 44 | #[derive(Debug, PartialEq, Clone)] 45 | pub struct FlareTreeNode { 46 | name: OsString, 47 | is_file: bool, 48 | children: Vec<FlareTreeNode>, 49 | data: IndicatorData, 50 | } 51 | 52 | impl FlareTreeNode { 53 | pub fn name(&self) -> &OsString { 54 | &self.name 55 | } 56 | 57 | #[cfg(test)] 58 | pub fn set_name(&mut self, name: &OsStr) { 59 | self.name = name.to_owned(); 60 | } 61 | 62 | pub fn new(name: impl Into<OsString>, is_file: bool) -> Self { 63 | FlareTreeNode { 64 | name: name.into(), 65 | is_file, 66 | children: Vec::new(), 67 | 68 | data: IndicatorData::default(), 69 | } 70 | } 71 | 72 | #[cfg(test)] 73 | pub fn file(name: impl Into<OsString>) -> Self { 74 | Self::new(name, true) 75 | } 76 | 77 | #[cfg(test)] 78 | pub fn dir<S: Into<OsString>>(name: S) -> Self { 79 | Self::new(name, false) 80 | } 81 | 82 | pub fn indicators_mut(&mut self) -> &mut IndicatorData { 83 | &mut self.data 84 | } 85 | pub fn indicators(&self) -> &IndicatorData { 86 | &self.data 87 | } 88 | 89 | pub fn append_child(&mut self, child: FlareTreeNode) { 90 | assert!(!self.is_file, "appending child to a file: {:?}", self); 91 | self.children.push(child); // TODO - return self? 
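// (illustrative note, not in the original source: only directory nodes may hold
// children - the assert above fires if a child is ever appended to a file node)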
92 | } 93 | 94 | /// gets a tree entry by path, or None if something along the path doesn't exist 95 | #[allow(dead_code)] // used in tests 96 | pub fn get_in(&self, path: &mut std::path::Components<'_>) -> Option<&FlareTreeNode> { 97 | match path.next() { 98 | Some(first_name) => { 99 | let dir_name = first_name.as_os_str(); 100 | if !self.is_file { 101 | let first_match = self.children.iter().find(|c| dir_name == c.name)?; 102 | return first_match.get_in(path); 103 | } 104 | None 105 | } 106 | None => Some(self), 107 | } 108 | } 109 | 110 | /// gets a mutable tree entry by path, or None if something along the path doesn't exist 111 | pub fn get_in_mut( 112 | &mut self, 113 | path: &mut std::path::Components<'_>, 114 | ) -> Option<&mut FlareTreeNode> { 115 | match path.next() { 116 | Some(first_name) => { 117 | let dir_name = first_name.as_os_str(); 118 | if !self.is_file { 119 | let first_match = self.children.iter_mut().find(|c| dir_name == c.name)?; 120 | return first_match.get_in_mut(path); 121 | } 122 | None 123 | } 124 | None => Some(self), 125 | } 126 | } 127 | 128 | pub fn get_children(&self) -> &Vec<FlareTreeNode> { 129 | &self.children 130 | } 131 | 132 | // used only for postprocessing - could refactor - move functionality here 133 | pub fn get_children_mut(&mut self) -> &mut Vec<FlareTreeNode> { 134 | &mut self.children 135 | } 136 | } 137 | 138 | fn name_as_str<S: Serializer>(name: &OsStr) -> Result<&str, S::Error> { 139 | name.to_str().ok_or_else(|| { 140 | serde::ser::Error::custom(format!("name {:?} contains invalid UTF-8 characters", name)) 141 | }) 142 | } 143 | 144 | impl Serialize for FlareTreeNode { 145 | fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> 146 | where 147 | S: Serializer, 148 | { 149 | let mut state = serializer.serialize_struct("FlareTreeNode", 3)?; 150 | let name = name_as_str::<S>(&self.name)?; 151 | state.serialize_field("name", &name)?; 152 | if !self.data.is_empty() { 153 | state.serialize_field("data", &self.data)?; 154 | } 155 | if !self.is_file { 156 | state.serialize_field("children", &self.children)?; 157 | } 158 | 159 | state.end() 160 | } 161 | } 162 | 163 | #[cfg(test)] 164 | mod test { 165 | use super::*; 166 | use pretty_assertions::assert_eq; 167 | use serde_json::json; 168 | use std::path::Path; 169 | use test_shared::{assert_eq_json_str, assert_eq_json_value}; 170 | 171 | #[test] 172 | fn can_build_tree() { 173 | let mut root = FlareTreeNode::dir("root"); 174 | root.append_child(FlareTreeNode::file("child")); 175 | 176 | assert_eq!( 177 | root, 178 | FlareTreeNode { 179 | name: OsString::from("root"), 180 | is_file: false, 181 | children: vec![FlareTreeNode { 182 | name: OsString::from("child"), 183 | is_file: true, 184 | data: IndicatorData::default(), 185 | children: Vec::new(), 186 | }], 187 | 188 | data: IndicatorData::default(), 189 | } 190 | ); 191 | } 192 | 193 | fn build_test_tree() -> FlareTreeNode { 194 | let mut root = FlareTreeNode::dir("root"); 195 | root.append_child(FlareTreeNode::file("root_file_1.txt")); 196 | root.append_child(FlareTreeNode::file("root_file_2.txt")); 197 | let mut child1 = FlareTreeNode::dir("child1"); 198 | child1.append_child(FlareTreeNode::file("child1_file_1.txt")); 199 | let mut grand_child = FlareTreeNode::dir("grandchild"); 200 | grand_child.append_child(FlareTreeNode::file("grandchild_file.txt")); 201 | child1.append_child(grand_child); 202 | child1.append_child(FlareTreeNode::file("child1_file_2.txt")); 203 | let mut child2 = FlareTreeNode::dir("child2"); 204 | let child2_file = FlareTreeNode::file("child2_file.txt"); 205 | 
child2.append_child(child2_file); 206 | root.append_child(child1); 207 | root.append_child(child2); 208 | root 209 | } 210 | 211 | #[test] 212 | fn can_get_elements_from_tree() { 213 | let tree = build_test_tree(); 214 | 215 | let mut path = std::path::Path::new("child1/grandchild/grandchild_file.txt").components(); 216 | let grandchild = tree.get_in(&mut path); 217 | assert_eq!( 218 | grandchild.expect("Grandchild not found!").name(), 219 | "grandchild_file.txt" 220 | ); 221 | } 222 | 223 | #[test] 224 | fn can_get_top_level_element_from_tree() { 225 | let tree = build_test_tree(); 226 | 227 | let mut path = std::path::Path::new("child1").components(); 228 | let child1 = tree.get_in(&mut path); 229 | assert_eq!(child1.expect("child1 not found!").name(), "child1"); 230 | 231 | let mut path2 = std::path::Path::new("root_file_1.txt").components(); 232 | let child2 = tree.get_in(&mut path2); 233 | assert_eq!( 234 | child2.expect("root_file_1 not found!").name(), 235 | "root_file_1.txt" 236 | ); 237 | } 238 | 239 | #[test] 240 | fn getting_missing_elements_returns_none() { 241 | let tree = build_test_tree(); 242 | let mut path = std::path::Path::new("child1/grandchild/nonesuch").components(); 243 | let missing = tree.get_in(&mut path); 244 | assert!(missing.is_none()); 245 | 246 | let mut path2 = 247 | Path::new("child1/grandchild/grandchild_file.txt/files_have_no_kids").components(); 248 | let missing2 = tree.get_in(&mut path2); 249 | assert!(missing2.is_none()); 250 | 251 | let mut path3 = Path::new("no_file_at_root").components(); 252 | let missing3 = tree.get_in(&mut path3); 253 | assert!(missing3.is_none()); 254 | } 255 | 256 | #[test] 257 | fn can_get_mut_elements_from_tree() { 258 | let mut tree = build_test_tree(); 259 | let grandchild = tree 260 | .get_in_mut(&mut Path::new("child1/grandchild/grandchild_file.txt").components()) 261 | .expect("Grandchild not found!"); 262 | assert_eq!(grandchild.name(), "grandchild_file.txt"); 263 | grandchild.name = OsString::from("fish"); 264 | let grandchild2 = tree.get_in_mut(&mut Path::new("child1/grandchild/fish").components()); 265 | assert_eq!(grandchild2.expect("fish not found!").name(), "fish"); 266 | 267 | let grandchild_dir = tree 268 | .get_in_mut(&mut Path::new("child1/grandchild").components()) 269 | .expect("Grandchild dir not found!"); 270 | assert_eq!(grandchild_dir.name(), "grandchild"); 271 | grandchild_dir.append_child(FlareTreeNode::file("new_kid_on_the_block.txt")); 272 | let new_kid = tree 273 | .get_in_mut(&mut Path::new("child1/grandchild/new_kid_on_the_block.txt").components()) 274 | .expect("New kid not found!"); 275 | assert_eq!(new_kid.name(), "new_kid_on_the_block.txt"); 276 | } 277 | 278 | #[test] 279 | fn can_serialize_directory_to_json() { 280 | let root = FlareTreeNode::dir("root"); 281 | 282 | assert_eq_json_str( 283 | &root, 284 | r#"{ 285 | "name":"root", 286 | "children": [] 287 | }"#, 288 | ); 289 | } 290 | 291 | #[test] 292 | fn can_serialize_file_to_json() { 293 | let file = FlareTreeNode::file("foo.txt"); 294 | 295 | assert_eq_json_str( 296 | &file, 297 | r#"{ 298 | "name":"foo.txt" 299 | }"#, 300 | ); 301 | } 302 | 303 | #[test] 304 | fn can_serialize_simple_tree_to_json() { 305 | let mut root = FlareTreeNode::dir("root"); 306 | root.append_child(FlareTreeNode::file("child.txt")); 307 | root.append_child(FlareTreeNode::dir("child2")); 308 | 309 | assert_eq_json_value( 310 | &root, 311 | &json!({ 312 | "name":"root", 313 | "children":[ 314 | { 315 | "name": "child.txt" 316 | }, 317 | { 318 | "name":"child2", 
319 | "children":[] 320 | } 321 | ] 322 | }), 323 | ); 324 | } 325 | 326 | #[test] 327 | fn can_serialize_simple_polyglot_data_to_json() { 328 | let mut root = FlareTreeNode::dir("root"); 329 | root.append_child(FlareTreeNode::file("child.txt")); 330 | root.append_child(FlareTreeNode::dir("child2")); 331 | 332 | assert_eq_json_value( 333 | &root, 334 | &json!({ 335 | "name":"root", 336 | "children":[ 337 | { 338 | "name": "child.txt" 339 | }, 340 | { 341 | "name":"child2", 342 | "children":[] 343 | } 344 | ] 345 | }), 346 | ); 347 | } 348 | } 349 | -------------------------------------------------------------------------------- /src/git_file_history.rs: -------------------------------------------------------------------------------- 1 | #![warn(clippy::all)] 2 | use crate::git_logger::{CommitChange, FileChange, GitLog, GitLogEntry, User}; 3 | use anyhow::Error; 4 | use chrono::offset::TimeZone; 5 | use chrono::Utc; 6 | use git2::Oid; 7 | use indicatif::{ProgressBar, ProgressStyle}; 8 | use serde::Serialize; 9 | use std::collections::HashMap; 10 | use std::convert::TryInto; 11 | use std::path::Path; 12 | use std::path::PathBuf; 13 | 14 | /// For each file we just keep a simplified history - what the changes were, by whom, and when. 15 | #[derive(Debug, Serialize, Builder)] 16 | #[builder(setter(into), pattern = "owned")] 17 | pub struct FileHistoryEntry { 18 | pub id: String, 19 | pub committer: User, 20 | pub commit_time: u64, 21 | pub author: User, 22 | pub author_time: u64, 23 | pub co_authors: Vec<User>, 24 | pub change: CommitChange, 25 | pub lines_added: u64, 26 | pub lines_deleted: u64, 27 | } 28 | 29 | impl FileHistoryEntry { 30 | fn from(entry: &GitLogEntry, file_change: &FileChange) -> FileHistoryEntry { 31 | let entry = entry.clone(); 32 | let file_change = file_change.clone(); 33 | FileHistoryEntry { 34 | id: entry.id().clone(), 35 | committer: entry.committer().clone(), 36 | commit_time: *entry.commit_time(), 37 | author: entry.author().clone(), 38 | author_time: *entry.author_time(), 39 | co_authors: entry.co_authors().clone(), 40 | change: *file_change.change(), 41 | lines_added: *file_change.lines_added(), 42 | lines_deleted: *file_change.lines_deleted(), 43 | } 44 | } 45 | } 46 | 47 | #[cfg(test)] 48 | impl FileHistoryEntryBuilder { 49 | pub fn test_default() -> Self { 50 | FileHistoryEntryBuilder::default() 51 | .co_authors(Vec::new()) 52 | .change(CommitChange::Add) 53 | .lines_added(0u64) 54 | .lines_deleted(0u64) 55 | } 56 | pub fn emails(self, email: &str) -> Self { 57 | self.committer(User::new(None, Some(email))) 58 | .author(User::new(None, Some(email))) 59 | } 60 | 61 | pub fn times(self, time: u64) -> Self { 62 | self.commit_time(time).author_time(time) 63 | } 64 | } 65 | 66 | #[derive(Debug, Serialize)] 67 | pub struct GitFileHistory { 68 | /// repo work dir - always canonical 69 | workdir: PathBuf, 70 | history_by_file: HashMap<PathBuf, Vec<FileHistoryEntry>>, 71 | last_commit: u64, 72 | } 73 | 74 | impl GitFileHistory { 75 | pub fn new(log: &mut GitLog) -> Result<GitFileHistory, Error> { 76 | let mut last_commit: u64 = 0; 77 | let mut history_by_file = HashMap::<PathBuf, Vec<FileHistoryEntry>>::new(); 78 | info!("Loading git log"); 79 | let progress_bar = ProgressBar::new_spinner() 80 | .with_style(ProgressStyle::default_spinner().template("[{elapsed}] {msg}")?); 81 | progress_bar.tick(); 82 | // TODO: this was removed in indicatif 0.17 - do we need it? 83 | // see https://github.com/console-rs/indicatif/issues/393 84 | // progress_bar.set_draw_delta(100); 85 | 86 | // for handling renames, this needs to be a 2-pass process 87 | 88 | // This is ugly! 
I need to think about cleaning it up, probably in one of two ways: 89 | // 1. ditch the whole "expose an iterator" interface - if we're loading it all into memory anyway, there's no point, could make the code cleaner and maybe get rid of the ugly use of Rc<RefCell<...>> 90 | // 2. fully split the parsing into two passes, one to get parent/child info and one to get file summary. This would use less memory - but might be slower? YAGNI I think. 91 | 92 | let log_iterator = log.iterator()?; 93 | // I can't find a cleaner way for an iterator to have side effects 94 | let git_file_future_registry = log_iterator.git_file_future_registry(); 95 | let mut progress_last_updated: u64 = 0; 96 | let log_entries: Vec<Result<GitLogEntry, Error>> = log_iterator 97 | // .progress_with(progress_bar) 98 | .inspect(|entry| { 99 | if let Ok(entry) = entry { 100 | let commit_time = *entry.commit_time(); 101 | // eprintln!("plu {} ct {}", progress_last_updated, commit_time); 102 | if progress_last_updated == 0 // never shown 103 | || (commit_time > progress_last_updated) // time gone backwards 104 | || (progress_last_updated - commit_time) > 60 * 60 105 | // more than an hour change 106 | { 107 | let fmt_time = Utc.timestamp(commit_time as i64, 0).to_string(); 108 | progress_bar.set_message(fmt_time); 109 | progress_last_updated = commit_time; 110 | progress_bar.inc(1); 111 | } 112 | } 113 | }) 114 | .collect(); 115 | progress_bar.finish(); 116 | 117 | // safe to borrow this now as the iterator has gone and can't mutate any more 118 | let git_file_future_registry = git_file_future_registry.borrow(); 119 | 120 | info!("Processing git log with {} entries", log_entries.len()); 121 | let entrybar = ProgressBar::new(log_entries.len().try_into()?); 122 | for entry in log_entries { 123 | entrybar.tick(); 124 | match entry { 125 | Ok(entry) => { 126 | let commit_time = *entry.commit_time(); 127 | // let fmt_time = Utc.timestamp(commit_time as i64, 0).to_string(); 128 | // progress_bar.set_message(&fmt_time); 129 | if commit_time > last_commit { 130 | last_commit = commit_time; 131 | } 132 | for file_change in entry.clone().file_changes() { 133 | // TODO: use Oids so we don't need ugly conversion. 
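// `final_name` (below) resolves the path a file had at this commit to the name it
// ends up with at HEAD, following any later renames; it returns None if the file
// is eventually deleted, so history is only stored under each file's final name.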
134 | let final_filename = git_file_future_registry 135 | .final_name(&Oid::from_str(entry.id()).unwrap(), file_change.file()); 136 | if let Some(filename) = final_filename { 137 | let hash_entry = 138 | history_by_file.entry(filename).or_insert_with(Vec::new); 139 | let new_entry = FileHistoryEntry::from(&entry, file_change); 140 | hash_entry.push(new_entry); 141 | } else { 142 | trace!( 143 | "Not storing history for deleted file {:?}", 144 | file_change.file() 145 | ); 146 | } 147 | } 148 | } 149 | Err(e) => { 150 | warn!("Ignoring invalid git log entry: {:?}", e); 151 | } 152 | } 153 | } 154 | entrybar.finish(); 155 | 156 | Ok(GitFileHistory { 157 | workdir: log.workdir().to_owned(), 158 | history_by_file, 159 | last_commit, 160 | }) 161 | } 162 | 163 | /// true if this repo is valid for this file - file must exist (as we canonicalize it) 164 | pub fn is_repo_for(&self, file: &Path) -> Result<bool, Error> { 165 | let canonical_file = file.canonicalize()?; 166 | Ok(canonical_file.starts_with(&self.workdir)) 167 | } 168 | 169 | /// get git history for this file - file must exist (as we canonicalize it) 170 | pub fn history_for(&self, file: &Path) -> Result<Option<&Vec<FileHistoryEntry>>, Error> { 171 | let canonical_file = file.canonicalize()?; 172 | let relative_file = canonical_file.strip_prefix(&self.workdir)?; 173 | Ok(self.history_by_file.get(relative_file)) 174 | } 175 | 176 | pub fn last_commit(&self) -> u64 { 177 | self.last_commit 178 | } 179 | } 180 | 181 | #[cfg(test)] 182 | mod test { 183 | use super::*; 184 | use crate::git_logger::GitLogConfig; 185 | use pretty_assertions::assert_eq; 186 | use tempfile::tempdir; 187 | use test_shared::{assert_eq_json_file, unzip_test_sample}; 188 | 189 | #[test] 190 | fn can_get_log_by_filename() -> Result<(), Error> { 191 | let gitdir = tempdir()?; 192 | let git_root = unzip_test_sample("git_sample", gitdir.path())?; 193 | 194 | let mut git_log = GitLog::new(&git_root, GitLogConfig::default())?; 195 | 196 | let history = GitFileHistory::new(&mut git_log)?; 197 | 198 | assert_eq!(history.workdir.canonicalize()?, git_root.canonicalize()?); 199 | 200 | // assert_eq_json_str(&history.history_by_file, "{}"); 201 | assert_eq_json_file( 202 | &history.history_by_file, 203 | "./tests/expected/git/git_sample_by_filename.json", 204 | ); 205 | 206 | Ok(()) 207 | } 208 | 209 | #[test] 210 | fn can_tell_if_file_is_in_git_repo() -> Result<(), Error> { 211 | let gitdir = tempdir()?; 212 | let git_root = unzip_test_sample("git_sample", gitdir.path())?; 213 | 214 | let mut git_log = GitLog::new(&git_root, GitLogConfig::default())?; 215 | 216 | let history = GitFileHistory::new(&mut git_log)?; 217 | 218 | assert!(history.is_repo_for(&git_root.join("simple/parent.clj"))?); 219 | 220 | Ok(()) 221 | } 222 | 223 | #[test] 224 | fn can_get_history_for_file() -> Result<(), Error> { 225 | let gitdir = tempdir()?; 226 | let git_root = unzip_test_sample("git_sample", gitdir.path())?; 227 | 228 | let mut git_log = GitLog::new(&git_root, GitLogConfig::default())?; 229 | 230 | let history = GitFileHistory::new(&mut git_log)?; 231 | 232 | let file_history = history.history_for(&git_root.join("simple/parent.clj"))?; 233 | 234 | assert!(file_history.is_some()); 235 | 236 | let ids: Vec<_> = file_history.unwrap().iter().map(|h| &h.id).collect(); 237 | assert_eq!( 238 | ids, 239 | vec![ 240 | "0dbd54d4c524ecc776f381e660cce9b2dd92162c", 241 | "a0ae9997cfdf49fd0cbf54dacc72c778af337519", 242 | "ca239efb9b26db57ac9e2ec3e2df1c42578a46f8" 243 | ] 244 | ); 245 | 246 | assert_eq!(history.last_commit(), 1_558_533_240); 247 | 
248 | Ok(()) 249 | } 250 | 251 | #[test] 252 | fn no_history_for_files_not_known() -> Result<(), Error> { 253 | let gitdir = tempdir()?; 254 | let git_root = unzip_test_sample("git_sample", gitdir.path())?; 255 | 256 | let mut git_log = GitLog::new(&git_root, GitLogConfig::default())?; 257 | 258 | let history = GitFileHistory::new(&mut git_log)?; 259 | 260 | let new_file = git_root.join("simple/nonesuch.clj"); 261 | std::fs::File::create(&new_file)?; 262 | 263 | let file_history = history.history_for(&new_file)?; 264 | 265 | assert!(file_history.is_none()); 266 | 267 | Ok(()) 268 | } 269 | 270 | #[test] 271 | fn can_get_history_for_complex_renamed_files() -> Result<(), Error> { 272 | let gitdir = tempdir()?; 273 | let git_root = unzip_test_sample("rename_complex", gitdir.path())?; 274 | /* 275 | This is generated by the script in tests/data/builders/renaming/build_rename_complex.sh 276 | 277 | log is: 278 | 279 | * 3629e5a (HEAD -> master) restoring deleted z 280 | * 261e027 merging dave work with fixes 281 | |\ 282 | | * c3b47c3 (dave_work) rename bb to b, a2 back to a 283 | | * 500a621 rename a1 to a2, add bb, kill z 284 | * | fac9419 merging jay work 285 | |\ \ 286 | | * | 34b904b (jay_work) rename bee to b, aa back to a 287 | | * | 3bd2d90 rename a1 to aa, add bee 288 | | |/ 289 | * | 8be47df rename a1 back to a prep merging 290 | |/ 291 | * 388e644 rename a to a1 292 | * bd6d7df initial commit 293 | */ 294 | 295 | let mut git_log = GitLog::new(&git_root, GitLogConfig::default())?; 296 | 297 | let history = GitFileHistory::new(&mut git_log)?; 298 | 299 | let file_history = history.history_for(&git_root.join("a.txt"))?; 300 | 301 | let ids: Vec<_> = file_history.unwrap().iter().map(|h| &h.id).collect(); 302 | assert_eq!( 303 | ids, 304 | // all of these refs have a file that ends up being "a.txt" via renames and merges: 305 | vec![ 306 | "c3b47c335ebd9dbb9b0c9922bc258555a2cf71c9", 307 | "500a621e9e83612f51dbce15202cd7bef3c88f00", 308 | "34b904b010abf316167bba7a7ce2b4a5996cc0d1", 309 | "3bd2d9088ee5b051ada1bd30f07e7bcd390f6327", 310 | "8be47dfc0a25ec27941413619f632a1fa66e5ba5", 311 | "388e644e9240aa333fe669069bb00d418ffca500", 312 | "bd6d7dfa063ec95ebc3bad7bffd4262e3702b77c", 313 | ] 314 | ); 315 | 316 | Ok(()) 317 | } 318 | 319 | #[test] 320 | fn deleted_files_dont_have_history() -> Result<(), Error> { 321 | let gitdir = tempdir()?; 322 | let git_root = unzip_test_sample("rename_complex", gitdir.path())?; 323 | 324 | let mut git_log = GitLog::new(&git_root, GitLogConfig::default())?; 325 | 326 | let history = GitFileHistory::new(&mut git_log)?; 327 | 328 | let file_history = history.history_for(&git_root.join("z.txt"))?; 329 | 330 | assert!(file_history.is_some()); 331 | 332 | let ids: Vec<_> = file_history.unwrap().iter().map(|h| &h.id).collect(); 333 | assert_eq!( 334 | ids, 335 | // z.txt's history only includes the final commit that restored it, not the earlier file that was deleted. 
336 | vec!["3629e5a8d8d7547bac749530eb540d0f61535cd1",] 337 | ); 338 | 339 | Ok(()) 340 | } 341 | } 342 | -------------------------------------------------------------------------------- /src/git.rs: -------------------------------------------------------------------------------- 1 | use crate::flare::FlareTreeNode; 2 | use crate::git_file_history::{FileHistoryEntry, GitFileHistory}; 3 | use crate::git_logger::{CommitChange, GitLog, GitLogConfig, User}; 4 | use crate::git_user_dictionary::GitUserDictionary; 5 | use crate::polyglot_data::GitMetadata; 6 | use crate::toxicity_indicator_calculator::ToxicityIndicatorCalculator; 7 | use anyhow::{Context, Error}; 8 | use chrono::{NaiveDateTime, NaiveTime}; 9 | 10 | use serde::{Deserialize, Serialize}; 11 | 12 | use std::cmp::Ordering; 13 | use std::collections::HashSet; 14 | use std::collections::{BTreeSet, HashMap}; 15 | use std::iter::once; 16 | 17 | use std::path::Path; 18 | 19 | use git2::Repository; 20 | 21 | /// a struct representing git data for a file 22 | #[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone)] 23 | pub struct GitData { 24 | pub last_update: u64, 25 | pub age_in_days: u64, 26 | // we only have a creation date if there was an Add change in the dates scanned 27 | pub creation_date: Option<u64>, 28 | pub user_count: usize, 29 | pub users: Vec<usize>, // dictionary IDs 30 | pub details: Vec<GitDetails>, 31 | pub activity: Vec<GitActivity>, 32 | } 33 | 34 | /// Git information for a given day _and_ unique set of users, summarized 35 | /// New as of 0.3.3 - we now generate new `GitDetails` per user set - the file format hasn't changed but 36 | /// instead of a single `GitDetails` per day, there might be multiple. 37 | /// Also dates are summarized by "author date" - had to pick author or commit date, and 38 | /// author dates seem more reliable. But it's named "`commit_day`" as that's more understandable 39 | /// WIP: for better coupling data, I want individual commits, rather than summarizing per day. 40 | #[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone)] 41 | pub struct GitDetails { 42 | /// Note this is based on "author date" - commit dates can be all over the place with PRs, rebasing and the like. 
43 | pub commit_day: u64, 44 | pub users: BTreeSet<usize>, // dictionary IDs, ordered 45 | pub commits: u64, 46 | pub lines_added: u64, 47 | pub lines_deleted: u64, 48 | } 49 | 50 | impl Ord for GitDetails { 51 | fn cmp(&self, other: &Self) -> Ordering { 52 | let day_ordering = self.commit_day.cmp(&other.commit_day); 53 | if day_ordering != Ordering::Equal { 54 | return day_ordering; 55 | } 56 | self.users.cmp(&other.users) 57 | } 58 | } 59 | 60 | impl PartialOrd for GitDetails { 61 | fn partial_cmp(&self, other: &Self) -> Option<Ordering> { 62 | Some(self.cmp(other)) 63 | } 64 | } 65 | 66 | /// this is the key to keep details stored uniquely 67 | #[derive(Debug, PartialEq, Eq, Hash)] 68 | struct GitDetailsKey { 69 | pub commit_day: u64, 70 | pub users: BTreeSet<usize>, 71 | } 72 | 73 | /// Fine-grained git activity, for the fine-grained coupling calculations 74 | /// this is very verbose so probably shouldn't be kept in final JSON 75 | #[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone)] 76 | pub struct GitActivity { 77 | pub author_time: u64, 78 | pub commit_time: u64, 79 | pub users: BTreeSet<usize>, // dictionary IDs 80 | pub change: CommitChange, 81 | pub lines_added: u64, 82 | pub lines_deleted: u64, 83 | } 84 | impl Ord for GitActivity { 85 | fn cmp(&self, other: &Self) -> Ordering { 86 | self.commit_time.cmp(&other.commit_time) 87 | } 88 | } 89 | 90 | impl PartialOrd for GitActivity { 91 | fn partial_cmp(&self, other: &Self) -> Option<Ordering> { 92 | Some(self.cmp(other)) 93 | } 94 | } 95 | 96 | /// History of any git roots discovered by the calculator 97 | /// Split from `GitCalculator` as we need to mutate the dictionary while borrowing the history immutably 98 | #[derive(Debug)] 99 | pub struct GitHistories { 100 | git_file_histories: Vec<GitFileHistory>, 101 | /// config used to initialize any git histories 102 | git_log_config: GitLogConfig, 103 | } 104 | 105 | #[derive(Debug)] 106 | pub struct GitCalculator { 107 | histories: GitHistories, 108 | dictionary: GitUserDictionary, 109 | } 110 | 111 | // Git data for a directory - just remote git info 112 | #[derive(Debug, Clone, PartialEq, Eq, Serialize)] 113 | pub struct GitInfo { 114 | pub remote_url: Option<String>, 115 | pub head: Option<String>, 116 | } 117 | 118 | // Git data for a file _or_ a directory 119 | #[derive(Debug, PartialEq, Eq, Clone, Serialize)] 120 | #[serde(untagged)] 121 | pub enum GitNodeData { 122 | File { 123 | #[serde(flatten)] 124 | data: GitData, 125 | }, 126 | Dir { 127 | #[serde(flatten)] 128 | data: GitInfo, 129 | }, 130 | } 131 | 132 | fn repository_head(repository: &Repository) -> Result<String, Error> { 133 | let head = repository.head()?; 134 | let head_ref = head.resolve()?; 135 | Ok(head_ref.peel_to_commit()?.id().to_string()) 136 | } 137 | 138 | impl GitInfo { 139 | pub fn new(path: &Path, repository: &Repository) -> Self { 140 | let remote = repository.find_remote("origin"); 141 | let remote_url = match remote { 142 | Err(e) => { 143 | warn!("Error fetching origin for {:?}: {}", path, e); 144 | None 145 | } 146 | Ok(remote) => remote.url().map(str::to_owned), 147 | }; 148 | let head = match repository_head(repository) { 149 | Err(e) => { 150 | warn!("Error fetching head for {:?}: {}", path, e); 151 | None 152 | } 153 | Ok(head) => Some(head), 154 | }; 155 | GitInfo { remote_url, head } 156 | } 157 | } 158 | 159 | fn start_of_day(secs_since_epoch: u64) -> u64 { 160 | let date_time = NaiveDateTime::from_timestamp(secs_since_epoch as i64, 0); 161 | date_time 162 | .date() 163 | .and_time(NaiveTime::from_num_seconds_from_midnight(0, 0)) 164 | .timestamp() as u64 165 | }
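// A quick worked example (not from the original source) of what start_of_day
// computes: 90_000 seconds after the epoch is 01:00:00 UTC on 1970-01-02,
// which truncates to midnight of that day, i.e. 86_400 = 60 * 60 * 24:
//
//     assert_eq!(start_of_day(90_000), 86_400);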
166 | impl GitHistories { 167 | fn git_history(&self, filename: &Path) -> Option<&GitFileHistory> { 168 | self.git_file_histories 169 | .iter() 170 | .find(|h| h.is_repo_for(filename).unwrap()) 171 | // TODO can we get rid of unwrap here? 172 | // it's tricky as we can't return a Result. 173 | } 174 | 175 | fn add_history_for(&mut self, filename: &Path) -> Result<(), Error> { 176 | info!("Adding new git log for {:?}", &filename); 177 | let mut git_log = GitLog::new(filename, self.git_log_config)?; 178 | info!("Found working dir: {:?}", git_log.workdir()); 179 | let history = GitFileHistory::new(&mut git_log)?; 180 | self.git_file_histories.push(history); 181 | Ok(()) 182 | } 183 | fn unique_changers( 184 | history: &FileHistoryEntry, 185 | dictionary: &mut GitUserDictionary, 186 | ) -> BTreeSet<usize> { 187 | let mut users: Vec<&User> = history 188 | .co_authors 189 | .iter() 190 | .chain(once(&history.author)) 191 | .chain(once(&history.committer)) 192 | .collect(); 193 | users.sort(); 194 | users.dedup(); 195 | // this used to use a HashSet but I want deterministic ordering and so I want it in a vec anyway 196 | users.into_iter().map(|u| dictionary.register(u)).collect() 197 | } 198 | 199 | fn stats_from_history( 200 | dictionary: &mut GitUserDictionary, 201 | last_commit: u64, 202 | history: &[FileHistoryEntry], 203 | ) -> Option<GitData> { 204 | // for now, just get latest change - maybe non-trivial change? (i.e. ignore rename/copy) - or this could be configurable 205 | // and get set of all authors - maybe deduplicate by email. 206 | if history.is_empty() { 207 | return None; 208 | } 209 | let mut details: HashMap<GitDetailsKey, GitDetails> = HashMap::new(); 210 | 211 | let first_date = history.iter().map(|h| h.author_time).min(); 212 | 213 | let mut creation_date = history 214 | .iter() 215 | .filter(|h| h.change == CommitChange::Add) 216 | .map(|h| h.author_time) 217 | .min(); 218 | 219 | if let Some(creation) = creation_date { 220 | // TODO: test this! 
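// Sanity check (a reading of the code below, not an original comment): if any
// change predates the earliest Add we saw, that Add can't be the file's real
// creation - e.g. the true Add predates the scanned log window - so the
// creation date is treated as unknown.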
221 | if first_date.unwrap() < creation { 222 | debug!( 223 | "File has a git date {:?} before the first Add operation {:?}", 224 | first_date.unwrap(), 225 | creation 226 | ); 227 | creation_date = None; 228 | } 229 | } 230 | 231 | let last_update = history.iter().map(|h| h.commit_time).max()?; 232 | 233 | let age_in_days = (last_commit - last_update) / (60 * 60 * 24); 234 | 235 | let changers: HashSet<usize> = history 236 | .iter() 237 | .flat_map(|h| GitHistories::unique_changers(h, dictionary)) 238 | .collect(); 239 | 240 | let mut activity_vec: Vec<GitActivity> = Vec::new(); 241 | 242 | for entry in history { 243 | let author_day = start_of_day(entry.author_time); 244 | let unique_changers = GitHistories::unique_changers(entry, dictionary); 245 | let key = GitDetailsKey { 246 | commit_day: author_day, 247 | users: unique_changers.clone(), 248 | }; 249 | let daily_details = details.entry(key).or_insert(GitDetails { 250 | commit_day: author_day, 251 | users: unique_changers.clone(), 252 | commits: 0, 253 | lines_added: 0, 254 | lines_deleted: 0, 255 | }); 256 | daily_details.commits += 1; 257 | daily_details 258 | .users 259 | .extend(unique_changers.clone().into_iter()); 260 | daily_details.lines_added += entry.lines_added; 261 | daily_details.lines_deleted += entry.lines_deleted; 262 | 263 | let activity: GitActivity = GitActivity { 264 | commit_time: entry.commit_time, 265 | author_time: entry.author_time, 266 | users: unique_changers, 267 | change: entry.change, 268 | lines_added: entry.lines_added, 269 | lines_deleted: entry.lines_deleted, 270 | }; 271 | activity_vec.push(activity); 272 | } 273 | 274 | let mut changer_list: Vec<usize> = changers.into_iter().collect(); 275 | changer_list.sort_unstable(); 276 | 277 | let mut details_vec: Vec<GitDetails> = details 278 | .into_iter() 279 | .map(|(_k, v)| v) 280 | .collect::<Vec<GitDetails>>(); 281 | details_vec.sort(); 282 | 283 | Some(GitData { 284 | last_update, 285 | age_in_days, 286 | creation_date, 287 | user_count: changer_list.len(), 288 | users: changer_list, 289 | details: details_vec, 290 | activity: activity_vec, 291 | }) 292 | } 293 | } 294 | 295 | impl GitCalculator { 296 | pub fn new(config: GitLogConfig) -> Self { 297 | GitCalculator { 298 | histories: GitHistories { 299 | git_file_histories: Vec::new(), 300 | git_log_config: config, 301 | }, 302 | dictionary: GitUserDictionary::default(), 303 | } 304 | } 305 | } 306 | 307 | impl ToxicityIndicatorCalculator for GitCalculator { 308 | fn name(&self) -> String { 309 | "git".to_string() 310 | } 311 | fn visit_node(&mut self, node: &mut FlareTreeNode, path: &Path) -> Result<(), Error> { 312 | if path.is_file() { 313 | // TODO: refactor this into a method on histories (I tried this but got into a mess with mutable and immutable refs to self!) 
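// Histories are loaded lazily, one per discovered repository: reuse the first
// GitFileHistory whose working directory contains this file, otherwise build a
// new git log rooted at the file's repository and look the history up again.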
314 | let history = match self.histories.git_history(path) { 315 | Some(history) => history, 316 | None => { 317 | info!("Loading git history for {}", path.display()); 318 | self.histories 319 | .add_history_for(path) 320 | .with_context(|| format!("Loading git history based on {:?}", path))?; 321 | info!("history loaded."); 322 | self.histories.git_history(path).unwrap() 323 | } 324 | }; 325 | let last_commit = history.last_commit(); 326 | let file_history = history 327 | .history_for(path) 328 | .with_context(|| format!("getting git file history for {:?}", path))?; 329 | 330 | if let Some(file_history) = file_history { 331 | let stats = GitHistories::stats_from_history( 332 | &mut self.dictionary, 333 | last_commit, 334 | file_history, 335 | ); 336 | node.indicators_mut().git = stats.map(|stats| GitNodeData::File { data: stats }); 337 | } else { 338 | // probably outside date range 339 | debug!("No git history found for file: {:?}", path); 340 | } 341 | } else { 342 | let git_path = path.join(".git"); 343 | if git_path.is_dir() { 344 | match Repository::discover(path) { 345 | Ok(repository) => { 346 | let info = GitInfo::new(path, &repository); 347 | node.indicators_mut().git = Some(GitNodeData::Dir { data: info }); 348 | } 349 | Err(e) => { 350 | warn!( 351 | "Can't find git repository at {:?}, {} - ignoring .git directory", 352 | path, e 353 | ); 354 | } 355 | } 356 | } 357 | } 358 | Ok(()) 359 | } 360 | 361 | fn apply_metadata( 362 | &self, 363 | metadata: &mut crate::polyglot_data::IndicatorMetadata, 364 | ) -> Result<(), Error> { 365 | metadata.git = Some(GitMetadata { 366 | users: self.dictionary.clone(), 367 | }); 368 | Ok(()) 369 | } 370 | } 371 | 372 | // Hacky - I need this constructor for coupling tests, until I build better integration tests 373 | #[cfg(test)] 374 | impl GitData { 375 | pub fn fake_with_activity(activity: Vec<GitActivity>) -> Self { 376 | Self { 377 | last_update: 0, 378 | age_in_days: 0, 379 | creation_date: None, 380 | user_count: 0, 381 | users: Vec::new(), 382 | details: Vec::new(), 383 | activity, 384 | } 385 | } 386 | } 387 | 388 | #[cfg(test)] 389 | mod test { 390 | use super::*; 391 | use crate::git_file_history::FileHistoryEntryBuilder; 392 | use crate::git_logger::{CommitChange, User}; 393 | use pretty_assertions::assert_eq; 394 | 395 | lazy_static! 
{ 396 | static ref USER_JO: User = User::new(None, Some("jo@smith.com")); 397 | static ref USER_X: User = User::new(None, Some("x@smith.com")); 398 | static ref USER_Y: User = User::new(Some("Why"), Some("y@smith.com")); 399 | } 400 | 401 | #[test] 402 | fn gets_basic_stats_from_git_events() -> Result<(), Error> { 403 | let one_day_in_secs: u64 = 60 * 60 * 24; 404 | 405 | let first_day = one_day_in_secs; 406 | 407 | let events: Vec<FileHistoryEntry> = vec![ 408 | FileHistoryEntryBuilder::test_default() 409 | .emails("jo@smith.com") 410 | .times(first_day) 411 | .id("1111") 412 | .build() 413 | .map_err(Error::msg)?, 414 | FileHistoryEntryBuilder::test_default() 415 | .emails("x@smith.com") 416 | .times(first_day + 3 * one_day_in_secs) 417 | .author(User::new(Some("Why"), Some("y@smith.com"))) 418 | .id("2222") 419 | .build() 420 | .map_err(Error::msg)?, 421 | ]; 422 | let mut dictionary = GitUserDictionary::default(); 423 | 424 | let today = first_day + 5 * one_day_in_secs; 425 | 426 | let stats = GitHistories::stats_from_history(&mut dictionary, today, &events).unwrap(); 427 | 428 | assert_eq!(stats.last_update, first_day + 3 * one_day_in_secs); 429 | assert_eq!(stats.age_in_days, 2); 430 | assert_eq!(stats.creation_date, Some(86400)); 431 | assert_eq!(stats.user_count, 3); 432 | assert_eq!(stats.users, vec![0, 1, 2]); 433 | // don't assert details - details used to be optional, so it is tested in the next test. 434 | 435 | assert_eq!(dictionary.user_count(), 3); 436 | assert_eq!(dictionary.user_id(&USER_JO), Some(&0)); 437 | assert_eq!(dictionary.user_id(&USER_X), Some(&1)); 438 | assert_eq!(dictionary.user_id(&USER_Y), Some(&2)); 439 | 440 | Ok(()) 441 | } 442 | 443 | #[test] 444 | fn gets_detailed_stats_from_git_events() -> Result<(), Error> { 445 | let one_day_in_secs: u64 = 60 * 60 * 24; 446 | 447 | let first_day = one_day_in_secs; 448 | 449 | let events: Vec<FileHistoryEntry> = vec![ 450 | FileHistoryEntryBuilder::test_default() 451 | .emails("jo@smith.com") 452 | .times(first_day) 453 | .id("1111") 454 | .build() 455 | .map_err(Error::msg)?, 456 | FileHistoryEntryBuilder::test_default() 457 | .emails("jo@smith.com") 458 | .times(first_day) 459 | .author(User::new(Some("Why"), Some("y@smith.com"))) // second author so new stats 460 | .id("1111") 461 | .build() 462 | .map_err(Error::msg)?, 463 | FileHistoryEntryBuilder::test_default() 464 | .emails("x@smith.com") 465 | .times(first_day + 3 * one_day_in_secs) 466 | .author(User::new(Some("Why"), Some("y@smith.com"))) 467 | .id("2222") 468 | .build() 469 | .map_err(Error::msg)?, 470 | ]; 471 | 472 | let mut dictionary = GitUserDictionary::default(); 473 | 474 | let today = first_day + 5 * one_day_in_secs; 475 | 476 | let stats = GitHistories::stats_from_history(&mut dictionary, today, &events); 477 | 478 | let jo_set: BTreeSet<usize> = vec![0].into_iter().collect(); 479 | let xy_set: BTreeSet<usize> = vec![1, 2].into_iter().collect(); 480 | let jo_y_set: BTreeSet<usize> = vec![0, 1].into_iter().collect(); 481 | 482 | let expected_details: Vec<GitDetails> = vec![ 483 | GitDetails { 484 | commit_day: 86400, 485 | users: jo_set.clone(), 486 | commits: 1, 487 | lines_added: 0, 488 | lines_deleted: 0, 489 | }, 490 | GitDetails { 491 | commit_day: 86400, 492 | users: jo_y_set.clone(), 493 | commits: 1, 494 | lines_added: 0, 495 | lines_deleted: 0, 496 | }, 497 | GitDetails { 498 | commit_day: 345_600, 499 | users: xy_set.clone(), 500 | commits: 1, 501 | lines_added: 0, 502 | lines_deleted: 0, 503 | }, 504 | ]; 505 | 506 | let expected_activity: Vec<GitActivity> = vec![ 507 | GitActivity { 508 | author_time: 86400, 509 | 
commit_time: 86400, 510 | users: jo_set, 511 | change: CommitChange::Add, 512 | lines_added: 0, 513 | lines_deleted: 0, 514 | }, 515 | GitActivity { 516 | author_time: 86400, 517 | commit_time: 86400, 518 | users: jo_y_set, 519 | change: CommitChange::Add, 520 | lines_added: 0, 521 | lines_deleted: 0, 522 | }, 523 | GitActivity { 524 | author_time: 345_600, 525 | commit_time: 345_600, 526 | users: xy_set, 527 | change: CommitChange::Add, 528 | lines_added: 0, 529 | lines_deleted: 0, 530 | }, 531 | ]; 532 | 533 | assert_eq!( 534 | stats, 535 | Some(GitData { 536 | last_update: first_day + 3 * one_day_in_secs, 537 | age_in_days: 2, 538 | creation_date: Some(86400), 539 | user_count: 3, 540 | users: vec![0, 1, 2], 541 | details: expected_details, 542 | activity: expected_activity, 543 | }) 544 | ); 545 | 546 | assert_eq!(dictionary.user_count(), 3); 547 | assert_eq!(dictionary.user_id(&USER_JO), Some(&0)); 548 | assert_eq!(dictionary.user_id(&USER_Y), Some(&1)); 549 | assert_eq!(dictionary.user_id(&USER_X), Some(&2)); 550 | 551 | Ok(()) 552 | } 553 | } 554 | -------------------------------------------------------------------------------- /src/git_logger.rs: -------------------------------------------------------------------------------- 1 | #![warn(clippy::all)] 2 | use crate::git_file_future::{FileNameChange, GitFileFutureRegistry}; 3 | use anyhow::Error; 4 | use git2::Revwalk; 5 | use git2::{Commit, Delta, DiffDelta, ObjectType, Odb, Oid, Patch, Repository, Tree}; 6 | use regex::Regex; 7 | use serde::{Deserialize, Serialize}; 8 | use std::cell::RefCell; 9 | use std::path::{Path, PathBuf}; 10 | use std::rc::Rc; 11 | use std::time::{Duration, SystemTime}; 12 | 13 | #[derive(Debug, Clone, Copy)] 14 | pub struct GitLogConfig { 15 | /// include merge commits in file stats - usually excluded by `git log` - see https://stackoverflow.com/questions/37801342/using-git-log-to-display-files-changed-during-merge 16 | include_merges: bool, 17 | /// earliest commit for filtering 18 | earliest_time: Option<u64>, 19 | } 20 | 21 | impl GitLogConfig { 22 | pub fn default() -> GitLogConfig { 23 | GitLogConfig { 24 | include_merges: false, 25 | earliest_time: None, 26 | } 27 | } 28 | 29 | #[allow(dead_code)] 30 | pub fn include_merges(self, include_merges: bool) -> GitLogConfig { 31 | let mut config = self; 32 | config.include_merges = include_merges; 33 | config 34 | } 35 | /// filter log by unix timestamp 36 | pub fn since(self, earliest_time: Option<u64>) -> GitLogConfig { 37 | let mut config = self; 38 | config.earliest_time = earliest_time; 39 | config 40 | } 41 | /// filter log by number of years before now 42 | pub fn since_years(self, years: Option<u64>) -> GitLogConfig { 43 | if let Some(years) = years { 44 | let years_ago = SystemTime::now() - Duration::from_secs(60 * 60 * 24 * 365 * years); 45 | let years_ago_secs = years_ago 46 | .duration_since(SystemTime::UNIX_EPOCH) 47 | .unwrap() 48 | .as_secs(); 49 | self.since(Some(years_ago_secs)) 50 | } else { 51 | self.since(None) 52 | } 53 | } 54 | } 55 | 56 | pub struct GitLog { 57 | /// repo work dir - always canonical 58 | workdir: PathBuf, 59 | repo: Repository, 60 | config: GitLogConfig, 61 | } 62 | 63 | pub struct GitLogIterator<'a> { 64 | git_log: &'a GitLog, 65 | odb: Odb<'a>, 66 | revwalk: Revwalk<'a>, 67 | // this is an RC as we need to use it after the iterator has been consumed 68 | git_file_future_registry: Rc<RefCell<GitFileFutureRegistry>>, 69 | } 70 | 71 | /// simplified user info - based on `git2::Signature` 72 | /// everything is derived, seems to work OK as the structure is so 
simple 73 | #[derive(Debug, PartialEq, Eq, Hash, Clone, PartialOrd, Ord, Serialize)] 74 | pub struct User { 75 | name: Option<String>, 76 | email: Option<String>, 77 | } 78 | 79 | impl User { 80 | pub fn new(name: Option<&str>, email: Option<&str>) -> User { 81 | User { 82 | name: name.map(std::borrow::ToOwned::to_owned), 83 | email: email.map(std::borrow::ToOwned::to_owned), 84 | } 85 | } 86 | 87 | pub fn as_lower_case(&self) -> User { 88 | User { 89 | name: self.name.as_ref().map(|s| s.to_lowercase()), 90 | email: self.email.as_ref().map(|s| s.to_lowercase()), 91 | } 92 | } 93 | } 94 | 95 | /// simplified commit log entry 96 | #[derive(Debug, Serialize, Clone, Getters)] 97 | pub struct GitLogEntry { 98 | id: String, 99 | summary: String, 100 | parents: Vec<String>, 101 | committer: User, 102 | commit_time: u64, 103 | author: User, 104 | author_time: u64, 105 | co_authors: Vec<User>, 106 | file_changes: Vec<FileChange>, 107 | } 108 | 109 | /// the various kinds of git change we care about - a serializable subset of `git2::Delta` 110 | #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq, Copy)] 111 | pub enum CommitChange { 112 | Add, 113 | Rename, 114 | Delete, 115 | Modify, 116 | Copied, 117 | } 118 | 119 | /// Stats for file changes 120 | #[derive(Debug, Serialize, Clone, Getters)] 121 | pub struct FileChange { 122 | file: PathBuf, 123 | old_file: Option<PathBuf>, 124 | change: CommitChange, 125 | lines_added: u64, 126 | lines_deleted: u64, 127 | } 128 | 129 | impl GitLog { 130 | pub fn workdir(&self) -> &Path { 131 | &self.workdir 132 | } 133 | 134 | pub fn new(start_dir: &Path, config: GitLogConfig) -> Result<GitLog, Error> { 135 | let repo = Repository::discover(start_dir)?; 136 | 137 | let workdir = repo 138 | .workdir() 139 | .ok_or_else(|| anyhow!("bare repository - no workdir"))? 140 | .canonicalize()?; 141 | 142 | debug!("work dir: {:?}", workdir); 143 | 144 | Ok(GitLog { 145 | workdir, 146 | repo, 147 | config, 148 | }) 149 | } 150 | 151 | pub fn iterator(&self) -> Result<GitLogIterator<'_>, Error> { 152 | let odb = self.repo.odb()?; 153 | let mut revwalk = self.repo.revwalk()?; 154 | revwalk.set_sorting(git2::Sort::TOPOLOGICAL)?; 155 | revwalk.push_head()?; 156 | Ok(GitLogIterator { 157 | git_log: self, 158 | odb, 159 | revwalk, 160 | git_file_future_registry: Rc::new(RefCell::new(GitFileFutureRegistry::new())), 161 | }) 162 | } 163 | } 164 | 165 | impl<'a> Iterator for GitLogIterator<'a> { 166 | type Item = Result<GitLogEntry, Error>; 167 | 168 | fn next(&mut self) -> Option<Self::Item> { 169 | let mut next_item = self.revwalk.next(); 170 | while next_item.is_some() { 171 | let oid = next_item.unwrap(); 172 | // this is a bit ugly - revwalk iterates over Result types, so some entries aren't Oids at all 173 | // but I want an error context, and it's easier to create it here than in all the spots later that might 174 | // return errors. 175 | let error_context = if let Ok(valid_oid) = oid { 176 | format!("Processing oid {:?}", valid_oid) 177 | } else { 178 | "Processing unknown oid from revwalk".to_string() 179 | }; 180 | let c = self.summarise_commit(oid); 181 | match c { 182 | Ok(Some(c)) => { 183 | let commit_in_range = self 184 | .git_log 185 | .config 186 | .earliest_time 187 | .map_or(true, |earliest| c.commit_time >= earliest); 188 | 189 | if commit_in_range { 190 | self.register_file_futures(&c); 191 | return Some(Ok(c)); 192 | } else { 193 | return None; // short circuit! 
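// Note on the short circuit above: ending iteration (rather than skipping this
// commit and continuing) assumes the topological walk yields newer commits
// before older ones, so everything remaining would also be out of range.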
194 | } 195 | } 196 | Ok(None) => {} 197 | Err(e) => return Some(Err(e.context(error_context))), 198 | }; 199 | next_item = self.revwalk.next(); 200 | } 201 | None 202 | } 203 | } 204 | 205 | impl<'a> GitLogIterator<'a> { 206 | pub fn git_file_future_registry(&self) -> Rc<RefCell<GitFileFutureRegistry>> { 207 | self.git_file_future_registry.clone() 208 | } 209 | 210 | /// registers renames and deletes 211 | fn register_file_futures(&mut self, entry: &GitLogEntry) { 212 | // TODO: probably should be using Oid not String globally, then this would be simpler: 213 | let parents: Vec<Oid> = entry 214 | .parents 215 | .iter() 216 | .map(|id| Oid::from_str(id).unwrap()) 217 | .collect(); 218 | let mut file_changes: Vec<(PathBuf, FileNameChange)> = Vec::new(); 219 | for file_change in &entry.file_changes { 220 | match file_change.change { 221 | CommitChange::Rename => { 222 | let old_name = file_change.old_file.as_ref().unwrap().clone(); 223 | let new_name = file_change.file.clone(); 224 | file_changes.push((old_name, FileNameChange::Renamed(new_name))); 225 | } 226 | CommitChange::Delete => { 227 | let name = file_change.file.clone(); 228 | file_changes.push((name, FileNameChange::Deleted())); 229 | } 230 | _ => (), 231 | } 232 | } 233 | self.git_file_future_registry.borrow_mut().register( 234 | &Oid::from_str(&entry.id).unwrap(), 235 | &parents, 236 | &file_changes, 237 | ); 238 | } 239 | 240 | /// Summarises a git commit 241 | /// returns Error if error, Ok(None) if the id was not actually a commit, or Ok(Some(entry)) if valid 242 | fn summarise_commit( 243 | &self, 244 | oid: Result<Oid, git2::Error>, 245 | ) -> Result<Option<GitLogEntry>, Error> { 246 | let oid = oid?; 247 | let kind = self.odb.read(oid)?.kind(); 248 | match kind { 249 | ObjectType::Commit => { 250 | let commit = self.git_log.repo.find_commit(oid)?; 251 | debug!("processing {:?}", commit); 252 | let author = commit.author(); 253 | let committer = commit.committer(); 254 | let author_time = author.when().seconds() as u64; 255 | let commit_time = committer.when().seconds() as u64; 256 | let other_time = commit.time().seconds() as u64; 257 | if commit_time != other_time { 258 | error!( 259 | "Commit {:?} time {:?} != commit time {:?}", 260 | commit, other_time, commit_time 261 | ); 262 | } 263 | let co_authors = if let Some(message) = commit.message() { 264 | find_coauthors(message) 265 | } else { 266 | Vec::new() 267 | }; 268 | 269 | let commit_tree = commit.tree()?; 270 | let file_changes = commit_file_changes( 271 | &self.git_log.repo, 272 | &commit, 273 | &commit_tree, 274 | self.git_log.config, 275 | ); 276 | Ok(Some(GitLogEntry { 277 | id: oid.to_string(), 278 | summary: commit.summary().unwrap_or("[no message]").to_string(), 279 | parents: commit.parent_ids().map(|p| p.to_string()).collect(), 280 | committer: signature_to_user(&committer), 281 | commit_time, 282 | author: signature_to_user(&author), 283 | author_time, 284 | co_authors, 285 | file_changes, 286 | })) 287 | } 288 | _ => { 289 | info!("ignoring object type: {}", kind); 290 | Ok(None) 291 | } 292 | } 293 | } 294 | } 295 | 296 | fn signature_to_user(signature: &git2::Signature<'_>) -> User { 297 | User { 298 | name: signature.name().map(std::borrow::ToOwned::to_owned), 299 | email: signature.email().map(std::borrow::ToOwned::to_owned), 300 | } 301 | } 302 | 303 | fn trim_string(s: &str) -> Option<&str> { 304 | let trimmed = s.trim(); 305 | if trimmed.is_empty() { 306 | None 307 | } else { 308 | Some(trimmed) 309 | } 310 | } 311 | 312 | fn find_coauthors(message: &str) -> Vec<User> { 313 | lazy_static! 
{ 314 | static ref CO_AUTH_LINE: Regex = Regex::new(r"(?m)^\s*Co-authored-by:(.*)$").unwrap(); 315 | static ref CO_AUTH_ANGLE_BRACKETS: Regex = Regex::new(r"^(.*)<([^>]+)>\s*$").unwrap(); 316 | } 317 | 318 | CO_AUTH_LINE 319 | .captures_iter(message) 320 | .map(|capture_group| { 321 | let co_author_text = &capture_group[1]; 322 | if let Some(co_author_bits) = CO_AUTH_ANGLE_BRACKETS.captures(co_author_text) { 323 | User::new( 324 | trim_string(co_author_bits.get(1).unwrap().as_str()), 325 | trim_string(co_author_bits.get(2).unwrap().as_str()), 326 | ) 327 | } else if co_author_text.contains('@') { 328 | // no angle brackets, but an @ 329 | User::new(None, trim_string(co_author_text)) 330 | } else { 331 | User::new(trim_string(co_author_text), None) 332 | } 333 | }) 334 | .collect() 335 | } 336 | 337 | fn commit_file_changes( 338 | repo: &Repository, 339 | commit: &Commit<'_>, 340 | commit_tree: &Tree<'_>, 341 | config: GitLogConfig, 342 | ) -> Vec<FileChange> { 343 | if commit.parent_count() == 0 { 344 | info!("Commit {} has no parent", commit.id()); 345 | 346 | scan_diffs(repo, commit_tree, None, commit, None).expect("Can't scan for diffs") 347 | } else if commit.parent_count() > 1 && !config.include_merges { 348 | debug!( 349 | "Not showing file changes for merge commit {:?}", 350 | commit.id() 351 | ); 352 | Vec::new() 353 | } else { 354 | commit 355 | .parents() 356 | .flat_map(|parent| { 357 | debug!("Getting changes for parent {:?}:", parent); 358 | let parent_tree = parent.tree().expect("can't get parent tree"); 359 | scan_diffs(repo, commit_tree, Some(&parent_tree), commit, Some(&parent)) 360 | .expect("Can't scan for diffs") 361 | }) 362 | .collect() 363 | } 364 | } 365 | 366 | fn scan_diffs( 367 | repo: &Repository, 368 | commit_tree: &Tree<'_>, 369 | parent_tree: Option<&Tree<'_>>, 370 | commit: &Commit<'_>, 371 | parent: Option<&Commit<'_>>, 372 | ) -> Result<Vec<FileChange>, Error> { 373 | let mut diff = repo.diff_tree_to_tree(parent_tree, Some(commit_tree), None)?; 374 | // Identify renames, None means default settings - see https://libgit2.org/libgit2/#HEAD/group/diff/git_diff_find_similar 375 | diff.find_similar(None)?; 376 | let file_changes = diff 377 | .deltas() 378 | .enumerate() 379 | .filter_map(|(delta_index, delta)| { 380 | // can we / should we get bytes for binary changes? Adds show as 0 lines. 
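// Patch::from_diff returns Ok(None) when no text patch exists for a delta
// (binary content, for instance); the fallback below records zero line
// counts instead of failing the whole scan.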
381 | let patch = 382 | Patch::from_diff(&diff, delta_index).expect("can't get a patch from a diff"); 383 | let (_, lines_added, lines_deleted) = if let Some(patch) = patch { 384 | patch 385 | .line_stats() 386 | .expect("Couldn't get line stats from a patch") 387 | } else { 388 | warn!("No patch possible diffing {:?} -> {:?}", commit, parent); 389 | (0, 0, 0) 390 | }; 391 | summarise_delta(&delta, lines_added as u64, lines_deleted as u64) 392 | }); 393 | Ok(file_changes.collect()) 394 | } 395 | 396 | fn summarise_delta( 397 | delta: &DiffDelta<'_>, 398 | lines_added: u64, 399 | lines_deleted: u64, 400 | ) -> Option<FileChange> { 401 | match delta.status() { 402 | Delta::Added => { 403 | let name = delta.new_file().path().unwrap(); 404 | Some(FileChange { 405 | file: name.to_path_buf(), 406 | old_file: None, 407 | change: CommitChange::Add, 408 | lines_added, 409 | lines_deleted, 410 | }) 411 | } 412 | Delta::Renamed => { 413 | let old_name = delta.old_file().path().unwrap(); 414 | let new_name = delta.new_file().path().unwrap(); 415 | Some(FileChange { 416 | file: new_name.to_path_buf(), 417 | old_file: Some(old_name.to_path_buf()), 418 | change: CommitChange::Rename, 419 | lines_added, 420 | lines_deleted, 421 | }) 422 | } 423 | Delta::Deleted => { 424 | let name = delta.old_file().path().unwrap(); 425 | Some(FileChange { 426 | file: name.to_path_buf(), 427 | old_file: None, 428 | change: CommitChange::Delete, 429 | lines_added, 430 | lines_deleted, 431 | }) 432 | } 433 | Delta::Modified => { 434 | let name = delta.new_file().path().unwrap(); 435 | Some(FileChange { 436 | file: name.to_path_buf(), 437 | old_file: None, 438 | change: CommitChange::Modify, 439 | lines_added, 440 | lines_deleted, 441 | }) 442 | } 443 | Delta::Copied => { 444 | let old_name = delta.old_file().path().unwrap(); 445 | let new_name = delta.new_file().path().unwrap(); 446 | Some(FileChange { 447 | file: new_name.to_path_buf(), 448 | old_file: Some(old_name.to_path_buf()), 449 | change: CommitChange::Copied, 450 | lines_added, 451 | lines_deleted, 452 | }) 453 | } 454 | _ => { 455 | error!("Not able to handle delta of status {:?}", delta.status()); 456 | None 457 | } 458 | } 459 | } 460 | 461 | #[cfg(test)] 462 | mod test { 463 | use super::*; 464 | use pretty_assertions::assert_eq; 465 | use serde_json::json; 466 | use tempfile::tempdir; 467 | use test_shared::{assert_eq_json_file, assert_eq_json_value, unzip_test_sample}; 468 | 469 | #[test] 470 | fn users_can_be_lowercased() { 471 | assert_eq!( 472 | User::new(Some("Fred"), Some("Fred@Gmail.com")).as_lower_case(), 473 | User::new(Some("fred"), Some("fred@gmail.com")) 474 | ); 475 | assert_eq!( 476 | User::new(None, Some("Fred@Gmail.com")).as_lower_case(), 477 | User::new(None, Some("fred@gmail.com")) 478 | ); 479 | assert_eq!( 480 | User::new(Some("Fred"), None).as_lower_case(), 481 | User::new(Some("fred"), None) 482 | ); 483 | assert_eq!(User::new(None, None).as_lower_case(), User::new(None, None)); 484 | } 485 | 486 | #[test] 487 | fn authorless_message_has_no_coauthors() { 488 | assert_eq!(find_coauthors("do be do be do"), Vec::<User>::new()); 489 | } 490 | 491 | #[test] 492 | fn can_get_coauthors_from_message() { 493 | let message = "This is a commit message 494 | not valid: Co-authored-by: fred jones 495 | Co-authored-by: valid user <valid@thing.com> 496 | Co-authored-by: White Space <handles_trailing_whitespace@any-domain.com> \t\r 497 | Co-authored-by: <be.lenient@any-domain.com> 498 | Co-authored-by: bad@user <this isn't really trying to be clever> 499 | ignore random lines 500 | Co-authored-by: if there's no at it's a name 501 | Co-authored-by: if there's an @ it's email@thing.com 502 | ignore 
trailing lines 503 | "; 504 | 505 | let expected = vec![ 506 | User::new(Some("valid user"), Some("valid@thing.com")), 507 | User::new( 508 | Some("White Space"), 509 | Some("handles_trailing_whitespace@any-domain.com"), 510 | ), 511 | User::new(None, Some("be.lenient@any-domain.com")), 512 | User::new( 513 | Some("bad@user"), 514 | Some("this isn't really trying to be clever"), 515 | ), 516 | User::new(Some("if there's no at it's a name"), None), 517 | User::new(None, Some("if there's an @ it's email@thing.com")), 518 | ]; 519 | 520 | assert_eq!(find_coauthors(message), expected); 521 | } 522 | 523 | #[test] 524 | fn can_extract_basic_git_log() -> Result<(), Error> { 525 | let gitdir = tempdir()?; 526 | let git_root = unzip_test_sample("git_sample", gitdir.path())?; 527 | let git_log = GitLog::new(&git_root, GitLogConfig::default())?; 528 | 529 | assert_eq!(git_log.workdir.canonicalize()?, git_root.canonicalize()?); 530 | 531 | let err_count = git_log.iterator()?.filter(|x| Result::is_err(x)).count(); 532 | assert_eq!(err_count, 0); 533 | 534 | let entries: Vec<_> = git_log.iterator()?.filter_map(Result::ok).collect(); 535 | 536 | assert_eq_json_file(&entries, "./tests/expected/git/git_sample.json"); 537 | 538 | Ok(()) 539 | } 540 | 541 | #[test] 542 | fn git_log_can_include_merge_changes() -> Result<(), Error> { 543 | let gitdir = tempdir()?; 544 | let git_root = unzip_test_sample("git_sample", gitdir.path())?; 545 | 546 | let git_log = GitLog::new(&git_root, GitLogConfig::default().include_merges(true))?; 547 | 548 | let err_count = git_log.iterator()?.filter(Result::is_err).count(); 549 | assert_eq!(err_count, 0); 550 | 551 | let entries: Vec<_> = git_log.iterator()?.filter_map(Result::ok).collect(); 552 | 553 | assert_eq_json_file(&entries, "./tests/expected/git/git_sample_with_merges.json"); 554 | 555 | Ok(()) 556 | } 557 | 558 | #[allow(clippy::unreadable_literal)] 559 | #[test] 560 | fn git_log_can_limit_to_recent_history() -> Result<(), Error> { 561 | let gitdir = tempdir()?; 562 | let git_root = unzip_test_sample("git_sample", gitdir.path())?; 563 | 564 | let git_log = GitLog::new(&git_root, GitLogConfig::default().since(Some(1558521694)))?; 565 | 566 | let err_count = git_log.iterator()?.filter(Result::is_err).count(); 567 | assert_eq!(err_count, 0); 568 | 569 | let ids: Vec<_> = git_log 570 | .iterator()? 
571 | .filter_map(Result::ok) 572 | .map(|h| (h.summary.clone(), h.commit_time)) 573 | .collect(); 574 | assert_eq!( 575 | ids, 576 | vec![ 577 | ("renaming".to_owned(), 1558533240u64), 578 | ("just changed parent.clj".to_owned(), 1558524371u64), 579 | ("Merge branch \'fiddling\'".to_owned(), 1558521695u64) 580 | ] 581 | ); 582 | 583 | Ok(()) 584 | } 585 | 586 | #[test] 587 | fn git_log_tracks_renames() -> Result<(), Error> { 588 | let gitdir = tempdir()?; 589 | let git_root = unzip_test_sample("rename_simple", gitdir.path())?; 590 | 591 | let git_log = GitLog::new(&git_root, GitLogConfig::default())?; 592 | 593 | let err_count = git_log.iterator()?.filter(Result::is_err).count(); 594 | assert_eq!(err_count, 0); 595 | 596 | let mut entries: Vec<_> = git_log.iterator()?.filter_map(Result::ok).collect(); 597 | entries.sort_by(|a, b| a.author_time.cmp(&b.author_time)); 598 | 599 | let changes: Vec<String> = entries.iter().map(|entry| entry.summary.clone()).collect(); 600 | 601 | assert_eq!( 602 | changes, 603 | vec![ 604 | "initial commit", 605 | "unrelated commit", 606 | "moving a to c", 607 | "moving and renaming" 608 | ] 609 | ); 610 | 611 | let file_changes: Vec<Vec<FileChange>> = entries 612 | .iter() 613 | .map(|entry| { 614 | let mut entries = entry.file_changes.clone(); 615 | entries.sort_by(|a, b| a.file.cmp(&b.file)); 616 | entries 617 | }) 618 | .collect(); 619 | 620 | assert_eq_json_value( 621 | &file_changes, 622 | &json!([ 623 | [{"change":"Add", 624 | "file":"a.txt", 625 | "lines_added": 4, 626 | "lines_deleted": 0, 627 | "old_file": null} 628 | ], 629 | [{"change":"Add", 630 | "file":"b.txt", 631 | "lines_added": 1, 632 | "lines_deleted": 0, 633 | "old_file": null} 634 | ], 635 | [{"change":"Rename", 636 | "file":"c.txt", 637 | "lines_added": 0, 638 | "lines_deleted": 0, 639 | "old_file": "a.txt"} 640 | ], 641 | [{"change":"Rename", 642 | "file":"d.txt", 643 | "lines_added": 1, 644 | "lines_deleted": 0, 645 | "old_file": "c.txt"} 646 | ] 647 | ] 648 | ), 649 | ); 650 | 651 | Ok(()) 652 | } 653 | } 654 | /* 655 |