├── tests ├── data │ ├── simple │ │ ├── child │ │ │ ├── ignored.txt │ │ │ └── a.txt │ │ ├── .polyglot_code_scanner_ignore │ │ └── parent.clj │ ├── simple_linked │ │ ├── child │ │ ├── parent.clj │ │ └── .polyglot_code_scanner_ignore │ ├── languages │ │ ├── foo.unknown │ │ ├── non-utf8.properties │ │ └── pfunit_test.pf │ ├── zipped │ │ ├── git_sample.zip │ │ ├── rename_complex.zip │ │ └── rename_simple.zip │ └── builders │ │ ├── README.md │ │ └── renaming │ │ ├── build_rename_simple.sh │ │ └── build_rename_complex.sh ├── expected │ ├── simple_files.json │ ├── simple_files_with_indicators.json │ ├── integration_tests │ │ ├── loc_flare_test.json │ │ ├── git_flare_test.json │ │ └── git_detailed_flare_test.json │ └── git │ │ ├── git_sample_by_filename.json │ │ ├── git_sample.json │ │ └── git_sample_with_merges.json └── integration_tests.rs ├── .gitignore ├── release.toml ├── test_shared ├── Cargo.toml └── src │ └── lib.rs ├── TODO.md ├── src ├── toxicity_indicator_calculator.rs ├── postprocessing.rs ├── file_stats.rs ├── polyglot_data.rs ├── git_user_dictionary.rs ├── loc.rs ├── code_line_data.rs ├── lib.rs ├── indentation.rs ├── git_file_future.rs ├── file_walker.rs ├── main.rs ├── flare.rs ├── git_file_history.rs ├── git.rs └── git_logger.rs ├── LICENSE.txt ├── .github └── workflows │ ├── test-all.yml │ ├── macos-release.yml │ ├── windows-release.yml │ └── linux-release.yml ├── Cargo.toml ├── DesignDecisons.md ├── README.md └── CHANGELOG.md /tests/data/simple/child/ignored.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/data/simple_linked/child: -------------------------------------------------------------------------------- 1 | ../simple/child -------------------------------------------------------------------------------- /tests/data/simple/child/a.txt: -------------------------------------------------------------------------------- 1 | test with 2 | two lines -------------------------------------------------------------------------------- /tests/data/simple_linked/parent.clj: -------------------------------------------------------------------------------- 1 | ../simple/parent.clj -------------------------------------------------------------------------------- /tests/data/simple/.polyglot_code_scanner_ignore: -------------------------------------------------------------------------------- 1 | **/ignored.txt -------------------------------------------------------------------------------- /tests/data/simple/parent.clj: -------------------------------------------------------------------------------- 1 | (ns parent) 2 | 3 | (do 4 | (prn "wow")) -------------------------------------------------------------------------------- /tests/data/languages/foo.unknown: -------------------------------------------------------------------------------- 1 | Unknown files 2 | should be treated as code -------------------------------------------------------------------------------- /tests/data/simple_linked/.polyglot_code_scanner_ignore: -------------------------------------------------------------------------------- 1 | ../simple/.polyglot_code_scanner_ignore -------------------------------------------------------------------------------- /tests/data/languages/non-utf8.properties: -------------------------------------------------------------------------------- 1 | #test ISO 8859-1 2 | test-iso8859-1-chars=���������� 3 | 
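Aside on the two `.polyglot_code_scanner_ignore` files above: they use gitignore-style globs (here `**/ignored.txt`) to exclude files from scanning. The walker that honours them lives in `src/file_walker.rs`, which is not included in this dump, but `Cargo.toml` (below) declares the `ignore` crate, which supports exactly this kind of custom ignore file. A minimal, hypothetical sketch of the wiring - the function name and details are illustrative, not the scanner's actual code:

use ignore::WalkBuilder;
use std::path::Path;

// Hypothetical sketch only - the real logic is in src/file_walker.rs (not shown).
fn walk_with_scanner_ignores(root: &Path, follow_symlinks: bool) {
    let walker = WalkBuilder::new(root)
        .follow_links(follow_symlinks)
        // entries matching globs in this file (e.g. "**/ignored.txt") are skipped
        .add_custom_ignore_filename(".polyglot_code_scanner_ignore")
        .build();
    for entry in walker.flatten() {
        println!("would scan: {}", entry.path().display());
    }
}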
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | **/*.rs.bk 3 | .idea 4 | .vscode 5 | /.cargo 6 | /.rustc_info.json 7 | /debug 8 | -------------------------------------------------------------------------------- /tests/data/zipped/git_sample.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kornysietsma/polyglot-code-scanner/HEAD/tests/data/zipped/git_sample.zip -------------------------------------------------------------------------------- /tests/data/zipped/rename_complex.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kornysietsma/polyglot-code-scanner/HEAD/tests/data/zipped/rename_complex.zip -------------------------------------------------------------------------------- /tests/data/zipped/rename_simple.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kornysietsma/polyglot-code-scanner/HEAD/tests/data/zipped/rename_simple.zip -------------------------------------------------------------------------------- /tests/data/builders/README.md: -------------------------------------------------------------------------------- 1 | # Test data builders 2 | 3 | This directory contains scripts used to build test repositories. 4 | 5 | You shouldn't need to run anything here unless you are changing tests - the generated test data will all be checked in to git. 6 | -------------------------------------------------------------------------------- /release.toml: -------------------------------------------------------------------------------- 1 | pre-release-replacements = [ 2 | {file="CHANGELOG.md", search="Unreleased", replace="{{version}}"}, 3 | {file="CHANGELOG.md", search="ReleaseDate", replace="{{date}}"}, 4 | {file="CHANGELOG.md", search="<!-- next-header -->", replace="<!-- next-header -->\n## [Unreleased] - ReleaseDate"}, 5 | ] -------------------------------------------------------------------------------- /tests/data/languages/pfunit_test.pf: -------------------------------------------------------------------------------- 1 | module test_simple 2 | use funit 3 | 4 | contains 5 | 6 | !!! Note: no test annotation !!!
7 | subroutine not_a_test() 8 | print*,'this procedure should not be called' 9 | end subroutine not_a_test 10 | 11 | @test 12 | subroutine test_assert_true_and_false() 13 | @assertTrue(1 == 1) 14 | @assertFalse(1 == 2) 15 | end subroutine test_assert_true_and_false 16 | 17 | end module test_simple -------------------------------------------------------------------------------- /test_shared/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "test_shared" 3 | version = "0.0.1" 4 | authors = ["Korny Sietsma <korny@sietsma.com>"] 5 | description = "Shared test helpers for polyglot_code_scanner" 6 | edition = "2021" 7 | 8 | [dependencies] 9 | serde = { version = "1.0.144",features = ["derive"] } 10 | serde_json = "1.0.85" 11 | regex = "1.6.0" 12 | anyhow = "1.0.65" 13 | tempfile = "3.3.0" 14 | zip = "0.6.2" 15 | pretty_assertions = "1.3.0" 16 | log = "0.4.17" 17 | fern = "0.6.1" -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | # TODO 2 | 3 | See Trello for anything not short term (sorry for people looking at github, but 4 | I had to look at cross-repo plans and goals) 5 | 6 | Small / immediate things: 7 | 8 | - add test that checks binary files and unknown text files (e.g. erb) 9 | - refactoring - use Into<String> more ? "fn new<S: Into<String>>(name: S, is_file: bool)" allows the caller to decide... 10 | - Can we get rid of test_shared's duplication in cargo.toml ? 11 | - "-P" cli option is confusing - it's pretty printing _for logs_ ! 12 | - can we make the log default "warn"?? 13 | -------------------------------------------------------------------------------- /tests/expected/simple_files.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "test", 3 | "id": "test-id", 4 | "version": "1.0.4", 5 | "metadata": {}, 6 | "features": { 7 | "coupling": false, 8 | "git": false, 9 | "git_details": false, 10 | "file_stats": false 11 | }, 12 | "tree": { 13 | "name": "", 14 | "children": [ 15 | { 16 | "name": "child", 17 | 18 | "children": [ 19 | { 20 | "name": "a.txt" 21 | } 22 | ] 23 | }, 24 | { 25 | "name": "parent.clj" 26 | } 27 | ] 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /tests/expected/simple_files_with_indicators.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "test", 3 | "id": "test-id", 4 | "version": "1.0.4", 5 | "metadata": {}, 6 | "features": { 7 | "coupling": false, 8 | "git": false, 9 | "git_details": false, 10 | "file_stats": false 11 | }, 12 | "tree": { 13 | "name": "", 14 | "children": [ 15 | { 16 | "name": "child", 17 | "children": [ 18 | { 19 | "name": "a.txt!?" 20 | } 21 | ] 22 | }, 23 | { 24 | "name": "parent.clj!?"
25 | } 26 | ] 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/toxicity_indicator_calculator.rs: -------------------------------------------------------------------------------- 1 | #![warn(clippy::all)] 2 | 3 | use anyhow::Error; 4 | use std::path::Path; 5 | 6 | use crate::{flare::FlareTreeNode, polyglot_data::IndicatorMetadata}; 7 | 8 | /// Wrapper for the logic that calculates toxicity indicators 9 | pub trait ToxicityIndicatorCalculator: std::fmt::Debug { 10 | fn name(&self) -> String; 11 | fn visit_node(&mut self, node: &mut FlareTreeNode, path: &Path) -> Result<(), Error>; 12 | /// root-level metadata - output after all files added 13 | fn apply_metadata(&self, metadata: &mut IndicatorMetadata) -> Result<(), Error>; 14 | } 15 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2019 Kornelis Sietsma 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /src/postprocessing.rs: -------------------------------------------------------------------------------- 1 | use crate::{flare::FlareTreeNode, git::GitNodeData, ScannerConfig}; 2 | use anyhow::Error; 3 | 4 | fn remove_details(node: &mut FlareTreeNode, config: &ScannerConfig) -> Result<(), Error> { 5 | if let Some(GitNodeData::File { data }) = &mut node.indicators_mut().git { 6 | if !config.features.git_details { 7 | data.details = Vec::new(); 8 | } 9 | data.activity = Vec::new(); 10 | } 11 | for child in node.get_children_mut() { 12 | remove_details(child, config)?; 13 | } 14 | Ok(()) 15 | } 16 | 17 | pub fn postprocess_tree(tree: &mut FlareTreeNode, config: &ScannerConfig) -> Result<(), Error> { 18 | info!("Postprocessing tree before persisting"); 19 | remove_details(tree, config)?; 20 | Ok(()) 21 | } 22 | -------------------------------------------------------------------------------- /.github/workflows/test-all.yml: -------------------------------------------------------------------------------- 1 | name: test 2 | on: 3 | pull_request: 4 | branches: 5 | - "*" 6 | push: 7 | branches: 8 | - master 9 | tags-ignore: 10 | - "*" 11 | 12 | jobs: 13 | test: 14 | if: | 15 | !contains(github.event.commits[0].message, '[ci skip]') && 16 | !contains(github.event.commits[0].message, '(cargo-release)') 17 | env: 18 | RUST_BACKTRACE: "full" 19 | runs-on: ${{ matrix.os }} 20 | strategy: 21 | fail-fast: true 22 | matrix: 23 | os: [macos-latest, ubuntu-20.04, ubuntu-22.04] # removed windows as tests broken atm 24 | 25 | steps: 26 | - uses: actions/checkout@v2 27 | - name: Install Rust stable 28 | uses: actions-rs/toolchain@v1 29 | with: 30 | profile: minimal 31 | toolchain: stable 32 | components: rustfmt, clippy 33 | - uses: Swatinem/rust-cache@v1 34 | with: 35 | key: ${{ matrix.os }} 36 | - name: Test 37 | run: | 38 | cargo fmt -- --check 39 | cargo clippy --release 
40 | cargo test 41 | -------------------------------------------------------------------------------- /tests/expected/integration_tests/loc_flare_test.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "test", 3 | "id": "test-id", 4 | "version": "1.0.4", 5 | "metadata": {}, 6 | "features": { 7 | "coupling": false, 8 | "git": false, 9 | "git_details": false, 10 | "file_stats": false 11 | }, 12 | "tree": { 13 | "name": "", 14 | "children": [ 15 | { 16 | "name": "child", 17 | "children": [ 18 | { 19 | "name": "a.txt", 20 | "data": { 21 | "loc": { 22 | "language": "Plain Text", 23 | "binary": false, 24 | "blanks": 0, 25 | "code": 0, 26 | "comments": 2, 27 | "lines": 2, 28 | "bytes": 19 29 | } 30 | } 31 | } 32 | ] 33 | }, 34 | { 35 | "name": "parent.clj", 36 | "data": { 37 | "loc": { 38 | "language": "Clojure", 39 | "binary": false, 40 | "blanks": 1, 41 | "code": 3, 42 | "comments": 0, 43 | "lines": 4, 44 | "bytes": 31 45 | } 46 | } 47 | } 48 | ] 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "polyglot_code_scanner" 3 | version = "0.4.5-alpha.0" 4 | publish = false 5 | authors = ["Korny Sietsma <korny@sietsma.com>"] 6 | description = "Polyglot Code Scanner - scans source code and generates tree-structured JSON files for d3 visualisation" 7 | edition = "2021" 8 | 9 | [profile.release] 10 | debug = true 11 | 12 | [dependencies] 13 | tokei = { git = "https://github.com/kornysietsma/tokei", tag = "PolyglotV1.0.1" } 14 | ignore = "0.4.18" 15 | serde = { version = "1.0.144",features = ["derive","rc"] } 16 | erased-serde = "0.3.23" 17 | serde_json = "1.0.85" 18 | regex = "1.6.0" 19 | clap = { version = "3.2.22", features = ["derive"] } 20 | log = "0.4.17" 21 | fern = "0.6.1" 22 | clap-verbosity-flag = "1.0.1" 23 | lazy_static = "1.4.0" 24 | git2 = "0.15.0" 25 | derive_builder = "0.11.2" 26 | derive-getters = "0.2.0" 27 | content_inspector = "0.2.4" 28 | encoding_rs_io = "0.1.7" 29 | grep-searcher = "0.1.10" 30 | hdrhistogram = "7.5.2" 31 | indicatif = "0.17.1" 32 | chrono = "0.4.22" 33 | openssl = { version = "0.10.42", features=["vendored"] } 34 | path-slash = "0.2.1" 35 | uuid = { version = "1.1.2", features = ["v4"] } 36 | anyhow = "1.0.65" 37 | filetime = "0.2.17" 38 | 39 | [dev-dependencies] 40 | test_shared = { path = "test_shared" } 41 | tempfile = "3.3.0" 42 | zip = "0.6.2" 43 | pretty_assertions = "1.3.0" 44 | -------------------------------------------------------------------------------- /tests/data/builders/renaming/build_rename_simple.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -ex 2 | 3 | if [[ -d "rename_simple" ]]; then 4 | rm -r rename_simple 5 | fi 6 | 7 | mkdir rename_simple 8 | cd rename_simple 9 | 10 | git_dates() { 11 | # really simple - sets the hour only, so dates are ordered 12 | if [ -z "$1" ]; then 13 | echo "needs a param" 14 | exit 1 15 | fi 16 | export GIT_AUTHOR_DATE="2020-09-13T$1:00:00" 17 | export GIT_COMMITTER_DATE="2020-09-13T$1:00:00" 18 | } 19 | 20 | export GIT_AUTHOR_NAME="Kate Smith" 21 | export GIT_AUTHOR_EMAIL="kate@smith.com" 22 | export GIT_COMMITTER_NAME="Jay" 23 | export GIT_COMMITTER_EMAIL="Jay@smith.com" 24 | git_dates "01" 25 | 26 | git init 27 | 28 | cat <<EOF >a.txt 29 | a 30 | a 31 | a 32 | a 33 | EOF 34 | 35 | git add .
36 | 37 | git commit -m "initial commit" 38 | 39 | git_dates "02" 40 | 41 | cat <<EOF >b.txt 42 | b 43 | EOF 44 | 45 | git add . 46 | 47 | git commit -m "unrelated commit" 48 | 49 | git_dates "03" 50 | 51 | git mv a.txt c.txt 52 | 53 | git add . 54 | 55 | git commit -m "moving a to c" 56 | 57 | git_dates "04" 58 | 59 | git mv c.txt d.txt 60 | 61 | echo "d" >>d.txt 62 | 63 | git add . 64 | 65 | git commit -m "moving and renaming" 66 | 67 | cd .. 68 | 69 | if [[ -f "rename_simple.zip" ]]; then 70 | rm rename_simple.zip 71 | fi 72 | 73 | zip -r rename_simple.zip rename_simple 74 | -------------------------------------------------------------------------------- /.github/workflows/macos-release.yml: -------------------------------------------------------------------------------- 1 | name: macos-release 2 | 3 | on: 4 | push: 5 | tags: 6 | - "v*" 7 | 8 | jobs: 9 | build: 10 | name: Build on macOS 11 | runs-on: macos-latest 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Install Rust stable 15 | uses: dtolnay/rust-toolchain@master 16 | with: 17 | toolchain: stable 18 | - uses: Swatinem/rust-cache@v1 19 | - name: Build 20 | run: | 21 | cargo build --release --locked 22 | - name: Upload build artifact 23 | uses: actions/upload-artifact@v3 24 | with: 25 | name: binary 26 | path: target/release/polyglot_code_scanner 27 | test: 28 | name: Test on macOS 29 | runs-on: macos-latest 30 | steps: 31 | - uses: actions/checkout@v2 32 | - name: Install Rust stable 33 | uses: actions-rs/toolchain@v1 34 | with: 35 | profile: minimal 36 | toolchain: stable 37 | - uses: Swatinem/rust-cache@v1 38 | - name: Test 39 | run: | 40 | cargo fmt -- --check 41 | cargo clippy --release 42 | cargo test --release --locked 43 | release: 44 | runs-on: macos-latest 45 | needs: [build, test] 46 | steps: 47 | - name: Set the release tag 48 | id: set_tag 49 | run: echo ::set-output name=RELEASE_TAG::${GITHUB_REF/refs\/tags\/v/} 50 | shell: bash 51 | - name: Restore artifact from previous job 52 | uses: actions/download-artifact@v3 53 | with: 54 | name: binary 55 | - name: Upload binaries to release 56 | uses: svenstaro/upload-release-action@v1-release 57 | with: 58 | repo_token: ${{ secrets.GITHUB_TOKEN }} 59 | file: polyglot_code_scanner 60 | asset_name: polyglot-code-scanner-x86_64-macos 61 | tag: ${{ github.ref }} 62 | overwrite: true 63 | -------------------------------------------------------------------------------- /.github/workflows/windows-release.yml: -------------------------------------------------------------------------------- 1 | name: windows-release 2 | 3 | on: 4 | push: 5 | tags: 6 | - "*" 7 | 8 | jobs: 9 | build: 10 | name: Build on Windows 11 | runs-on: windows-latest 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Install Rust stable 15 | uses: dtolnay/rust-toolchain@master 16 | with: 17 | toolchain: stable 18 | - uses: Swatinem/rust-cache@v1 19 | - name: Build 20 | run: | 21 | cargo build --release --locked 22 | - name: Upload build artifact 23 | uses: actions/upload-artifact@v3 24 | with: 25 | name: binary 26 | path: target/release/polyglot_code_scanner.exe 27 | test: 28 | name: Test on Windows 29 | if: ${{ false }} # disabled as windows tests have issues with file sizes at the moment 30 | runs-on: windows-latest 31 | steps: 32 | - uses: actions/checkout@v2 33 | - name: Install Rust stable 34 | uses: actions-rs/toolchain@v1 35 | with: 36 | profile: minimal 37 | toolchain: stable 38 | - uses: Swatinem/rust-cache@v1 39 | - name: Test 40 | run: | 41 | cargo fmt -- --check 42 | cargo clippy --release
43 | cargo test --release --locked 44 | release: 45 | runs-on: windows-latest 46 | needs: [build] 47 | steps: 48 | - name: Set the release tag 49 | id: set_tag 50 | run: echo ::set-output name=RELEASE_TAG::${GITHUB_REF/refs\/tags\/v/} 51 | shell: bash 52 | - uses: actions/checkout@v2 53 | - name: Restore artifact from previous job 54 | uses: actions/download-artifact@v3 55 | with: 56 | name: binary 57 | - name: Upload binaries to release 58 | uses: svenstaro/upload-release-action@v1-release 59 | with: 60 | repo_token: ${{ secrets.GITHUB_TOKEN }} 61 | file: polyglot_code_scanner.exe 62 | asset_name: polyglot-code-scanner-x86_64-windows.exe 63 | tag: ${{ github.ref }} 64 | overwrite: true 65 | -------------------------------------------------------------------------------- /tests/expected/integration_tests/git_flare_test.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "test", 3 | "id": "test-id", 4 | "version": "1.0.4", 5 | "features": { 6 | "coupling": false, 7 | "git": true, 8 | "git_details": false, 9 | "file_stats": false 10 | }, 11 | "tree": { 12 | "name": "", 13 | "children": [ 14 | { 15 | "name": "simple", 16 | "children": [ 17 | { 18 | "name": "child", 19 | "children": [ 20 | { 21 | "name": "a_renamed.txt", 22 | "data": { 23 | "git": { 24 | "age_in_days": 0, 25 | "creation_date": 1558521386, 26 | "last_update": 1558533240, 27 | "user_count": 2, 28 | "users": [0, 1], 29 | "activity": [], 30 | "details": [] 31 | } 32 | } 33 | } 34 | ] 35 | }, 36 | { 37 | "name": "parent.clj", 38 | "data": { 39 | "git": { 40 | "age_in_days": 0, 41 | "creation_date": 1558521386, 42 | "last_update": 1558524371, 43 | "user_count": 2, 44 | "users": [0, 1], 45 | "activity": [], 46 | "details": [] 47 | } 48 | } 49 | } 50 | ] 51 | } 52 | ], 53 | "data": { 54 | "git": { 55 | "head": "93ae0c7c7cd93b3c4ea1bf103bde4deafef798ad", 56 | "remote_url": null 57 | } 58 | } 59 | }, 60 | "metadata": { 61 | "git": { 62 | "users": [ 63 | { 64 | "id": 0, 65 | "user": { 66 | "email": "korny@sietsma.com", 67 | "name": "Korny Sietsma" 68 | } 69 | }, 70 | { 71 | "id": 1, 72 | "user": { 73 | "email": "hgranger@durmstrang.de", 74 | "name": "hermoine" 75 | } 76 | } 77 | ] 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /.github/workflows/linux-release.yml: -------------------------------------------------------------------------------- 1 | name: linux-release 2 | 3 | on: 4 | push: 5 | tags: 6 | - "v*" 7 | 8 | jobs: 9 | build: 10 | name: Build on Linux 11 | container: node:alpine 12 | runs-on: ubuntu-20.04 13 | env: 14 | RUST_BACKTRACE: "full" 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Install dependencies 18 | run: | 19 | apk add --no-cache bash curl build-base openssl-dev perl tar 20 | - name: Install Rust stable 21 | uses: dtolnay/rust-toolchain@master 22 | with: 23 | toolchain: stable 24 | - uses: Swatinem/rust-cache@v1 25 | with: 26 | key: ubuntu-latest 27 | - name: Build 28 | env: 29 | RUSTFLAGS: "-C link-arg=-s" 30 | run: | 31 | cargo build --release --locked 32 | - name: Upload build artifact 33 | uses: actions/upload-artifact@v3 34 | with: 35 | name: binary 36 | path: target/release/polyglot_code_scanner 37 | test: 38 | name: Test on Linux 39 | runs-on: ubuntu-latest 40 | steps: 41 | - uses: actions/checkout@v2 42 | - name: Install Rust stable 43 | uses: actions-rs/toolchain@v1 44 | with: 45 | profile: minimal 46 | toolchain: stable 47 | components: rustfmt, clippy 48 | - uses: 
Swatinem/rust-cache@v1 49 | with: 50 | key: ubuntu-latest 51 | - name: Test 52 | run: | 53 | cargo fmt -- --check 54 | cargo clippy --release 55 | cargo test --release --locked 56 | release: 57 | runs-on: ubuntu-latest 58 | needs: [build, test] 59 | steps: 60 | - name: Restore artifact from previous job 61 | uses: actions/download-artifact@v3 62 | with: 63 | name: binary 64 | - name: Upload binaries to release 65 | uses: svenstaro/upload-release-action@v1-release 66 | with: 67 | repo_token: ${{ secrets.GITHUB_TOKEN }} 68 | file: polyglot_code_scanner 69 | asset_name: polyglot-code-scanner-x86_64-linux 70 | tag: ${{ github.ref }} 71 | overwrite: true 72 | - uses: actions/checkout@v2 73 | -------------------------------------------------------------------------------- /DesignDecisons.md: -------------------------------------------------------------------------------- 1 | # Software design decisions 2 | 3 | This file arose as I wanted somewhere for notes on _why_ I make changes - a bit like Architecture Decision Records, but it's a bit grand to call them Architecture :) 4 | 5 | Mostly because right now (Sep 2022) I'm reversing an original decision, and without having a pair to talk to, making notes here is useful for me! 6 | 7 | ## Sep 2022 - stopping using Value for Toxicity Calculators 8 | 9 | Originally I built this scanner a bit too generically. You'd think after decades of preaching "YAGNI - You Ain't Gonna Need It" I'd have learned better, but no... 10 | 11 | So the scanner used to have these fairly generic `ToxicityIndicatorCalculator` structs, which have two methods: 12 | 13 | * `calculate`, which is the heart of the calculator: a pure-ish function that returns a JSON `Value` for the calculator - e.g. for the Lines of Code one it returns a set of code line metrics. This is called for each file/dir scanned, and the returned `Value` is added to the `data` for each file/dir 14 | * `metadata`, which is called at the end, to store any metadata that the calculator generates and needs to be saved. This also used a `Value` 15 | 16 | This seemed like a good idea at the time - nice to have side-effect-free functions, and the `Value` returns meant no coupling between the calculators and the rest of the app. 17 | 18 | But, once I moved the Explorer to TypeScript, I had to re-build the types used in the Scanner in TypeScript - and I realised that really the use of `Value` meant I was bypassing the type system. And I only have 3 Indicators! So why all this effort for generic behaviour? There's no point making the Rust flexible when the TypeScript isn't! 19 | 20 | (I think when I started I had no idea how many indicators I would want, and I could see it being some kind of place I could plug in language-specific tools... like I said, YAGNI should have applied) 21 | 22 | So, I want to go to Value-less code. I can see two options: 23 | 24 | 1. Instead of `calculate` returning a `Value` I make it generic, so `calculate` returns a `T` 25 | 2. Make it a visitor instead: `calculate` takes a mutable `FlareTreeNode` parameter and changes the data it needs to. 26 | 27 | I'm going for option `2` - it feels a lot simpler. (See the sketch below for the rough shape.) 28 | 29 | The only downside here is that this made some unit tests harder. The more-generic code could be tested by throwing fake `Value` objects around for tests - the new code only accepts 'real' types. This is probably good overall, as it means the tests are closer to reality. But some things aren't well tested, except in end-to-end tests.
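To make the decision above concrete, here is a minimal sketch of option 2, using simplified stand-ins for the real types (the actual trait, with its extra `name` and `apply_metadata` methods, appears in `src/toxicity_indicator_calculator.rs` earlier in this dump). The point is that the visitor writes a concrete, typed field onto the node, where the old design returned an untyped `serde_json::Value`:

use anyhow::Error;
use std::path::Path;

// Simplified stand-ins for FlareTreeNode and its indicator data.
#[derive(Default)]
struct Indicators {
    line_count: Option<usize>, // a typed slot, where the old design stored a serde_json::Value
}

#[derive(Default)]
struct TreeNode {
    indicators: Indicators,
}

// Option 2: the calculator is a visitor that mutates the node directly.
trait Visitor {
    fn visit_node(&mut self, node: &mut TreeNode, path: &Path) -> Result<(), Error>;
}

struct LineCountCalculator;

impl Visitor for LineCountCalculator {
    fn visit_node(&mut self, node: &mut TreeNode, path: &Path) -> Result<(), Error> {
        // write a concrete type straight onto the node - no Value in sight
        node.indicators.line_count = Some(std::fs::read_to_string(path)?.lines().count());
        Ok(())
    }
}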
-------------------------------------------------------------------------------- /tests/integration_tests.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Error; 2 | use polyglot_code_scanner::ScannerConfig; 3 | use serde_json::Value; 4 | use std::io::Cursor; 5 | use std::path::PathBuf; 6 | use tempfile::tempdir; 7 | use test_shared::*; 8 | 9 | fn test_scanner_config(with_git: bool) -> ScannerConfig { 10 | let mut config = ScannerConfig::default("test"); 11 | config.data_id = Some("test-id".to_string()); 12 | config.features.git = with_git; 13 | config 14 | } 15 | 16 | #[test] 17 | fn it_calculates_lines_of_code() -> Result<(), Error> { 18 | let root = PathBuf::from("./tests/data/simple/"); 19 | 20 | let mut buffer: Vec<u8> = Vec::new(); 21 | let out = Cursor::new(&mut buffer); 22 | 23 | let result = 24 | polyglot_code_scanner::run(&root, &test_scanner_config(false), None, &["loc"], out); 25 | 26 | assert!(result.is_ok()); 27 | 28 | let parsed_result: Value = serde_json::from_reader(buffer.as_slice())?; 29 | 30 | assert_eq_json_file( 31 | &parsed_result, 32 | "./tests/expected/integration_tests/loc_flare_test.json", 33 | ); 34 | 35 | Ok(()) 36 | } 37 | 38 | #[test] 39 | fn it_calculates_git_stats() -> Result<(), Error> { 40 | let gitdir = tempdir()?; 41 | let git_root = unzip_test_sample("git_sample", gitdir.path())?; 42 | 43 | let mut buffer: Vec<u8> = Vec::new(); 44 | let out = Cursor::new(&mut buffer); 45 | 46 | let result = 47 | polyglot_code_scanner::run(&git_root, &test_scanner_config(true), None, &["git"], out); 48 | 49 | assert!(result.is_ok()); 50 | 51 | let parsed_result: Value = serde_json::from_reader(buffer.as_slice())?; 52 | 53 | assert_eq_json_file( 54 | &parsed_result, 55 | "./tests/expected/integration_tests/git_flare_test.json", 56 | ); 57 | 58 | Ok(()) 59 | } 60 | 61 | #[test] 62 | fn it_calculates_detailed_git_stats() -> Result<(), Error> { 63 | let gitdir = tempdir()?; 64 | let git_root = unzip_test_sample("git_sample", gitdir.path())?; 65 | 66 | let mut buffer: Vec<u8> = Vec::new(); 67 | let out = Cursor::new(&mut buffer); 68 | 69 | let mut config = test_scanner_config(true); 70 | config.features.git_details = true; 71 | 72 | let result = polyglot_code_scanner::run(&git_root, &config, None, &["git"], out); 73 | 74 | assert!(result.is_ok()); 75 | 76 | let parsed_result: Value = serde_json::from_reader(buffer.as_slice())?; 77 | 78 | assert_eq_json_file( 79 | &parsed_result, 80 | "./tests/expected/integration_tests/git_detailed_flare_test.json", 81 | ); 82 | 83 | Ok(()) 84 | } 85 | 86 | // TODO: add a coupling e2e test! Needs a lot of setup 87 | -------------------------------------------------------------------------------- /src/file_stats.rs: -------------------------------------------------------------------------------- 1 | use std::{fs, path::Path}; 2 | 3 | use anyhow::Error; 4 | use filetime::FileTime; 5 | use serde::Serialize; 6 | 7 | use crate::{ 8 | flare::FlareTreeNode, polyglot_data::IndicatorMetadata, 9 | toxicity_indicator_calculator::ToxicityIndicatorCalculator, 10 | }; 11 | 12 | /// File creation and modification times, in seconds since unix epoch 13 | /// using the filetime crate so Windows times are converted to unix times!
14 | #[derive(Debug, PartialEq, Eq, Clone, Serialize, Default)] 15 | pub struct FileStats { 16 | created: i64, 17 | modified: i64, 18 | } 19 | 20 | impl FileStats { 21 | fn new(path: &Path) -> Result<Self, Error> { 22 | let metadata = fs::metadata(path)?; 23 | let ctime = FileTime::from_creation_time(&metadata); 24 | let mtime = FileTime::from_last_modification_time(&metadata); 25 | match (ctime, mtime) { 26 | (Some(ctime), mtime) => Ok(FileStats { 27 | created: ctime.unix_seconds(), 28 | modified: mtime.unix_seconds(), 29 | }), 30 | (None, mtime) => { 31 | warn!("File has no ctime - using mtime"); 32 | Ok(FileStats { 33 | created: mtime.unix_seconds(), 34 | modified: mtime.unix_seconds(), 35 | }) 36 | } 37 | } 38 | } 39 | } 40 | #[derive(Debug)] 41 | pub struct FileStatsCalculator {} 42 | 43 | impl ToxicityIndicatorCalculator for FileStatsCalculator { 44 | fn name(&self) -> String { 45 | "file_stats".to_string() 46 | } 47 | 48 | fn visit_node(&mut self, node: &mut FlareTreeNode, path: &Path) -> Result<(), Error> { 49 | let file_stats = FileStats::new(path)?; 50 | node.indicators_mut().file_stats = Some(file_stats); 51 | 52 | Ok(()) 53 | } 54 | 55 | fn apply_metadata(&self, _metadata: &mut IndicatorMetadata) -> Result<(), Error> { 56 | Ok(()) 57 | } 58 | } 59 | 60 | #[cfg(test)] 61 | mod test { 62 | use std::time::SystemTime; 63 | 64 | use super::*; 65 | use std::time::UNIX_EPOCH; 66 | use tempfile::{NamedTempFile, TempDir}; 67 | 68 | #[test] 69 | fn can_get_stats_for_a_file() -> Result<(), Error> { 70 | let newfile = NamedTempFile::new()?; 71 | 72 | let stats = FileStats::new(newfile.path())?; 73 | let now: i64 = SystemTime::now() 74 | .duration_since(UNIX_EPOCH)? 75 | .as_secs() 76 | .try_into()?; 77 | 78 | assert!(stats.created > now - 1 && stats.created < now + 1); 79 | assert!(stats.modified > now - 1 && stats.modified < now + 1); 80 | 81 | Ok(()) 82 | } 83 | #[test] 84 | fn can_get_stats_for_a_dir() -> Result<(), Error> { 85 | let newdir = TempDir::new()?; 86 | 87 | let stats = FileStats::new(newdir.path())?; 88 | let now: i64 = SystemTime::now() 89 | .duration_since(UNIX_EPOCH)?
90 | .as_secs() 91 | .try_into()?; 92 | 93 | assert!(stats.created > now - 1 && stats.created < now + 1); 94 | assert!(stats.modified > now - 1 && stats.modified < now + 1); 95 | 96 | Ok(()) 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /test_shared/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![forbid(unsafe_code)] 2 | #![warn(clippy::all)] 3 | #![warn(clippy::pedantic)] 4 | #![warn(rust_2018_idioms)] 5 | 6 | use anyhow::Error; 7 | use pretty_assertions::assert_eq; 8 | use serde::Serialize; 9 | use serde_json::Value; 10 | use std::fs::File; 11 | use std::path::{Path, PathBuf}; 12 | use zip::ZipArchive; 13 | 14 | /// adapted from https://github.com/mvdnes/zip-rs/blob/master/examples/extract.rs 15 | /// Note zip files only store modification time; this sets ctime to the mtime for tests 16 | pub fn unzip_to_dir(dest: &Path, zipname: &str) -> Result<(), Error> { 17 | let fname = std::path::Path::new(zipname); 18 | let file = File::open(&fname)?; 19 | 20 | let mut archive = ZipArchive::new(file)?; 21 | 22 | for i in 0..archive.len() { 23 | let mut file = archive.by_index(i)?; 24 | let outpath = PathBuf::from(dest).join(file.mangled_name()); 25 | 26 | if (&*file.name()).ends_with('/') { 27 | std::fs::create_dir_all(&outpath)?; 28 | } else { 29 | if let Some(p) = outpath.parent() { 30 | if !p.exists() { 31 | std::fs::create_dir_all(&p)?; 32 | } 33 | } 34 | let mut outfile = std::fs::File::create(&outpath)?; 35 | std::io::copy(&mut file, &mut outfile)?; 36 | } 37 | } 38 | Ok(()) 39 | } 40 | 41 | /// unzip a zip file - assumes the name shortname.zip and contains a shortname directory in the file 42 | /// returns the working directory in the unzipped data 43 | pub fn unzip_test_sample(shortname: &str, workdir: &Path) -> Result<PathBuf, Error> { 44 | let zip_name = "tests/data/zipped/".to_owned() + shortname + ".zip"; 45 | unzip_to_dir(workdir, &zip_name)?; 46 | Ok(PathBuf::from(workdir).join(shortname)) 47 | } 48 | 49 | pub fn assert_eq_json_file<T>(actual: &T, expected_file: &str) 50 | where 51 | T: Serialize, 52 | { 53 | let expected = std::fs::read_to_string(Path::new(expected_file)).unwrap(); 54 | 55 | assert_eq_json_str(&actual, &expected) 56 | } 57 | 58 | pub fn assert_eq_json_str<T>(actual_serializable: &T, expected_json: &str) 59 | where 60 | T: Serialize, 61 | { 62 | let actual = serde_json::to_value(&actual_serializable).unwrap(); 63 | 64 | let expected: Value = serde_json::from_str(expected_json).unwrap(); 65 | assert_eq!(&actual, &expected) 66 | } 67 | 68 | pub fn assert_eq_json_value<T>(actual_serializable: &T, expected_json: &Value) 69 | where 70 | T: Serialize, 71 | { 72 | let actual = serde_json::to_value(&actual_serializable).unwrap(); 73 | 74 | assert_eq!(&actual, expected_json) 75 | } 76 | 77 | pub fn assert_eq_json(left: &str, right: &str) { 78 | let left: Value = serde_json::from_str(left).unwrap(); 79 | let right: Value = serde_json::from_str(right).unwrap(); 80 | assert_eq!(left, right); 81 | } 82 | 83 | /// install a test logger - call this in tests where you want to see log output! 84 | pub fn install_test_logger() { 85 | // This'll fail if called twice; don't worry. 86 | let _ = fern::Dispatch::new() 87 | // ...
88 | .level(log::LevelFilter::Debug) 89 | .chain(fern::Output::call(|record| println!("{}", record.args()))) 90 | .apply(); 91 | } 92 | -------------------------------------------------------------------------------- /tests/expected/integration_tests/git_detailed_flare_test.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "test", 3 | "id": "test-id", 4 | "version": "1.0.4", 5 | "features": { 6 | "coupling": false, 7 | "git": true, 8 | "git_details": true, 9 | "file_stats": false 10 | }, 11 | "tree": { 12 | "name": "", 13 | "children": [ 14 | { 15 | "name": "simple", 16 | "children": [ 17 | { 18 | "name": "child", 19 | "children": [ 20 | { 21 | "name": "a_renamed.txt", 22 | "data": { 23 | "git": { 24 | "age_in_days": 0, 25 | "creation_date": 1558521386, 26 | "last_update": 1558533240, 27 | "user_count": 2, 28 | "users": [0, 1], 29 | "activity": [], 30 | "details": [ 31 | { 32 | "commit_day": 1558483200, 33 | "commits": 5, 34 | "lines_added": 7, 35 | "lines_deleted": 3, 36 | "users": [0] 37 | }, 38 | { 39 | "commit_day": 1558483200, 40 | "commits": 1, 41 | "lines_added": 1, 42 | "lines_deleted": 1, 43 | "users": [0, 1] 44 | } 45 | ] 46 | } 47 | } 48 | } 49 | ] 50 | }, 51 | { 52 | "name": "parent.clj", 53 | "data": { 54 | "git": { 55 | "age_in_days": 0, 56 | "creation_date": 1558521386, 57 | "last_update": 1558524371, 58 | "user_count": 2, 59 | "users": [0, 1], 60 | "activity": [], 61 | "details": [ 62 | { 63 | "commit_day": 1558483200, 64 | "commits": 3, 65 | "lines_added": 8, 66 | "lines_deleted": 1, 67 | "users": [0] 68 | }, 69 | { 70 | "commit_day": 1558483200, 71 | "commits": 1, 72 | "lines_added": 3, 73 | "lines_deleted": 1, 74 | "users": [0, 1] 75 | } 76 | ] 77 | } 78 | } 79 | } 80 | ] 81 | } 82 | ], 83 | "data": { 84 | "git": { 85 | "head": "93ae0c7c7cd93b3c4ea1bf103bde4deafef798ad", 86 | "remote_url": null 87 | } 88 | } 89 | }, 90 | "metadata": { 91 | "git": { 92 | "users": [ 93 | { 94 | "id": 0, 95 | "user": { "email": "korny@sietsma.com", "name": "Korny Sietsma" } 96 | }, 97 | { 98 | "id": 1, 99 | "user": { 100 | "email": "hgranger@durmstrang.de", 101 | "name": "hermoine" 102 | } 103 | } 104 | ] 105 | } 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /tests/data/builders/renaming/build_rename_complex.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | if [[ -d "rename_complex" ]]; then 4 | rm -r rename_complex 5 | fi 6 | 7 | mkdir rename_complex 8 | cd rename_complex 9 | 10 | author_kate() { 11 | export GIT_AUTHOR_NAME="Kate Smith" 12 | export GIT_AUTHOR_EMAIL="kate@smith.com" 13 | } 14 | 15 | author_jay() { 16 | export GIT_AUTHOR_NAME="Jay" 17 | export GIT_AUTHOR_EMAIL="Jay@smith.com" 18 | } 19 | 20 | committer_jay() { 21 | export GIT_COMMITTER_NAME="Jay" 22 | export GIT_COMMITTER_EMAIL="Jay@smith.com" 23 | } 24 | 25 | author_dave() { 26 | export GIT_AUTHOR_NAME="Dave Smith" 27 | export GIT_AUTHOR_EMAIL="dave@smith.com" 28 | } 29 | 30 | git_dates() { 31 | # really simple - sets the hour only, so dates are ordered 32 | if [ -z "$1" ]; then 33 | echo "needs a param" 34 | exit 1 35 | fi 36 | export GIT_AUTHOR_DATE="2020-09-13T$1:00:00" 37 | export GIT_COMMITTER_DATE="2020-09-13T$1:00:00" 38 | } 39 | 40 | git init 41 | 42 | author_kate 43 | committer_jay 44 | 45 | git_dates "01" 46 | 47 | cat <<EOF >a.txt 48 | a 49 | a 50 | a 51 | a 52 | EOF 53 | 54 | cat <<EOF >z.txt 55 | z 56 | z 57 | z 58 | z 59 | EOF 60 | 61 | git add .
62 | git commit -am "initial commit" 63 | 64 | git_dates "02" 65 | 66 | git mv a.txt a1.txt 67 | git commit -am "rename a to a1" 68 | 69 | git_dates "03" 70 | 71 | author_dave 72 | committer_jay 73 | 74 | git checkout -b "dave_work" 75 | git mv a1.txt a2.txt 76 | echo "junk" >>a2.txt 77 | 78 | cat <<EOF >bb.txt 79 | b 80 | b 81 | b 82 | b 83 | EOF 84 | 85 | git rm z.txt 86 | 87 | git add . 88 | git commit -am "rename a1 to a2, add bb, kill z" 89 | 90 | git_dates "05" 91 | 92 | git mv bb.txt b.txt 93 | git mv a2.txt a.txt 94 | 95 | git commit -am "rename bb to b, a2 back to a" 96 | 97 | git checkout master 98 | 99 | git_dates "04" 100 | author_jay 101 | committer_jay 102 | 103 | git checkout -b "jay_work" 104 | 105 | git mv a1.txt aa.txt 106 | echo "junk!" >>aa.txt 107 | 108 | cat <<EOF >bee.txt 109 | B 110 | B 111 | B 112 | EOF 113 | 114 | git add . 115 | git commit -am "rename a1 to aa, add bee" 116 | 117 | git_dates "06" 118 | 119 | git mv bee.txt b.txt 120 | git mv aa.txt a.txt 121 | git add . 122 | git commit -m "rename bee to b, aa back to a" 123 | 124 | git checkout master 125 | 126 | git_dates "07" 127 | author_kate 128 | committer_jay 129 | 130 | git mv a1.txt a.txt 131 | git commit -m "rename a1 back to a prep merging" 132 | 133 | git merge jay_work -m "merging jay work" 134 | 135 | git_dates "08" 136 | 137 | git merge dave_work -m "merging dave work" || true # will fail! 138 | 139 | echo "fixing" 140 | 141 | git_dates "09" 142 | 143 | cat <<EOF >a.txt 144 | a 145 | a 146 | a 147 | a 148 | fixed 149 | EOF 150 | 151 | cat <<EOF >b.txt 152 | b 153 | b 154 | b 155 | b 156 | fixed 157 | EOF 158 | 159 | git commit -am "merging dave work with fixes" 160 | 161 | git_dates "10" 162 | 163 | cat <<EOF >z.txt 164 | z 165 | z 166 | z 167 | z 168 | fixed 169 | EOF 170 | 171 | git add z.txt 172 | 173 | git commit -m "restoring deleted z" 174 | 175 | cd .. 176 | 177 | if [[ -f "rename_complex.zip" ]]; then 178 | rm rename_complex.zip 179 | fi 180 | 181 | zip -r rename_complex.zip rename_complex 182 | -------------------------------------------------------------------------------- /src/polyglot_data.rs: -------------------------------------------------------------------------------- 1 | #![warn(clippy::all)] 2 | //! Data formats for JSON output from the scanner 3 | //! 4 | //! Data format should now follow semantic versioning - a major version change is incompatible, a minor version change is backward compatible, a patch version is mostly around bug fixes.
5 | 6 | use serde::Serialize; 7 | use uuid::Uuid; 8 | 9 | use crate::{ 10 | coupling::CouplingMetadata, flare::FlareTreeNode, git_user_dictionary::GitUserDictionary, 11 | FeatureFlags, 12 | }; 13 | 14 | pub static DATA_FILE_VERSION: &str = "1.0.4"; 15 | 16 | #[derive(Debug, Serialize)] 17 | pub struct GitMetadata { 18 | pub users: GitUserDictionary, 19 | } 20 | #[derive(Debug, Serialize, Default)] 21 | pub struct IndicatorMetadata { 22 | #[serde(skip_serializing_if = "Option::is_none")] 23 | pub git: Option<GitMetadata>, 24 | #[serde(skip_serializing_if = "Option::is_none")] 25 | pub coupling: Option<CouplingMetadata>, 26 | } 27 | 28 | #[derive(Debug, Serialize)] 29 | pub struct PolyglotData { 30 | version: String, 31 | name: String, 32 | id: String, 33 | tree: FlareTreeNode, 34 | metadata: IndicatorMetadata, 35 | features: FeatureFlags, 36 | } 37 | 38 | impl PolyglotData { 39 | pub fn new(name: &str, id: Option<&str>, tree: FlareTreeNode, features: FeatureFlags) -> Self { 40 | let id = id.map_or_else( 41 | || Uuid::new_v4().as_hyphenated().to_string(), 42 | std::string::ToString::to_string, 43 | ); 44 | PolyglotData { 45 | version: DATA_FILE_VERSION.to_string(), 46 | name: name.to_string(), 47 | id, 48 | tree, 49 | metadata: IndicatorMetadata::default(), 50 | features, 51 | } 52 | } 53 | pub fn tree(&self) -> &FlareTreeNode { 54 | &self.tree 55 | } 56 | pub fn tree_mut(&mut self) -> &mut FlareTreeNode { 57 | &mut self.tree 58 | } 59 | 60 | pub fn metadata(&mut self) -> &mut IndicatorMetadata { 61 | &mut self.metadata 62 | } 63 | } 64 | 65 | #[cfg(test)] 66 | mod test { 67 | use super::*; 68 | use pretty_assertions::assert_eq; 69 | #[test] 70 | fn can_build_data_tree() { 71 | let root = FlareTreeNode::dir("root"); 72 | let tree: PolyglotData = PolyglotData::new( 73 | "test", 74 | Some("test-id"), 75 | root.clone(), 76 | FeatureFlags::default(), 77 | ); 78 | 79 | let expected = PolyglotData { 80 | name: "test".to_string(), 81 | id: "test-id".to_string(), 82 | version: DATA_FILE_VERSION.to_string(), 83 | tree: root, 84 | metadata: IndicatorMetadata::default(), 85 | features: FeatureFlags::default(), 86 | }; 87 | 88 | assert_eq!(tree.name, expected.name); 89 | assert_eq!(tree.tree, expected.tree); 90 | } 91 | 92 | #[test] 93 | fn data_without_id_has_uuid() { 94 | let root = FlareTreeNode::dir("root"); 95 | let tree1: PolyglotData = 96 | PolyglotData::new("test", None, root.clone(), FeatureFlags::default()); 97 | let tree2: PolyglotData = PolyglotData::new("test", None, root, FeatureFlags::default()); 98 | // really just asserting IDs are different! 99 | assert_ne!(tree1.id, tree2.id); 100 | } 101 | 102 | // TODO: removed serializing metadata test as it no longer made sense. Do we depend on just e2e tests?
103 | } 104 | -------------------------------------------------------------------------------- /src/git_user_dictionary.rs: -------------------------------------------------------------------------------- 1 | #![warn(clippy::all)] 2 | use crate::git_logger::User; 3 | use serde::ser::SerializeSeq; 4 | use serde::{Serialize, Serializer}; 5 | use std::collections::HashMap; 6 | 7 | #[derive(Debug, Clone, Default)] 8 | pub struct GitUserDictionary { 9 | next_id: usize, 10 | lower_users: HashMap<User, usize>, 11 | users: Vec<User>, 12 | } 13 | 14 | impl GitUserDictionary { 15 | pub fn register(&mut self, user: &User) -> usize { 16 | let lower_user = user.as_lower_case(); 17 | match self.lower_users.get(&lower_user) { 18 | Some(id) => *id, 19 | None => { 20 | let result = self.next_id; 21 | self.lower_users.insert(lower_user, result); 22 | self.users.push(user.clone()); 23 | self.next_id += 1; 24 | result 25 | } 26 | } 27 | } 28 | #[cfg(test)] 29 | pub fn user_by_id(&self, user_id: usize) -> User { 30 | self.users 31 | .get(user_id) 32 | .expect("No user found matching ID!") 33 | .clone() 34 | } 35 | #[cfg(test)] 36 | pub fn user_count(&self) -> usize { 37 | self.next_id 38 | } 39 | #[cfg(test)] 40 | pub fn user_id(&self, user: &User) -> Option<&usize> { 41 | self.lower_users.get(&user.as_lower_case()) 42 | } 43 | } 44 | 45 | /// We store, rather redundantly, the user ID in the JSON, even though users are output as an array. 46 | /// This makes it easier for humans to correlate users with data without counting from 0 47 | /// It also will make it easier later to alias users to other users. 48 | #[derive(Debug, PartialEq, Serialize)] 49 | struct UserKey<'a> { 50 | id: usize, 51 | user: &'a User, 52 | } 53 | 54 | impl Serialize for GitUserDictionary { 55 | fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> 56 | where 57 | S: Serializer, 58 | { 59 | let mut seq = serializer.serialize_seq(Some(self.users.len()))?; 60 | for (id, user) in self.users.iter().enumerate() { 61 | seq.serialize_element(&UserKey { id, user })?; 62 | } 63 | seq.end() 64 | } 65 | } 66 | 67 | #[cfg(test)] 68 | mod test { 69 | use super::*; 70 | #[cfg(test)] 71 | use pretty_assertions::assert_eq; 72 | 73 | // use test_shared::*; 74 | 75 | #[test] 76 | fn users_receive_sequential_ids() { 77 | let mut dict = GitUserDictionary::default(); 78 | 79 | let jane = User::new(Some("Jane"), Some("JaneDoe@gmail.com")); 80 | let user0 = dict.register(&jane); 81 | assert_eq!(user0, 0); 82 | assert_eq!(dict.user_by_id(user0), jane); 83 | 84 | let user1 = dict.register(&User::new(Some("Jane"), None)); 85 | assert_eq!(user1, 1); 86 | let user0again = dict.register(&User::new(Some("Jane"), Some("JaneDoe@gmail.com"))); 87 | assert_eq!(user0again, 0); 88 | } 89 | 90 | #[test] 91 | fn user_checks_are_case_insensitive_and_return_first_seen_user() { 92 | let mut dict = GitUserDictionary::default(); 93 | 94 | let jane = User::new(Some("Jane"), Some("JaneDoe@gmail.com")); 95 | let lower_jane = User::new(Some("jane"), Some("janeDoe@gmail.com")); 96 | let user0 = dict.register(&jane); 97 | assert_eq!(user0, 0); 98 | // there is only one user!
99 | assert_eq!(dict.user_count(), 1); 100 | 101 | let user1 = dict.register(&lower_jane); 102 | assert_eq!(user1, 0); 103 | assert_eq!(dict.user_by_id(0), jane); 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /tests/expected/git/git_sample_by_filename.json: -------------------------------------------------------------------------------- 1 | { 2 | "simple/parent.clj": [ 3 | { 4 | "id": "0dbd54d4c524ecc776f381e660cce9b2dd92162c", 5 | "committer": { 6 | "name": "Korny Sietsma", 7 | "email": "korny@sietsma.com" 8 | }, 9 | "commit_time": 1558524371, 10 | "author": { 11 | "name": "Korny Sietsma", 12 | "email": "korny@sietsma.com" 13 | }, 14 | "author_time": 1558524371, 15 | "co_authors": [], 16 | "change": "Modify", 17 | "lines_added": 1, 18 | "lines_deleted": 0 19 | }, 20 | { 21 | "id": "a0ae9997cfdf49fd0cbf54dacc72c778af337519", 22 | "committer": { 23 | "name": "Korny Sietsma", 24 | "email": "korny@sietsma.com" 25 | }, 26 | "commit_time": 1558521550, 27 | "author": { 28 | "name": "Korny Sietsma", 29 | "email": "korny@sietsma.com" 30 | }, 31 | "author_time": 1558521550, 32 | "co_authors": [ 33 | { 34 | "name": "hermoine", 35 | "email": "hgranger@durmstrang.de" 36 | } 37 | ], 38 | "change": "Modify", 39 | "lines_added": 3, 40 | "lines_deleted": 1 41 | }, 42 | { 43 | "id": "ca239efb9b26db57ac9e2ec3e2df1c42578a46f8", 44 | "committer": { 45 | "name": "Korny Sietsma", 46 | "email": "korny@sietsma.com" 47 | }, 48 | "commit_time": 1558521386, 49 | "author": { 50 | "name": "Korny Sietsma", 51 | "email": "korny@sietsma.com" 52 | }, 53 | "author_time": 1558521386, 54 | "co_authors": [], 55 | "change": "Add", 56 | "lines_added": 4, 57 | "lines_deleted": 0 58 | } 59 | ], 60 | "simple/child/a_renamed.txt": [ 61 | { 62 | "id": "93ae0c7c7cd93b3c4ea1bf103bde4deafef798ad", 63 | "committer": { 64 | "name": "Korny Sietsma", 65 | "email": "korny@sietsma.com" 66 | }, 67 | "commit_time": 1558533240, 68 | "author": { 69 | "name": "Korny Sietsma", 70 | "email": "korny@sietsma.com" 71 | }, 72 | "author_time": 1558533240, 73 | "co_authors": [], 74 | "change": "Rename", 75 | "lines_added": 0, 76 | "lines_deleted": 0 77 | }, 78 | { 79 | "id": "a0ae9997cfdf49fd0cbf54dacc72c778af337519", 80 | "committer": { 81 | "name": "Korny Sietsma", 82 | "email": "korny@sietsma.com" 83 | }, 84 | "commit_time": 1558521550, 85 | "author": { 86 | "name": "Korny Sietsma", 87 | "email": "korny@sietsma.com" 88 | }, 89 | "author_time": 1558521550, 90 | "co_authors": [ 91 | { 92 | "name": "hermoine", 93 | "email": "hgranger@durmstrang.de" 94 | } 95 | ], 96 | "change": "Modify", 97 | "lines_added": 1, 98 | "lines_deleted": 1 99 | }, 100 | { 101 | "id": "cdf8709362c267198d04d47e55e66071fdd5f52b", 102 | "committer": { 103 | "name": "Korny Sietsma", 104 | "email": "korny@sietsma.com" 105 | }, 106 | "commit_time": 1558521648, 107 | "author": { 108 | "name": "Korny Sietsma", 109 | "email": "korny@sietsma.com" 110 | }, 111 | "author_time": 1558521648, 112 | "co_authors": [], 113 | "change": "Modify", 114 | "lines_added": 1, 115 | "lines_deleted": 2 116 | }, 117 | { 118 | "id": "ca239efb9b26db57ac9e2ec3e2df1c42578a46f8", 119 | "committer": { 120 | "name": "Korny Sietsma", 121 | "email": "korny@sietsma.com" 122 | }, 123 | "commit_time": 1558521386, 124 | "author": { 125 | "name": "Korny Sietsma", 126 | "email": "korny@sietsma.com" 127 | }, 128 | "author_time": 1558521386, 129 | "co_authors": [], 130 | "change": "Add", 131 | "lines_added": 2, 132 | "lines_deleted": 0 133 | } 134 | ] 135 | } 136 | 
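A quick illustration of the `GitUserDictionary` serializer above: ids are handed out in registration order, and the custom `Serialize` impl emits an array of `{id, user}` pairs - the shape visible under `metadata.git.users` in the expected-output fixtures. A sketch of an extra unit test along those lines (it would sit in the `test` module of `src/git_user_dictionary.rs`, which can see the crate-private types; the test name is hypothetical):

#[test]
fn serializes_users_as_id_user_pairs() {
    let mut dict = GitUserDictionary::default();
    dict.register(&User::new(Some("Korny Sietsma"), Some("korny@sietsma.com")));
    dict.register(&User::new(Some("hermoine"), Some("hgranger@durmstrang.de")));
    let json = serde_json::to_value(&dict).unwrap();
    // matches the "users" arrays in tests/expected/integration_tests/git_flare_test.json
    assert_eq!(json[0]["id"], 0);
    assert_eq!(json[1]["user"]["name"], "hermoine");
}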
-------------------------------------------------------------------------------- /src/loc.rs: -------------------------------------------------------------------------------- 1 | #![warn(clippy::all)] 2 | use crate::polyglot_data::IndicatorMetadata; 3 | 4 | use super::toxicity_indicator_calculator::ToxicityIndicatorCalculator; 5 | use anyhow::Error; 6 | use serde::Serialize; 7 | 8 | use content_inspector::{inspect, ContentType}; 9 | 10 | use std::fs::File; 11 | use std::io::Read; 12 | use std::path::{Path, PathBuf}; 13 | 14 | use tokei::{Config, LanguageType}; 15 | 16 | /// a struct representing tokei language data - based on `tokei::Stats` and `tokei::Languages::name` 17 | #[derive(Debug, PartialEq, Eq, Serialize, Clone)] 18 | pub struct LanguageLocData { 19 | /// Canonical language name 20 | pub language: String, 21 | /// binary files only have bytes not lines! 22 | pub binary: bool, 23 | /// Number of blank lines within the file. 24 | pub blanks: usize, 25 | /// Number of lines of code within the file. 26 | pub code: usize, 27 | /// Number of comments within the file. (_includes both multi line, and 28 | /// single line comments_) 29 | pub comments: usize, 30 | /// Total number of lines within the file. 31 | pub lines: usize, 32 | /// File size in bytes 33 | pub bytes: u64, 34 | } 35 | 36 | fn safe_extension(filename: &Path) -> String { 37 | match filename.extension() { 38 | Some(ext) => ext.to_string_lossy().to_string(), 39 | None => "no_extension".to_owned(), 40 | } 41 | } 42 | 43 | fn file_size(filename: &Path) -> Result<u64, Error> { 44 | Ok(filename.metadata()?.len()) 45 | } 46 | //TODO: should binary data have 'lines:0' or should it be 47 | // an explicit special case? 48 | impl LanguageLocData { 49 | fn from_binary(language_name: String, filename: &Path) -> Result<Self, Error> { 50 | Ok(LanguageLocData { 51 | language: language_name, 52 | binary: true, 53 | blanks: 0, 54 | code: 0, 55 | comments: 0, 56 | lines: 0, 57 | bytes: file_size(filename)?, 58 | }) 59 | } 60 | } 61 | 62 | const MAX_PEEK_SIZE: usize = 1024; 63 | 64 | fn file_content_type(filename: &Path) -> Result<ContentType, Error> { 65 | let file = File::open(filename)?; 66 | let mut buffer: Vec<u8> = vec![]; 67 | 68 | file.take(MAX_PEEK_SIZE as u64).read_to_end(&mut buffer)?; 69 | Ok(inspect(&buffer)) 70 | } 71 | 72 | fn parse_file(filename: &Path) -> Result<LanguageLocData, Error> { 73 | let config = Config::default(); 74 | let mut language_name = None; 75 | let language = match LanguageType::from_path(filename, &config) { 76 | Some(language) => language, 77 | None => { 78 | language_name = Some(safe_extension(filename)); 79 | if file_content_type(filename)?
== ContentType::BINARY { 80 | return LanguageLocData::from_binary(language_name.unwrap(), filename); 81 | } 82 | LanguageType::Text 83 | } 84 | }; 85 | let language_name = language_name.unwrap_or_else(|| language.name().to_string()); 86 | let report = language.parse(PathBuf::from(filename), &config); 87 | 88 | match report { 89 | Ok(report) => Ok(LanguageLocData { 90 | binary: false, 91 | blanks: report.stats.blanks, 92 | code: report.stats.code, 93 | comments: report.stats.comments, 94 | lines: report.stats.lines(), 95 | language: language_name, 96 | bytes: file_size(filename)?, 97 | }), 98 | Err((error, _pathbuf)) => Err(Error::from(error)), 99 | } 100 | } 101 | 102 | #[derive(Debug)] 103 | pub struct LocCalculator {} 104 | 105 | impl ToxicityIndicatorCalculator for LocCalculator { 106 | fn name(&self) -> String { 107 | "loc".to_string() 108 | } 109 | 110 | fn visit_node( 111 | &mut self, 112 | node: &mut crate::flare::FlareTreeNode, 113 | path: &Path, 114 | ) -> Result<(), Error> { 115 | if path.is_file() { 116 | let stats = parse_file(path)?; 117 | node.indicators_mut().loc = Some(stats); 118 | } 119 | Ok(()) 120 | } 121 | 122 | fn apply_metadata(&self, _metadata: &mut IndicatorMetadata) -> Result<(), Error> { 123 | Ok(()) 124 | } 125 | } 126 | 127 | #[cfg(test)] 128 | mod test { 129 | use super::*; 130 | 131 | #[test] 132 | fn can_get_loc_data_for_a_file() { 133 | let stats = parse_file(Path::new("./tests/data/simple/parent.clj")).unwrap(); 134 | assert_eq!(stats.code, 3); 135 | assert_eq!(stats.language, "Clojure"); 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /tests/expected/git/git_sample.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": "93ae0c7c7cd93b3c4ea1bf103bde4deafef798ad", 4 | "summary": "renaming", 5 | "parents": ["0dbd54d4c524ecc776f381e660cce9b2dd92162c"], 6 | "committer": { 7 | "name": "Korny Sietsma", 8 | "email": "korny@sietsma.com" 9 | }, 10 | "commit_time": 1558533240, 11 | "author": { 12 | "name": "Korny Sietsma", 13 | "email": "korny@sietsma.com" 14 | }, 15 | "author_time": 1558533240, 16 | "co_authors": [], 17 | "file_changes": [ 18 | { 19 | "file": "simple/child/a_renamed.txt", 20 | "old_file": "simple/child/a.txt", 21 | "change": "Rename", 22 | "lines_added": 0, 23 | "lines_deleted": 0 24 | } 25 | ] 26 | }, 27 | { 28 | "id": "0dbd54d4c524ecc776f381e660cce9b2dd92162c", 29 | "summary": "just changed parent.clj", 30 | "parents": ["1ef4f2a8301699964d7bb397d3f2e86d8d9776b3"], 31 | "committer": { 32 | "name": "Korny Sietsma", 33 | "email": "korny@sietsma.com" 34 | }, 35 | "commit_time": 1558524371, 36 | "author": { 37 | "name": "Korny Sietsma", 38 | "email": "korny@sietsma.com" 39 | }, 40 | "author_time": 1558524371, 41 | "co_authors": [], 42 | "file_changes": [ 43 | { 44 | "file": "simple/parent.clj", 45 | "old_file": null, 46 | "change": "Modify", 47 | "lines_added": 1, 48 | "lines_deleted": 0 49 | } 50 | ] 51 | }, 52 | { 53 | "id": "1ef4f2a8301699964d7bb397d3f2e86d8d9776b3", 54 | "summary": "Merge branch 'fiddling'", 55 | "parents": [ 56 | "cdf8709362c267198d04d47e55e66071fdd5f52b", 57 | "a0ae9997cfdf49fd0cbf54dacc72c778af337519" 58 | ], 59 | "committer": { 60 | "name": "Korny Sietsma", 61 | "email": "korny@sietsma.com" 62 | }, 63 | "commit_time": 1558521695, 64 | "author": { 65 | "name": "Korny Sietsma", 66 | "email": "korny@sietsma.com" 67 | }, 68 | "author_time": 1558521695, 69 | "co_authors": [], 70 | "file_changes": [] 71 | }, 72 | { 73 | "id": 
"a0ae9997cfdf49fd0cbf54dacc72c778af337519", 74 | "summary": "made some changes with a bigger comment", 75 | "parents": ["ca239efb9b26db57ac9e2ec3e2df1c42578a46f8"], 76 | "committer": { 77 | "name": "Korny Sietsma", 78 | "email": "korny@sietsma.com" 79 | }, 80 | "commit_time": 1558521550, 81 | "author": { 82 | "name": "Korny Sietsma", 83 | "email": "korny@sietsma.com" 84 | }, 85 | "author_time": 1558521550, 86 | "co_authors": [ 87 | { 88 | "name": "hermoine", 89 | "email": "hgranger@durmstrang.de" 90 | } 91 | ], 92 | "file_changes": [ 93 | { 94 | "file": "simple/child/a.txt", 95 | "old_file": null, 96 | "change": "Modify", 97 | "lines_added": 1, 98 | "lines_deleted": 1 99 | }, 100 | { 101 | "file": "simple/parent.clj", 102 | "old_file": null, 103 | "change": "Modify", 104 | "lines_added": 3, 105 | "lines_deleted": 1 106 | } 107 | ] 108 | }, 109 | { 110 | "id": "cdf8709362c267198d04d47e55e66071fdd5f52b", 111 | "summary": "removed excess line", 112 | "parents": ["ca239efb9b26db57ac9e2ec3e2df1c42578a46f8"], 113 | "committer": { 114 | "name": "Korny Sietsma", 115 | "email": "korny@sietsma.com" 116 | }, 117 | "commit_time": 1558521648, 118 | "author": { 119 | "name": "Korny Sietsma", 120 | "email": "korny@sietsma.com" 121 | }, 122 | "author_time": 1558521648, 123 | "co_authors": [], 124 | "file_changes": [ 125 | { 126 | "file": "simple/child/a.txt", 127 | "old_file": null, 128 | "change": "Modify", 129 | "lines_added": 1, 130 | "lines_deleted": 2 131 | } 132 | ] 133 | }, 134 | { 135 | "id": "ca239efb9b26db57ac9e2ec3e2df1c42578a46f8", 136 | "summary": "first commit", 137 | "parents": [], 138 | "committer": { 139 | "name": "Korny Sietsma", 140 | "email": "korny@sietsma.com" 141 | }, 142 | "commit_time": 1558521386, 143 | "author": { 144 | "name": "Korny Sietsma", 145 | "email": "korny@sietsma.com" 146 | }, 147 | "author_time": 1558521386, 148 | "co_authors": [], 149 | "file_changes": [ 150 | { 151 | "file": "simple/child/a.txt", 152 | "old_file": null, 153 | "change": "Add", 154 | "lines_added": 2, 155 | "lines_deleted": 0 156 | }, 157 | { 158 | "file": "simple/parent.clj", 159 | "old_file": null, 160 | "change": "Add", 161 | "lines_added": 4, 162 | "lines_deleted": 0 163 | } 164 | ] 165 | } 166 | ] 167 | -------------------------------------------------------------------------------- /src/code_line_data.rs: -------------------------------------------------------------------------------- 1 | use std::{fs::File, io::Read, path::PathBuf}; 2 | 3 | use anyhow::Error; 4 | use encoding_rs_io::DecodeReaderBytesBuilder; 5 | use tokei::CodeStats; 6 | 7 | #[derive(Clone, Debug, PartialEq, Eq)] 8 | pub struct CodeLineData { 9 | pub spaces: u32, 10 | pub tabs: u32, 11 | pub text: u32, 12 | } 13 | 14 | impl CodeLineData { 15 | fn new(line: &[u8]) -> Self { 16 | let mut spaces: u32 = 0; 17 | let mut tabs: u32 = 0; 18 | let mut text: Option = None; 19 | for ix in 0..line.len() { 20 | let c = line[ix]; 21 | if c == b' ' { 22 | spaces += 1; 23 | } else if c == b'\t' { 24 | tabs += 1; 25 | } else { 26 | text = Some( 27 | String::from_utf8_lossy(&line[ix..line.len()]) 28 | .trim() 29 | .chars() 30 | .count(), 31 | ); 32 | break; 33 | } 34 | } 35 | 36 | CodeLineData { 37 | spaces, 38 | tabs, 39 | text: text.unwrap_or(0) as u32, 40 | } 41 | } 42 | } 43 | 44 | #[derive(Clone, Debug, PartialEq, Eq)] 45 | pub struct CodeLines { 46 | pub lines: Vec, 47 | } 48 | 49 | impl CodeLines { 50 | pub fn from_stats(stats: &CodeStats) -> Self { 51 | CodeLines { 52 | lines: stats 53 | .code_lines 54 | .iter() 55 | .map(|line| 
CodeLineData::new(line)) 56 | .collect(), 57 | } 58 | } 59 | pub fn new(path: &PathBuf) -> Result { 60 | let text: Vec> = { 61 | let f = match File::open(path) { 62 | Ok(f) => f, 63 | Err(e) => return Err(anyhow!("error opening file {:?} - {}", &path, e)), 64 | }; 65 | let mut s = Vec::new(); 66 | let mut reader = DecodeReaderBytesBuilder::new().build(f); 67 | reader.read_to_end(&mut s)?; 68 | 69 | s.split(|b| *b == b'\n').map(Vec::from).collect() 70 | }; 71 | Ok(CodeLines { 72 | lines: text.iter().map(|line| CodeLineData::new(line)).collect(), 73 | }) 74 | } 75 | } 76 | 77 | #[cfg(test)] 78 | mod tests { 79 | use super::*; 80 | use tokei::{Config, LanguageType}; 81 | 82 | #[test] 83 | pub fn can_process_tabs_and_spaces() { 84 | let data = CodeLineData::new(" \t \t foo".as_bytes()); 85 | assert_eq!( 86 | data, 87 | CodeLineData { 88 | spaces: 3, 89 | tabs: 2, 90 | text: 3 91 | } 92 | ); 93 | } 94 | 95 | #[test] 96 | pub fn can_process_unicode() { 97 | let data = CodeLineData::new("①②③④⑤⑥⑦⑧⑨⑩".as_bytes()); 98 | assert_eq!( 99 | data, 100 | CodeLineData { 101 | spaces: 0, 102 | tabs: 0, 103 | text: 10 104 | } 105 | ); 106 | } 107 | 108 | #[test] 109 | pub fn can_parse_source_code() { 110 | let code = r#"function foo☃() { 111 | 112 | blah; 113 | 114 | // comment 115 | } 116 | /* longer comment 117 | with blanks 118 | 119 | yow 120 | */ 121 | foo();"#; 122 | let stats: CodeStats = LanguageType::JavaScript.parse_from_str(code, &Config::default()); 123 | 124 | // eprintln!("Stats: {:?}", stats); 125 | // let printable_lines: Vec<_> = stats 126 | // .code_lines 127 | // .iter() 128 | // .map(|l| String::from_utf8_lossy(l)) 129 | // .collect(); 130 | // eprintln!("Code lines: {:?}", printable_lines); 131 | 132 | let result: CodeLines = CodeLines::from_stats(&stats); 133 | 134 | let mut expected = vec![ 135 | CodeLineData { 136 | spaces: 0, 137 | tabs: 0, 138 | text: 17, 139 | }, 140 | CodeLineData { 141 | spaces: 4, 142 | tabs: 0, 143 | text: 5, 144 | }, 145 | CodeLineData { 146 | spaces: 0, 147 | tabs: 0, 148 | text: 1, 149 | }, 150 | CodeLineData { 151 | spaces: 0, 152 | tabs: 0, 153 | text: 6, 154 | }, 155 | CodeLineData { 156 | spaces: 0, 157 | tabs: 0, 158 | text: 0, 159 | }, 160 | CodeLineData { 161 | spaces: 0, 162 | tabs: 0, 163 | text: 0, 164 | }, 165 | ]; 166 | expected.sort_by(|a, b| a.text.partial_cmp(&b.text).unwrap()); 167 | 168 | let mut actual = result.lines; 169 | 170 | actual.sort_by(|a, b| a.text.partial_cmp(&b.text).unwrap()); 171 | assert_eq!(actual, expected); 172 | } 173 | } 174 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![forbid(unsafe_code)] 2 | #![warn(clippy::all)] 3 | #![warn(rust_2018_idioms)] 4 | #![warn(clippy::pedantic)] 5 | // pedantic is just a bit too keen for me! But still useful. 
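The tests above pin down how `CodeLineData::new` classifies a raw line; as an illustrative, standalone sketch of the same counting rule (a restatement for clarity, not the module itself):

~~~rust
// Illustrative sketch: a standalone restatement of the counting rule used by
// CodeLineData::new above - leading spaces and tabs are tallied, and the rest
// of the line is measured in trimmed Unicode characters, not bytes.
fn classify(line: &[u8]) -> (u32, u32, u32) {
    let mut spaces = 0;
    let mut tabs = 0;
    for (ix, c) in line.iter().enumerate() {
        match *c {
            b' ' => spaces += 1,
            b'\t' => tabs += 1,
            _ => {
                let text = String::from_utf8_lossy(&line[ix..]).trim().chars().count() as u32;
                return (spaces, tabs, text);
            }
        }
    }
    (spaces, tabs, 0) // whitespace-only line: no text
}

fn main() {
    // mirrors the `can_process_tabs_and_spaces` test above
    assert_eq!(classify(" \t \t foo".as_bytes()), (3, 2, 3));
}
~~~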
6 | #![allow(clippy::module_name_repetitions)] 7 | #![allow(clippy::cast_possible_truncation)] 8 | #![allow(clippy::cast_precision_loss)] 9 | #![allow(clippy::cast_sign_loss)] 10 | #![allow(clippy::cast_lossless)] 11 | #![allow(clippy::missing_errors_doc)] 12 | #![allow(clippy::similar_names)] 13 | #![allow(clippy::cast_possible_wrap)] 14 | #![allow(clippy::redundant_else)] 15 | #![allow(clippy::single_match_else)] 16 | 17 | #[macro_use] 18 | extern crate anyhow; 19 | #[macro_use] 20 | extern crate log; 21 | #[macro_use] 22 | extern crate lazy_static; 23 | #[macro_use] 24 | extern crate derive_builder; 25 | #[macro_use] 26 | extern crate derive_getters; 27 | 28 | use anyhow::{Context, Error}; 29 | use file_stats::FileStatsCalculator; 30 | use postprocessing::postprocess_tree; 31 | use serde::Serialize; 32 | use std::io; 33 | use std::path::Path; 34 | 35 | mod code_line_data; 36 | // pub mod coupling; 37 | mod file_walker; 38 | // public so main.rs can access structures TODO: can this be done better? expose here just what main needs? 39 | pub mod coupling; 40 | mod file_stats; 41 | mod flare; 42 | mod git; 43 | mod git_file_future; 44 | mod git_user_dictionary; 45 | mod indentation; 46 | mod loc; 47 | mod polyglot_data; 48 | mod postprocessing; 49 | mod toxicity_indicator_calculator; 50 | 51 | mod git_file_history; 52 | mod git_logger; 53 | 54 | use crate::coupling::CouplingConfig; 55 | use git::GitCalculator; 56 | use git_logger::GitLogConfig; 57 | use indentation::IndentationCalculator; 58 | use loc::LocCalculator; 59 | use toxicity_indicator_calculator::ToxicityIndicatorCalculator; 60 | 61 | #[allow(clippy::struct_excessive_bools)] 62 | #[derive(Debug, Default, Clone, Serialize)] 63 | pub struct FeatureFlags { 64 | pub git: bool, 65 | pub coupling: bool, 66 | pub git_details: bool, 67 | pub file_stats: bool, 68 | } 69 | 70 | // general config for the scanner and calculators - could be split if it grows too far 71 | pub struct ScannerConfig { 72 | pub git_years: Option<u64>, 73 | pub follow_symlinks: bool, 74 | pub name: String, 75 | pub data_id: Option<String>, 76 | pub features: FeatureFlags, 77 | } 78 | 79 | impl ScannerConfig { 80 | #[must_use] 81 | pub fn default(name: &str) -> Self { 82 | ScannerConfig { 83 | git_years: None, 84 | follow_symlinks: false, 85 | name: name.to_owned(), 86 | data_id: None, 87 | features: FeatureFlags::default(), 88 | } 89 | } 90 | } 91 | 92 | #[must_use] 93 | pub fn named_toxicity_indicator_calculator( 94 | name: &str, 95 | config: &ScannerConfig, 96 | ) -> Option<Box<dyn ToxicityIndicatorCalculator>> { 97 | match name { 98 | "loc" => Some(Box::new(LocCalculator {})), 99 | "git" => Some(Box::new(GitCalculator::new( 100 | GitLogConfig::default() 101 | .include_merges(true) 102 | .since_years(config.git_years), 103 | ))), 104 | "indentation" => Some(Box::new(IndentationCalculator {})), 105 | "file_stats" => Some(Box::new(FileStatsCalculator {})), 106 | _ => None, 107 | } 108 | } 109 | 110 | pub fn run<W>( 111 | root: &Path, 112 | config: &ScannerConfig, 113 | coupling_config: Option<CouplingConfig>, 114 | toxicity_indicator_calculator_names: &[&str], 115 | out: W, 116 | ) -> Result<(), Error> 117 | where 118 | W: io::Write, 119 | { 120 | if toxicity_indicator_calculator_names.contains(&"git") && !config.features.git { 121 | bail!("Logic error - using git calculator when git is disabled!"); 122 | } 123 | if toxicity_indicator_calculator_names.contains(&"file_stats") && !config.features.file_stats { 124 | bail!("Logic error - using file_stats calculator when file_stats is disabled!"); 125 | } 126 | let maybe_tics: Option<Vec<Box<dyn ToxicityIndicatorCalculator>>> = 
toxicity_indicator_calculator_names 127 | .iter() 128 | .map(|name| named_toxicity_indicator_calculator(name, config)) 129 | .collect(); 130 | 131 | let mut tics = maybe_tics.expect("Some toxicity indicator calculator names don't exist!"); 132 | 133 | info!("Walking directory tree"); 134 | let mut polyglot_data = file_walker::walk_directory( 135 | root, 136 | &config.name, 137 | config.data_id.as_deref(), 138 | config.follow_symlinks, 139 | &mut tics, 140 | &config.features, 141 | )?; 142 | 143 | info!("adding metadata"); 144 | for tic in tics { 145 | tic.apply_metadata(polyglot_data.metadata()) 146 | .with_context(|| format!("applying metadata for {}", tic.name()))?; 147 | } 148 | 149 | if let Some(cc) = coupling_config { 150 | // TODO: fix this to take the data 151 | info!("gathering coupling"); 152 | coupling::gather_coupling(&mut polyglot_data, cc)?; 153 | } 154 | 155 | info!("postprocessing tree"); 156 | // TODO: fix this to take the data 157 | postprocess_tree(polyglot_data.tree_mut(), config)?; 158 | 159 | info!("saving as JSON"); 160 | serde_json::to_writer(out, &polyglot_data)?; 161 | Ok(()) 162 | } 163 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Polyglot Code Scanner 2 | 3 | This is part of my Polyglot Code tools - for the main documentation, see <https://polyglot.korny.info> 4 | 5 | ## A note about releases 6 | 7 | Binary releases are working again - see <https://github.com/kornysietsma/polyglot-code-scanner/releases> for binary releases. 8 | 9 | However, for M1 Macs this won't work - GitHub Actions doesn't yet support M1 Macs for free, so you'll have to build binaries yourself for now. 10 | 11 | For Macs you also need to run `xattr -d com.apple.quarantine polyglot-code-scanner-x86_64-macos` to remove the quarantine that OSX adds to all downloaded binaries. 12 | 13 | ## Intro 14 | 15 | This application scans source code directories, identifying a range of code metrics and other data, and storing the results in a JSON file for later visualisation by the [Polyglot Code Explorer](https://polyglot.korny.info/tools/explorer/description/) 16 | 17 | ## Installation and running 18 | 19 | See also <https://polyglot.korny.info> for more detailed instructions for building binary releases, and running the scanner. 20 | 21 | To compile and run from source, you'll need [to install Rust and Cargo](https://www.rust-lang.org/tools/install) and then from a copy of this project, you can build a binary package with: 22 | 23 | ~~~sh 24 | cargo build --release 25 | ~~~ 26 | 27 | The binary will be built in the `target/release` directory. 28 | 29 | ### Running from source 30 | 31 | You can also just run it from the source directory with `cargo run -- (other command line arguments)` - this will be slower as it runs un-optimised code with more debug information. But it's a lot faster for development. 32 | 33 | ### Getting help 34 | 35 | See <https://polyglot.korny.info> for the main documentation for this project. 36 | 37 | You can get up-to-date command-line help by running 38 | 39 | ~~~sh 40 | polyglot_code_scanner -h 41 | ~~~ 42 | 43 | ## Ignoring files 44 | 45 | Git ignored files in `.gitignore` are not scanned. 
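As a hedged illustration (hypothetical patterns, not from this repository), an ignore file for the scanner might contain:

~~~text
# gitignore-style patterns
**/generated/
**/*.min.js
~~~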
46 | 47 | You can also manually add `.polyglot_code_scanner_ignore` files anywhere in the codebase, to list extra files to be ignored - the syntax is [the same as .gitignore's](https://git-scm.com/docs/gitignore) 48 | 49 | ## Usage 50 | 51 | Run `polyglot_code_scanner -h` for full options; these are just the main ones: 52 | 53 | ~~~text 54 | USAGE: 55 | polyglot_code_scanner [OPTIONS] --name <NAME> [ROOT] 56 | 57 | ARGS: 58 | <ROOT> Root directory, current dir if not present 59 | 60 | OPTIONS: 61 | -h, --help 62 | Print help information 63 | 64 | -n, --name <NAME> 65 | project name - identifies the selected data for display and state storage 66 | 67 | --id <ID> 68 | data file ID - used to identify unique data files for browser storage, generates a UUID 69 | if not specified 70 | 71 | -o, --output <OUTPUT> 72 | Output file, stdout if not present, or not used if sending to web server 73 | 74 | --no-git 75 | Do not scan for git repositories 76 | 77 | --years <GIT_YEARS> 78 | how many years of git history to parse - default only scan the last 3 years (from now, 79 | not git head) [default: 3] 80 | 81 | -c, --coupling 82 | include temporal coupling data 83 | 84 | -V, --version 85 | Print version information 86 | 87 | ~~~ 88 | 89 | ## Development notes 90 | 91 | See also the `DesignDecisions.md` file. 92 | 93 | ### Running tests 94 | 95 | To run a single named test from the command-line: 96 | 97 | ~~~sh 98 | cargo test -- --nocapture renames_and_deletes_applied_across_history 99 | ~~~ 100 | 101 | The `--nocapture` flag tells Rust not to capture stdout/stderr - so you can add `println!` and `eprintln!` statements to help you. 102 | 103 | To remove some extra noise and blank lines, pipe the output through grep: 104 | 105 | ~~~sh 106 | cargo test -- --nocapture renames_and_deletes_applied_across_history | grep -v "running 0 tests" | grep -v "0 passed" | grep -v -e '^\s*$' 107 | ~~~ 108 | 109 | ### Showing logs 110 | 111 | Rust tests don't install a logger - normally you explicitly install loggers in your `main`, which tests don't use. 112 | 113 | To install a logger using the `fern` crate, add the following to tests: 114 | 115 | ~~~rust 116 | use test_shared::*; 117 | ~~~ 118 | 119 | then 120 | 121 | ~~~rust 122 | install_test_logger(); 123 | ~~~ 124 | 125 | This sets up a simple logger which sends logs to stdout - make sure you also use the `--nocapture` parameter mentioned earlier. 
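Putting the two snippets together, a minimal test might look like this sketch (assuming the `log` crate's macros, as used elsewhere in this codebase):

~~~rust
use test_shared::*;

#[test]
fn demonstrates_logging() {
    install_test_logger();
    log::info!("this only appears when running with --nocapture");
}
~~~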
126 | 127 | ### Pretty test output 128 | 129 | If you want better assertions, your tests need to explicitly use the `pretty_assertions` crate: 130 | 131 | ~~~rust 132 | use pretty_assertions::assert_eq; 133 | ~~~ 134 | 135 | ## Releasing new versions 136 | 137 | Releasing uses [cargo-release](https://crates.io/crates/cargo-release) 138 | 139 | The basic process is: 140 | 141 | * update the top CHANGELOG.md entry (under 'unreleased') 142 | * commit and push changes 143 | * release 144 | 145 | ~~~sh 146 | cargo release --dry-run 147 | ~~~ 148 | 149 | or for a minor change 0.1.3 to 0.2.0 : 150 | 151 | ~~~sh 152 | cargo release minor --dry-run 153 | ~~~ 154 | 155 | ## License 156 | 157 | Copyright © 2019-2022 Kornelis Sietsma 158 | 159 | Licensed under the Apache License, Version 2.0 - see LICENSE.txt for details 160 | -------------------------------------------------------------------------------- /tests/expected/git/git_sample_with_merges.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": "93ae0c7c7cd93b3c4ea1bf103bde4deafef798ad", 4 | "summary": "renaming", 5 | "parents": ["0dbd54d4c524ecc776f381e660cce9b2dd92162c"], 6 | "committer": { 7 | "name": "Korny Sietsma", 8 | "email": "korny@sietsma.com" 9 | }, 10 | "commit_time": 1558533240, 11 | "author": { 12 | "name": "Korny Sietsma", 13 | "email": "korny@sietsma.com" 14 | }, 15 | "author_time": 1558533240, 16 | "co_authors": [], 17 | "file_changes": [ 18 | { 19 | "file": "simple/child/a_renamed.txt", 20 | "old_file": "simple/child/a.txt", 21 | "change": "Rename", 22 | "lines_added": 0, 23 | "lines_deleted": 0 24 | } 25 | ] 26 | }, 27 | { 28 | "id": "0dbd54d4c524ecc776f381e660cce9b2dd92162c", 29 | "summary": "just changed parent.clj", 30 | "parents": ["1ef4f2a8301699964d7bb397d3f2e86d8d9776b3"], 31 | "committer": { 32 | "name": "Korny Sietsma", 33 | "email": "korny@sietsma.com" 34 | }, 35 | "commit_time": 1558524371, 36 | "author": { 37 | "name": "Korny Sietsma", 38 | "email": "korny@sietsma.com" 39 | }, 40 | "author_time": 1558524371, 41 | "co_authors": [], 42 | "file_changes": [ 43 | { 44 | "file": "simple/parent.clj", 45 | "old_file": null, 46 | "change": "Modify", 47 | "lines_added": 1, 48 | "lines_deleted": 0 49 | } 50 | ] 51 | }, 52 | { 53 | "id": "1ef4f2a8301699964d7bb397d3f2e86d8d9776b3", 54 | "summary": "Merge branch 'fiddling'", 55 | "parents": [ 56 | "cdf8709362c267198d04d47e55e66071fdd5f52b", 57 | "a0ae9997cfdf49fd0cbf54dacc72c778af337519" 58 | ], 59 | "committer": { 60 | "name": "Korny Sietsma", 61 | "email": "korny@sietsma.com" 62 | }, 63 | "commit_time": 1558521695, 64 | "author": { 65 | "name": "Korny Sietsma", 66 | "email": "korny@sietsma.com" 67 | }, 68 | "author_time": 1558521695, 69 | "co_authors": [], 70 | "file_changes": [ 71 | { 72 | "file": "simple/child/a.txt", 73 | "old_file": null, 74 | "change": "Modify", 75 | "lines_added": 3, 76 | "lines_deleted": 1 77 | }, 78 | { 79 | "file": "simple/parent.clj", 80 | "old_file": null, 81 | "change": "Modify", 82 | "lines_added": 3, 83 | "lines_deleted": 1 84 | }, 85 | { 86 | "file": "simple/child/a.txt", 87 | "old_file": null, 88 | "change": "Modify", 89 | "lines_added": 1, 90 | "lines_deleted": 0 91 | } 92 | ] 93 | }, 94 | { 95 | "id": "a0ae9997cfdf49fd0cbf54dacc72c778af337519", 96 | "summary": "made some changes with a bigger comment", 97 | "parents": ["ca239efb9b26db57ac9e2ec3e2df1c42578a46f8"], 98 | "committer": { 99 | "name": "Korny Sietsma", 100 | "email": "korny@sietsma.com" 101 | }, 102 | "commit_time": 1558521550, 
103 | "author": { 104 | "name": "Korny Sietsma", 105 | "email": "korny@sietsma.com" 106 | }, 107 | "author_time": 1558521550, 108 | "co_authors": [ 109 | { 110 | "name": "hermoine", 111 | "email": "hgranger@durmstrang.de" 112 | } 113 | ], 114 | "file_changes": [ 115 | { 116 | "file": "simple/child/a.txt", 117 | "old_file": null, 118 | "change": "Modify", 119 | "lines_added": 1, 120 | "lines_deleted": 1 121 | }, 122 | { 123 | "file": "simple/parent.clj", 124 | "old_file": null, 125 | "change": "Modify", 126 | "lines_added": 3, 127 | "lines_deleted": 1 128 | } 129 | ] 130 | }, 131 | 132 | { 133 | "id": "cdf8709362c267198d04d47e55e66071fdd5f52b", 134 | "summary": "removed excess line", 135 | "parents": ["ca239efb9b26db57ac9e2ec3e2df1c42578a46f8"], 136 | "committer": { 137 | "name": "Korny Sietsma", 138 | "email": "korny@sietsma.com" 139 | }, 140 | "commit_time": 1558521648, 141 | "author": { 142 | "name": "Korny Sietsma", 143 | "email": "korny@sietsma.com" 144 | }, 145 | "author_time": 1558521648, 146 | "co_authors": [], 147 | "file_changes": [ 148 | { 149 | "file": "simple/child/a.txt", 150 | "old_file": null, 151 | "change": "Modify", 152 | "lines_added": 1, 153 | "lines_deleted": 2 154 | } 155 | ] 156 | }, 157 | { 158 | "id": "ca239efb9b26db57ac9e2ec3e2df1c42578a46f8", 159 | "summary": "first commit", 160 | "parents": [], 161 | "committer": { 162 | "name": "Korny Sietsma", 163 | "email": "korny@sietsma.com" 164 | }, 165 | "commit_time": 1558521386, 166 | "author": { 167 | "name": "Korny Sietsma", 168 | "email": "korny@sietsma.com" 169 | }, 170 | "author_time": 1558521386, 171 | "co_authors": [], 172 | "file_changes": [ 173 | { 174 | "file": "simple/child/a.txt", 175 | "old_file": null, 176 | "change": "Add", 177 | "lines_added": 2, 178 | "lines_deleted": 0 179 | }, 180 | { 181 | "file": "simple/parent.clj", 182 | "old_file": null, 183 | "change": "Add", 184 | "lines_added": 4, 185 | "lines_deleted": 0 186 | } 187 | ] 188 | } 189 | ] 190 | -------------------------------------------------------------------------------- /src/indentation.rs: -------------------------------------------------------------------------------- 1 | use crate::flare::FlareTreeNode; 2 | use crate::polyglot_data::IndicatorMetadata; 3 | 4 | use super::toxicity_indicator_calculator::ToxicityIndicatorCalculator; 5 | use anyhow::{Context, Error}; 6 | use serde::Serialize; 7 | 8 | use content_inspector::{inspect, ContentType}; 9 | 10 | use std::fs::File; 11 | use std::io::Read; 12 | use std::path::{Path, PathBuf}; 13 | 14 | use tokei::{Config, LanguageType}; 15 | 16 | use super::code_line_data::CodeLines; 17 | 18 | use hdrhistogram::Histogram; 19 | 20 | /// a struct representing file indentation data 21 | #[derive(Debug, PartialEq, Serialize, Clone)] 22 | pub struct IndentationData { 23 | pub lines: u64, 24 | pub minimum: u64, 25 | pub maximum: u64, 26 | pub median: u64, 27 | pub stddev: f64, 28 | pub p75: u64, 29 | pub p90: u64, 30 | pub p99: u64, 31 | /// the sum of indentations - probably best measure according to [HGH08] 32 | pub sum: u64, 33 | } 34 | 35 | impl IndentationData { 36 | fn new(code_lines: CodeLines) -> Option { 37 | // we used to have this - reinstate if creating histogram for every file is too slow. But who knows, file I/O might be much bigger. 38 | // lazy_static! 
{ 39 | // static ref HISTOGRAM: Mutex<Histogram<u64>> = 40 | // Mutex::new(Histogram::<u64>::new(3).unwrap()); 41 | // } 42 | let mut histogram = Histogram::<u64>::new(3).expect("Can't create histogram"); 43 | let mut sum: u64 = 0; 44 | for line in code_lines.lines { 45 | if line.text > 0 { 46 | let indentation = line.spaces + line.tabs * 4; 47 | histogram 48 | .record(indentation as u64) 49 | .expect("Invalid histogram value!"); 50 | sum += indentation as u64; 51 | } 52 | } 53 | if histogram.is_empty() { 54 | None 55 | } else { 56 | Some(IndentationData { 57 | lines: histogram.len(), 58 | minimum: histogram.low(), 59 | maximum: histogram.high(), 60 | median: histogram.value_at_quantile(0.5), 61 | stddev: histogram.stdev(), 62 | p75: histogram.value_at_quantile(0.75), 63 | p90: histogram.value_at_quantile(0.90), 64 | p99: histogram.value_at_quantile(0.99), 65 | sum, 66 | }) 67 | } 68 | } 69 | } 70 | 71 | // TODO: remove duplication with loc.rs 72 | const MAX_PEEK_SIZE: usize = 1024; 73 | 74 | fn file_content_type(filename: &Path) -> Result<ContentType, Error> { 75 | let file = File::open(filename)?; 76 | let mut buffer: Vec<u8> = vec![]; 77 | 78 | file.take(MAX_PEEK_SIZE as u64).read_to_end(&mut buffer)?; 79 | Ok(inspect(&buffer)) 80 | } 81 | 82 | fn parse_file(filename: &Path) -> Result<Option<IndentationData>, Error> { 83 | let config = Config::default(); 84 | let code_lines = match LanguageType::from_path(filename, &config) { 85 | Some(language) => { 86 | let report = language 87 | .parse(PathBuf::from(filename), &config) 88 | .map_err(|(error, _pathbuf)| error); 89 | CodeLines::from_stats(&report?.stats) 90 | } 91 | None => { 92 | if file_content_type(filename)? == ContentType::BINARY { 93 | return Ok(None); 94 | } 95 | debug!("Unknown language in {:?} - treating as text", filename); 96 | CodeLines::new(&PathBuf::from(filename))? 
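// Worked example (illustrative comment, not part of the original source): for a line
// "\t  foo", `CodeLineData` reports tabs = 1, spaces = 2 and text = 3, so the code in
// `IndentationData::new` above records an indentation of 2 + 1 * 4 = 6 for that line;
// blank lines (text == 0) are skipped, and `sum` accumulates these per-line values.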
97 | } 98 | }; 99 | Ok(IndentationData::new(code_lines)) 100 | } 101 | 102 | #[derive(Debug)] 103 | pub struct IndentationCalculator {} 104 | 105 | impl ToxicityIndicatorCalculator for IndentationCalculator { 106 | fn name(&self) -> String { 107 | "indentation".to_string() 108 | } 109 | 110 | fn visit_node(&mut self, node: &mut FlareTreeNode, path: &Path) -> Result<(), Error> { 111 | if path.is_file() { 112 | let indentation = 113 | parse_file(path).with_context(|| format!("parsing indentation for {:?}", path))?; 114 | node.indicators_mut().indentation = indentation; 115 | } 116 | Ok(()) 117 | } 118 | 119 | fn apply_metadata(&self, _metadata: &mut IndicatorMetadata) -> Result<(), Error> { 120 | Ok(()) 121 | } 122 | } 123 | 124 | #[cfg(test)] 125 | mod test { 126 | use super::*; 127 | 128 | #[test] 129 | fn can_get_indentation_data_for_a_file() { 130 | let indentation = parse_file(Path::new("./tests/data/simple/parent.clj")) 131 | .unwrap() 132 | .unwrap(); 133 | assert_eq!(indentation.lines, 3); 134 | assert_eq!(indentation.p99, 2); 135 | assert_eq!(indentation.sum, 2); 136 | } 137 | 138 | #[test] 139 | fn unknown_files_are_treated_as_code() { 140 | let indentation = parse_file(Path::new("./tests/data/languages/foo.unknown")) 141 | .unwrap() 142 | .unwrap(); 143 | assert_eq!(indentation.lines, 2); 144 | assert_eq!(indentation.p99, 2); 145 | assert_eq!(indentation.sum, 2); 146 | } 147 | 148 | #[test] 149 | fn pf_files_are_fortran_unit_tests() { 150 | let indentation = parse_file(Path::new("./tests/data/languages/pfunit_test.pf")) 151 | .unwrap() 152 | .unwrap(); 153 | assert_eq!(indentation.lines, 13); 154 | assert_eq!(indentation.p99, 6); 155 | assert_eq!(indentation.sum, 39); 156 | } 157 | 158 | #[test] 159 | fn non_utf8_text_files_are_parsed() { 160 | let indentation = parse_file(Path::new("./tests/data/languages/non-utf8.properties")) 161 | .unwrap() 162 | .unwrap(); 163 | assert_eq!(indentation.lines, 2); 164 | assert_eq!(indentation.p99, 0); 165 | assert_eq!(indentation.sum, 0); 166 | } 167 | } 168 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](http://keepachangelog.com/) 6 | and this project adheres to [Semantic Versioning](http://semver.org/). 7 | 8 | 9 | ## [Unreleased] - ReleaseDate 10 | ## [0.4.4] - 2022-11-21 11 | 12 | * fixed error with non-UTF8 files in the recent unknown languages logic 13 | * added error context in a few places to make diagnosing errors easier 14 | 15 | ## [0.4.3] - 2022-11-16 16 | 17 | * tidying up 18 | 19 | ## [0.4.2] - 2022-11-16 20 | 21 | * Unknown languages now treated as pure code - previously they were 'Text' which was treated by tokei as all comments, so ignored in stats! 22 | * Added support for 'pf' fortran unit test files (via changes to my tokei fork) 23 | 24 | ## [0.4.1] - 2022-11-09 25 | 26 | * Fixed the [tokei fork](https://github.com/kornysietsma/tokei/tree/accumulate-lines) to properly get indentation for COBOL and other languages with simple per-line comment checks 27 | * Updated `Cargo.toml` to check for a particular tag in the Tokei fork, not just use the branch name 28 | 29 | ## [0.4.0] - 2022-10-13 30 | 31 | * New release - it feels that a lot has changed, really 0.3.2 should have been 0.4.0! Anyway better late than never. 
32 | * Major changes since 0.3.1: 33 | * Output file format reworked 34 | * Project files have names and IDs 35 | * Disabling git scanning 36 | * Feature flags 37 | * various bug fixes 38 | 39 | ## [0.3.15] - 2022-10-06 40 | 41 | * Bug fix for some co-authored-by lines 42 | * sometimes commit messages are terminated by `\r\n` characters, but Rust assumed they were `\n`-terminated - and my co-author regular expression didn't ignore the `\r` whitespace! 43 | * finally a fix for Linux binaries 44 | * deleted releases 0.3.5 to 0.3.14 - debugging GitHub Actions took a lot of work, and a lot of release-fix-release cycles! 45 | 46 | ## [0.3.4] - 2022-10-06 47 | 48 | * Point release mostly to test fixes to the release process! 49 | 50 | ## [0.3.3] - 2022-09-28 51 | 52 | * Somewhat breaking release - the output file schema doesn't change, but the logic does - so the data format is now 1.0.1, as this is only sort-of compatible: 53 | * Previously all changes for a day were combined into a single GitDetails entry; now a new GitDetails entry is generated for each unique set of users. 54 | * This is because previously, if Jane made 1 change and Joe made 100, all 101 would show up as changes by "Jane and Joe" 55 | * This will make output files a bit more verbose (hopefully not too much), but the new user and team info was being distorted by the old behaviour - for the UI to show Jane and Joe separately, we need to track them separately, unless they are co-authors on a commit. 56 | * Added a DesignDecisons.md document to discuss the next change: 57 | * Removed the way the code used to use generic `Value` types for indicator data - everything is concrete types now. See `DesignDecisions.md` for discussion 58 | * Added feature flags, including a new 'disable git' option, and flags in JSON output (data format v1.0.2) 59 | * Added file creation and modification times, so the explorer can use them when git is disabled 60 | 61 | ## [0.3.2] - 2022-09-20 62 | 63 | * Backward-incompatible release - changing the output file format for a few reasons: 64 | * I want a unique ID that the front end can use by default for BrowserStorage - this can be specified or random 65 | * actually now split into 'name', which is descriptive, and 'id' for storage etc. 66 | * I want a semantic version number in the data file, so the front-end can tell if it knows the data format 67 | * I'm moving the front-end to TypeScript, which means I'd like to keep types a bit cleaner, rather than just dumping data in the 'root' directory node 68 | * Really the old 'flare' file format hasn't been meaningful for a while, so I might as well dump irrelevant bits (like the 'value' on each node - redundant and confusing) 69 | * username / emails are now de-duplicated by case - so if you have "Jane smith" and "Jane Smith" as git user names, they will get the same user id (and the case of whichever example was seen first) - this was needed as, especially with `co-authored-by` tags, the same user could show up several times with only case differences. 70 | 71 | ## [0.3.1] - 2022-07-13 72 | 73 | ### Changed 74 | 75 | * Added an option to follow symlinks to fix issue #1 76 | 77 | ## [0.3.0] - 2021-04-05 78 | 79 | ### Changed 80 | 81 | * Major change - new coupling logic, fine-grained based on timestamps instead of aggregating into daily buckets. This will need a lot of documenting, which will probably be on the main website not here. 82 | * updating tokei to latest code - this was ugly as tokei is now multithreaded and more complex. 
Modified tokei fork at to accumulate lines of code - but note they aren't ordered so this works for stats but not much else 83 | * Updated all other dependencies to latest stable bits 84 | 85 | ## [0.2.1] - 2020-10-16 86 | 87 | ### Changed 88 | 89 | * fixing build on Windows 90 | 91 | ## [0.2.0] - 2020-09-16 92 | 93 | ### Added 94 | 95 | * git log logic follows renames - a fair bit of work, as it requires splitting the git log processing into two passes, one to aggregate all rename/deletes and parent/child relationships, and one that uses that data to find what files end up being named in the final revision. 96 | 97 | ### Changed 98 | 99 | * Git logging may be slower and use more memory, as interim git log data is stored in memory. 100 | 101 | ## [0.1.2] - 2020-08-25 102 | 103 | ## [0.1.1] - 2020-08-24 104 | 105 | ### Changed 106 | 107 | * Trying to get Travis to publish binaries 108 | 109 | ## [0.1.0] - 2020-08-24 110 | 111 | ### Added 112 | 113 | * Tagged with version 0.1.0 114 | * Added this changelog, following [cargo-release suggestions](https://github.com/sunng87/cargo-release/blob/master/docs/faq.md#maintaining-changelog) and 115 | * First release with binary files 116 | -------------------------------------------------------------------------------- /src/git_file_future.rs: -------------------------------------------------------------------------------- 1 | #![warn(clippy::all)] 2 | use git2::Oid; 3 | use std::collections::HashMap; 4 | use std::path::{Path, PathBuf}; 5 | 6 | /// Track file changes for a file - renames and deletes 7 | #[derive(Debug, Clone)] 8 | pub struct GitFileFutureRegistry { 9 | rev_changes: HashMap, 10 | } 11 | 12 | #[derive(Debug, Clone)] 13 | struct RevChange { 14 | files: HashMap, 15 | /// first child is generally used only, it is the main branch - don't divert into other branches! 16 | children: Vec, 17 | } 18 | 19 | #[derive(Debug, Clone)] 20 | pub enum FileNameChange { 21 | Renamed(PathBuf), 22 | Deleted(), 23 | } 24 | 25 | impl RevChange { 26 | pub fn new() -> Self { 27 | RevChange { 28 | files: HashMap::new(), 29 | children: Vec::new(), 30 | } 31 | } 32 | } 33 | 34 | impl GitFileFutureRegistry { 35 | pub fn new() -> Self { 36 | GitFileFutureRegistry { 37 | rev_changes: HashMap::new(), 38 | } 39 | } 40 | 41 | pub fn register( 42 | &mut self, 43 | id: &Oid, 44 | parent_ids: &[Oid], 45 | file_changes: &[(PathBuf, FileNameChange)], 46 | ) { 47 | let entry = self.rev_changes.entry(*id).or_insert_with(RevChange::new); 48 | entry.files.extend(file_changes.iter().cloned()); 49 | for parent_id in parent_ids { 50 | let pentry = self 51 | .rev_changes 52 | .entry(*parent_id) 53 | .or_insert_with(RevChange::new); 54 | pentry.children.push(*id); 55 | } 56 | } 57 | 58 | /// what is this called in the final revision? 
59 | /// returns None if it is deleted, or Some(final name) 60 | pub fn final_name(&self, ref_id: &Oid, file: &Path) -> Option<PathBuf> { 61 | let mut current_name: &PathBuf = &file.to_path_buf(); 62 | let mut current_ref: Oid = *ref_id; 63 | loop { 64 | let current_change = self.rev_changes.get(&current_ref).unwrap(); 65 | match current_change.files.get(current_name) { 66 | Some(FileNameChange::Renamed(new_name)) => { 67 | current_name = new_name; 68 | } 69 | Some(FileNameChange::Deleted()) => return None, 70 | None => (), 71 | } 72 | if let Some(first_child) = current_change.children.get(0) { 73 | current_ref = *first_child; 74 | // and loop will continue 75 | } else { 76 | // no children, so finished looking into the future 77 | return Some(current_name.clone()); 78 | } 79 | } 80 | } 81 | } 82 | 83 | #[cfg(test)] 84 | mod test { 85 | use super::*; 86 | use anyhow::Error; 87 | use pretty_assertions::assert_eq; 88 | 89 | fn pb(name: &str) -> PathBuf { 90 | PathBuf::from(name) 91 | } 92 | 93 | #[test] 94 | fn trivial_repo_returns_original_name() -> Result<(), Error> { 95 | let mut registry = GitFileFutureRegistry::new(); 96 | let my_id = Oid::from_str("01")?; 97 | registry.register(&my_id, &[], &[]); 98 | assert_eq!( 99 | registry.final_name(&my_id, &pb("foo.txt")), 100 | Some(pb("foo.txt")) 101 | ); 102 | Ok(()) 103 | } 104 | 105 | #[test] 106 | fn simple_rename_returns_old_name() -> Result<(), Error> { 107 | let mut registry = GitFileFutureRegistry::new(); 108 | let my_id = Oid::from_str("01")?; 109 | 110 | registry.register( 111 | &my_id, 112 | &[], 113 | &[(pb("foo.txt"), FileNameChange::Renamed(pb("bar.txt")))], 114 | ); 115 | assert_eq!( 116 | registry.final_name(&my_id, &pb("foo.txt")), 117 | Some(pb("bar.txt")) 118 | ); 119 | Ok(()) 120 | } 121 | 122 | #[test] 123 | fn renames_and_deletes_applied_across_history() -> Result<(), Error> { 124 | // my bad - this should be a few isolated tests not one big test-all test. 125 | // classic how my standards slip for side projects! 126 | let mut registry = GitFileFutureRegistry::new(); 127 | /* 128 | +-----+ 129 | |01 | 130 | |add a| 131 | |add z| 132 | +--+--+ 133 | | 134 | +------v------+ 135 | |02 | 136 | |rename a to b| 137 | |delete z | 138 | +-------------+ 139 | | | 140 | +------v------+ +----v--------+ 141 | |04 | |05 | 142 | |rename b to c| |rename b to d| 143 | +--------------+--------------+ 144 | | 145 | +--------v---------+ 146 | |06 merge | 147 | |rename c to afinal| 148 | |create new z | 149 | +------------------+ 150 | */ 151 | let id_1 = Oid::from_str("01")?; 152 | let id_2 = Oid::from_str("02")?; 153 | let id_4 = Oid::from_str("04")?; 154 | let id_5 = Oid::from_str("05")?; 155 | let id_6 = Oid::from_str("06")?; 156 | 157 | registry.register( 158 | &id_6, 159 | &[id_4, id_5], 160 | &[(pb("c"), FileNameChange::Renamed(pb("afinal")))], 161 | ); 162 | // NOTE: topological order should (I think?) 
register rev 4 before rev 5 as it's first 163 | registry.register( 164 | &id_4, 165 | &[id_2], 166 | &[(pb("b"), FileNameChange::Renamed(pb("c")))], 167 | ); 168 | registry.register( 169 | &id_5, 170 | &[id_2], 171 | &[(pb("b"), FileNameChange::Renamed(pb("d")))], 172 | ); 173 | registry.register( 174 | &id_2, 175 | &[id_1], 176 | &[ 177 | (pb("a"), FileNameChange::Renamed(pb("b"))), 178 | (pb("z"), FileNameChange::Deleted()), 179 | ], 180 | ); 181 | registry.register(&id_1, &[], &[]); 182 | 183 | // original a is afinal 184 | // original z is gone 185 | assert_eq!(registry.final_name(&id_1, &pb("a")), Some(pb("afinal"))); 186 | assert_eq!(registry.final_name(&id_1, &pb("z")), None); 187 | // from the perspective of the filesystem after node 2, we know nothing of a any more, only b 188 | assert_eq!(registry.final_name(&id_2, &pb("b")), Some(pb("afinal"))); 189 | 190 | Ok(()) 191 | } 192 | } 193 | -------------------------------------------------------------------------------- /src/file_walker.rs: -------------------------------------------------------------------------------- 1 | #![warn(clippy::all)] 2 | 3 | use crate::{polyglot_data::PolyglotData, FeatureFlags}; 4 | 5 | use super::flare; 6 | use super::flare::FlareTreeNode; 7 | use super::toxicity_indicator_calculator::ToxicityIndicatorCalculator; 8 | use anyhow::{Context, Error}; 9 | use ignore::{Walk, WalkBuilder}; 10 | #[allow(unused_imports)] 11 | use path_slash::PathExt; 12 | use std::{path::Path, time::Instant}; 13 | 14 | fn apply_calculators_to_node( 15 | node: &mut FlareTreeNode, 16 | path: &Path, 17 | toxicity_indicator_calculators: &mut [Box], 18 | ) -> Result<(), Error> { 19 | for tic in toxicity_indicator_calculators.iter_mut() { 20 | tic.visit_node(node, path) 21 | .with_context(|| format!("applying calcluator {} to {:?}", tic.name(), path))?; 22 | } 23 | Ok(()) 24 | } 25 | 26 | const LOG_INTERVAL_SECS: u64 = 60 * 5; 27 | 28 | fn walk_tree_walker( 29 | walker: Walk, 30 | prefix: &Path, 31 | name: &str, 32 | id: Option<&str>, 33 | toxicity_indicator_calculators: &mut [Box], 34 | features: &FeatureFlags, // features just for JSON output 35 | ) -> Result { 36 | let mut tree = FlareTreeNode::new(flare::ROOT_NAME, false); 37 | 38 | apply_calculators_to_node(&mut tree, prefix, toxicity_indicator_calculators)?; 39 | 40 | let mut last_log = Instant::now(); 41 | info!("Walking file tree"); 42 | 43 | for result in walker.map(|r| r.expect("File error!")).skip(1) { 44 | let p = result.path(); 45 | let relative = p.strip_prefix(prefix)?; 46 | let elapsed_since_log = last_log.elapsed(); 47 | if elapsed_since_log.as_secs() > LOG_INTERVAL_SECS { 48 | info!("Walking progress: {:?}", relative); 49 | last_log = Instant::now(); 50 | } 51 | 52 | let new_child = if p.is_dir() || p.is_file() { 53 | let mut f = FlareTreeNode::new(p.file_name().unwrap(), p.is_file()); 54 | apply_calculators_to_node(&mut f, p, toxicity_indicator_calculators)?; 55 | Some(f) 56 | } else { 57 | warn!("Not a file or dir: {:?} - skipping", p); 58 | None 59 | }; 60 | 61 | if let Some(new_child) = new_child { 62 | match relative.parent() { 63 | Some(new_parent) => { 64 | let parent = tree 65 | .get_in_mut(&mut new_parent.components()) 66 | .expect("no parent found!"); 67 | parent.append_child(new_child); 68 | } 69 | None => { 70 | tree.append_child(new_child); 71 | } 72 | } 73 | } 74 | } 75 | info!("finished walking file tree"); 76 | Ok(PolyglotData::new(name, id, tree, features.clone())) 77 | } 78 | 79 | pub fn walk_directory( 80 | root: &Path, 81 | name: &str, 82 | id: 
Option<&str>, 83 | follow_symlinks: bool, 84 | toxicity_indicator_calculators: &mut [Box], 85 | features: &FeatureFlags, // features just for JSON output 86 | ) -> Result { 87 | walk_tree_walker( 88 | WalkBuilder::new(root) 89 | .add_custom_ignore_filename(".polyglot_code_scanner_ignore") 90 | .follow_links(follow_symlinks) 91 | .sort_by_file_name(std::cmp::Ord::cmp) 92 | .build(), 93 | root, 94 | name, 95 | id, 96 | toxicity_indicator_calculators, 97 | features, 98 | ) 99 | } 100 | 101 | #[cfg(test)] 102 | mod test { 103 | use crate::polyglot_data::IndicatorMetadata; 104 | 105 | use super::*; 106 | use test_shared::assert_eq_json_file; 107 | 108 | #[test] 109 | fn scanning_a_filesystem_builds_a_tree() { 110 | let root = Path::new("./tests/data/simple/"); 111 | let tree = walk_directory( 112 | root, 113 | "test", 114 | Some("test-id"), 115 | false, 116 | &mut Vec::new(), 117 | &FeatureFlags::default(), 118 | ) 119 | .unwrap(); 120 | 121 | assert_eq_json_file(&tree, "./tests/expected/simple_files.json"); 122 | } 123 | 124 | #[test] 125 | fn scanning_a_filesystem_can_follow_symlinks() { 126 | let root = Path::new("./tests/data/simple_linked/"); 127 | let tree = walk_directory( 128 | root, 129 | "test", 130 | Some("test-id"), 131 | true, 132 | &mut Vec::new(), 133 | &FeatureFlags::default(), 134 | ) 135 | .unwrap(); 136 | 137 | assert_eq_json_file(&tree, "./tests/expected/simple_files.json"); 138 | } 139 | 140 | #[derive(Debug)] 141 | struct FirstTIC {} 142 | 143 | impl ToxicityIndicatorCalculator for FirstTIC { 144 | fn name(&self) -> String { 145 | "foo".to_string() 146 | } 147 | fn visit_node(&mut self, node: &mut FlareTreeNode, path: &Path) -> Result<(), Error> { 148 | if path.is_file() { 149 | // only mutate files! If we rename dirs, the parent relationship breaks 150 | let mut name = node.name().clone(); 151 | name.push("!"); 152 | node.set_name(&name); 153 | } 154 | Ok(()) 155 | } 156 | fn apply_metadata(&self, _metadata: &mut IndicatorMetadata) -> Result<(), Error> { 157 | unimplemented!() 158 | } 159 | } 160 | 161 | #[derive(Debug)] 162 | struct SecondTIC {} 163 | 164 | impl ToxicityIndicatorCalculator for SecondTIC { 165 | fn name(&self) -> String { 166 | "filename".to_string() 167 | } 168 | fn visit_node(&mut self, node: &mut FlareTreeNode, path: &Path) -> Result<(), Error> { 169 | if path.is_file() { 170 | // only mutate files! If we rename dirs, the parent relationship breaks 171 | let mut name = node.name().clone(); 172 | name.push("?"); 173 | node.set_name(&name); 174 | } 175 | Ok(()) 176 | } 177 | 178 | fn apply_metadata(&self, _metadata: &mut IndicatorMetadata) -> Result<(), Error> { 179 | unimplemented!() 180 | } 181 | } 182 | 183 | #[test] 184 | fn scanning_merges_data_from_mutators() { 185 | let root = Path::new("./tests/data/simple/"); 186 | let first = FirstTIC {}; 187 | let second = SecondTIC {}; 188 | let calculators: &mut Vec> = 189 | &mut vec![Box::new(first), Box::new(second)]; 190 | 191 | let tree = walk_directory( 192 | root, 193 | "test", 194 | Some("test-id"), 195 | false, 196 | calculators, 197 | &FeatureFlags::default(), 198 | ) 199 | .unwrap(); 200 | 201 | assert_eq_json_file(&tree, "./tests/expected/simple_files_with_indicators.json"); 202 | } 203 | 204 | // TODO: we have no unit test for new metadata - should we? 
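    // Illustrative sketch (not part of the original source): a third calculator
    // following the same pattern as FirstTIC and SecondTIC above - it implements
    // the trait but only counts files, writing nothing to the tree.
    #[derive(Debug, Default)]
    struct CountingTIC {
        files_seen: usize,
    }

    impl ToxicityIndicatorCalculator for CountingTIC {
        fn name(&self) -> String {
            "counting".to_string()
        }
        fn visit_node(&mut self, _node: &mut FlareTreeNode, path: &Path) -> Result<(), Error> {
            if path.is_file() {
                self.files_seen += 1;
            }
            Ok(())
        }
        fn apply_metadata(&self, _metadata: &mut IndicatorMetadata) -> Result<(), Error> {
            Ok(())
        }
    }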
205 | } 206 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | #![forbid(unsafe_code)] 2 | #![warn(clippy::all)] 3 | #![warn(clippy::pedantic)] 4 | #![warn(rust_2018_idioms)] 5 | 6 | use anyhow::Error; 7 | use clap::{CommandFactory, ErrorKind, Parser}; 8 | use polyglot_code_scanner::coupling::CouplingConfig; 9 | use polyglot_code_scanner::{FeatureFlags, ScannerConfig}; 10 | use std::fs::File; 11 | use std::io; 12 | use std::path::PathBuf; 13 | 14 | #[allow(clippy::struct_excessive_bools)] 15 | #[derive(Debug, Parser)] 16 | #[clap(author, version)] 17 | /// Polyglot Code Scanner 18 | /// 19 | /// Scans source code and generates indicators that may (or may not) show toxic code. 20 | /// Ignores files specified by `.gitignore` or `.polyglot_code_scanner_ignore` files 21 | /// See for details 22 | struct Cli { 23 | #[clap( 24 | short = 'v', 25 | long = "verbose", 26 | action = clap::ArgAction::Count 27 | )] 28 | /// Logging verbosity, v = error, vv = warn, vvv = info (default), vvvv = debug, vvvvv = trace 29 | verbose: u8, 30 | /// Output file, stdout if not present, or not used if sending to web server 31 | #[clap(short = 'o', long = "output", parse(from_os_str))] 32 | output: Option, 33 | /// project name - identifies the selected data for display and state storage 34 | #[clap(value_parser, short = 'n', long = "name")] 35 | name: String, 36 | 37 | /// data file ID - used to identify unique data files for browser storage, generates a UUID if not specified 38 | #[clap(value_parser, long = "id")] 39 | id: Option, 40 | /// Root directory, current dir if not present 41 | #[clap(parse(from_os_str))] 42 | root: Option, 43 | 44 | // global indicator flags 45 | #[clap(value_parser, long = "no-git")] 46 | /// Do not scan for git repositories 47 | no_git: bool, 48 | #[clap(value_parser, short = 'c', long = "coupling")] 49 | /// include temporal coupling data 50 | coupling: bool, 51 | #[clap(value_parser, long = "no-detailed-git")] 52 | /// Don't include detailed git information - output may be big! 53 | no_detailed_git: bool, 54 | #[clap(value_parser, long = "no-file-stats")] 55 | /// Do not scan for file stats - mainly an option as this is very hard to unit test 56 | no_file_stats: bool, 57 | 58 | #[clap(value_parser, long = "years", default_value = "3")] 59 | /// how many years of git history to parse - default only scan the last 3 years (from now, not git head) 60 | git_years: u64, 61 | #[clap(value_parser, long = "follow-symlinks")] 62 | /// Follow symbolic links when traversing directories 63 | follow_symlinks: bool, 64 | #[clap(value_parser, long = "coupling-bucket-days", default_value = "91")] 65 | /// Number of days in a single "bucket" of coupling activity 66 | bucket_days: u64, 67 | #[clap(value_parser, long = "coupling-min-bursts", default_value = "10")] 68 | /// If a file has fewer bursts of change than this in a bucket, don't measure coupling from it 69 | min_activity_bursts: u64, 70 | #[clap(value_parser, long = "coupling-min-ratio", default_value = "0.8")] 71 | /// The minimum ratio of (other file changes)/(this file changes) to include a file in coupling stats 72 | min_coupling_ratio: f64, 73 | #[clap( 74 | value_parser, 75 | long = "coupling-min-activity-gap-minutes", 76 | default_value = "60" 77 | )] 78 | /// what is the minimum gap between activities in a burst? 
a sequence of commits with no gaps this long is treated as one burst 79 | min_activity_gap_minutes: u64, 80 | #[clap( 81 | value_parser, 82 | long = "coupling-time-overlap-minutes", 83 | default_value = "60" 84 | )] 85 | /// how far before/after an activity burst is included for coupling? e.g. if I commit Foo.c at 1am, and Bar.c at 2am, they are coupled if an overlap of 60 minutes or longer is specified 86 | min_overlap_minutes: u64, 87 | #[clap(value_parser, long = "coupling-min-distance", default_value = "3")] 88 | /// The minimum distance between nodes to include in coupling 89 | /// 0 is all, 1 is siblings, 2 is cousins and so on. 90 | /// so if you set this to 3, cousins "foo/src/a.rs" and "foo/test/a_test.rs" won't be counted as their distance is 2 91 | coupling_min_distance: usize, 92 | #[clap(value_parser, long = "coupling-max-common-roots")] 93 | /// The maximum number of common ancestors to include in coupling 94 | /// e.g. "foo/src/controller/a.c" and "foo/src/service/b.c" have two common ancestors, if you 95 | /// set this value to 3 they won't show as coupled. 96 | coupling_max_common_roots: Option, 97 | } 98 | 99 | // very basic logging - just so I can have a nice default, and hide verbose tokei logs 100 | fn setup_logging(verbosity: u8) -> Result<(), fern::InitError> { 101 | let mut base_config = fern::Dispatch::new(); 102 | 103 | base_config = match verbosity { 104 | 0 | 3 => base_config.level(log::LevelFilter::Info), 105 | 1 => base_config.level(log::LevelFilter::Error), 106 | 2 => base_config.level(log::LevelFilter::Warn), 107 | 4 => base_config.level(log::LevelFilter::Debug), 108 | _5_or_more => base_config.level(log::LevelFilter::Trace), 109 | }; 110 | 111 | // Tokei warns whenever we scan a language type we don't know - but I catch that error! 112 | base_config = base_config.level_for("tokei::language::language_type", log::LevelFilter::Error); 113 | 114 | let stdout_config = fern::Dispatch::new() 115 | .format(|out, message, record| { 116 | out.finish(format_args!( 117 | "[{}][{}][{}] {}", 118 | chrono::Local::now().format("%H:%M"), 119 | record.target(), 120 | record.level(), 121 | message 122 | )); 123 | }) 124 | .chain(io::stderr()); 125 | 126 | base_config.chain(stdout_config).apply()?; 127 | 128 | Ok(()) 129 | } 130 | 131 | fn custom_validation_conflict(message: &str) { 132 | let mut cmd = Cli::command(); 133 | cmd.error(ErrorKind::ArgumentConflict, message).exit() 134 | } 135 | 136 | fn main() -> Result<(), Error> { 137 | let args = Cli::from_args(); 138 | 139 | // custom validation - easier than trying to wrangle clap to do this! 
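// For example (illustrative invocations, using the flags defined above):
//   `polyglot_code_scanner --name demo --no-git --coupling .` is rejected below,
//   while `polyglot_code_scanner --name demo --no-git .` is accepted.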
140 | if args.no_git { 141 | if args.coupling { 142 | custom_validation_conflict("Can't enable coupling when git is disabled!"); 143 | } 144 | if args.no_detailed_git { 145 | custom_validation_conflict("Can't specify no_detailed_git when git is disabled!"); 146 | } 147 | } 148 | 149 | setup_logging(args.verbose)?; 150 | 151 | let root = args.root.unwrap_or_else(|| PathBuf::from(".")); 152 | 153 | let features = FeatureFlags { 154 | git: !args.no_git, 155 | coupling: args.coupling, 156 | git_details: !(args.no_detailed_git || args.no_git), 157 | file_stats: !args.no_file_stats, 158 | }; 159 | 160 | let scanner_config = ScannerConfig { 161 | git_years: Some(args.git_years), 162 | data_id: args.id, 163 | name: args.name, 164 | follow_symlinks: args.follow_symlinks, 165 | features, 166 | }; 167 | 168 | let coupling_config = if args.coupling { 169 | Some(CouplingConfig::new( 170 | args.bucket_days, 171 | args.min_activity_bursts, 172 | args.min_coupling_ratio, 173 | args.min_activity_gap_minutes * 60, 174 | args.min_overlap_minutes * 60, 175 | args.coupling_min_distance, 176 | args.coupling_max_common_roots, 177 | )) 178 | } else { 179 | None 180 | }; 181 | 182 | let mut out: Box = if let Some(output) = args.output { 183 | Box::new(File::create(output)?) 184 | } else { 185 | Box::new(io::stdout()) 186 | }; 187 | 188 | let mut calculator_names: Vec<&str> = vec!["loc", "indentation"]; 189 | if !args.no_git { 190 | calculator_names.push("git"); 191 | } 192 | if !args.no_file_stats { 193 | calculator_names.push("file_stats"); 194 | } 195 | 196 | polyglot_code_scanner::run( 197 | &root, 198 | &scanner_config, 199 | coupling_config, 200 | &calculator_names, 201 | &mut out, 202 | )?; 203 | 204 | Ok(()) 205 | } 206 | -------------------------------------------------------------------------------- /src/flare.rs: -------------------------------------------------------------------------------- 1 | #![warn(clippy::all)] 2 | //! This is named 'Flare' as historically, the D3 hierarchical data files 3 | //! were called 'flare.json' and there was an implied data format. 4 | //! 5 | //! As of version 1.0.0 (when I started versioning!) of the data format, 6 | //! 
the syntax differs from D3 flare files, but I haven't renamed the module (yet) 7 | 8 | use serde::ser::SerializeStruct; 9 | use serde::{Serialize, Serializer}; 10 | use std::ffi::{OsStr, OsString}; 11 | 12 | use crate::coupling::SerializableCouplingData; 13 | use crate::file_stats::FileStats; 14 | use crate::git::GitNodeData; 15 | use crate::indentation::IndentationData; 16 | use crate::loc::LanguageLocData; 17 | 18 | pub static ROOT_NAME: &str = ""; 19 | 20 | #[derive(Debug, PartialEq, Clone, Default, Serialize)] 21 | pub struct IndicatorData { 22 | #[serde(skip_serializing_if = "Option::is_none")] 23 | pub git: Option<GitNodeData>, 24 | #[serde(skip_serializing_if = "Option::is_none")] 25 | pub indentation: Option<IndentationData>, 26 | #[serde(skip_serializing_if = "Option::is_none")] 27 | pub loc: Option<LanguageLocData>, 28 | #[serde(skip_serializing_if = "Option::is_none")] 29 | pub coupling: Option<SerializableCouplingData>, 30 | #[serde(skip_serializing_if = "Option::is_none")] 31 | pub file_stats: Option<FileStats>, 32 | } 33 | 34 | impl IndicatorData { 35 | fn is_empty(&self) -> bool { 36 | self.git.is_none() 37 | && self.indentation.is_none() 38 | && self.loc.is_none() 39 | && self.coupling.is_none() 40 | && self.file_stats.is_none() 41 | } 42 | } 43 | 44 | #[derive(Debug, PartialEq, Clone)] 45 | pub struct FlareTreeNode { 46 | name: OsString, 47 | is_file: bool, 48 | children: Vec<FlareTreeNode>, 49 | data: IndicatorData, 50 | } 51 | 52 | impl FlareTreeNode { 53 | pub fn name(&self) -> &OsString { 54 | &self.name 55 | } 56 | 57 | #[cfg(test)] 58 | pub fn set_name(&mut self, name: &OsStr) { 59 | self.name = name.to_owned(); 60 | } 61 | 62 | pub fn new(name: impl Into<OsString>, is_file: bool) -> Self { 63 | FlareTreeNode { 64 | name: name.into(), 65 | is_file, 66 | children: Vec::new(), 67 | 68 | data: IndicatorData::default(), 69 | } 70 | } 71 | 72 | #[cfg(test)] 73 | pub fn file(name: impl Into<OsString>) -> Self { 74 | Self::new(name, true) 75 | } 76 | 77 | #[cfg(test)] 78 | pub fn dir<S: Into<OsString>>(name: S) -> Self { 79 | Self::new(name, false) 80 | } 81 | 82 | pub fn indicators_mut(&mut self) -> &mut IndicatorData { 83 | &mut self.data 84 | } 85 | pub fn indicators(&self) -> &IndicatorData { 86 | &self.data 87 | } 88 | 89 | pub fn append_child(&mut self, child: FlareTreeNode) { 90 | assert!(!self.is_file, "appending child to a file: {:?}", self); 91 | self.children.push(child); // TODO - return self? 
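// (illustrative note, not in the original source: only directory nodes may hold
// children - the assert above fires if a child is ever appended to a file node)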
92 | } 93 | 94 | /// gets a tree entry by path, or None if something along the path doesn't exist 95 | #[allow(dead_code)] // used in tests 96 | pub fn get_in(&self, path: &mut std::path::Components<'_>) -> Option<&FlareTreeNode> { 97 | match path.next() { 98 | Some(first_name) => { 99 | let dir_name = first_name.as_os_str(); 100 | if !self.is_file { 101 | let first_match = self.children.iter().find(|c| dir_name == c.name)?; 102 | return first_match.get_in(path); 103 | } 104 | None 105 | } 106 | None => Some(self), 107 | } 108 | } 109 | 110 | /// gets a mutable tree entry by path, or None if something along the path doesn't exist 111 | pub fn get_in_mut( 112 | &mut self, 113 | path: &mut std::path::Components<'_>, 114 | ) -> Option<&mut FlareTreeNode> { 115 | match path.next() { 116 | Some(first_name) => { 117 | let dir_name = first_name.as_os_str(); 118 | if !self.is_file { 119 | let first_match = self.children.iter_mut().find(|c| dir_name == c.name)?; 120 | return first_match.get_in_mut(path); 121 | } 122 | None 123 | } 124 | None => Some(self), 125 | } 126 | } 127 | 128 | pub fn get_children(&self) -> &Vec<FlareTreeNode> { 129 | &self.children 130 | } 131 | 132 | // used only for postprocessing - could refactor - move functionality here 133 | pub fn get_children_mut(&mut self) -> &mut Vec<FlareTreeNode> { 134 | &mut self.children 135 | } 136 | } 137 | 138 | fn name_as_str<S: Serializer>(name: &OsStr) -> Result<&str, S::Error> { 139 | name.to_str().ok_or_else(|| { 140 | serde::ser::Error::custom(format!("name {:?} contains invalid UTF-8 characters", name)) 141 | }) 142 | } 143 | 144 | impl Serialize for FlareTreeNode { 145 | fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> 146 | where 147 | S: Serializer, 148 | { 149 | let mut state = serializer.serialize_struct("FlareTreeNode", 3)?; 150 | let name = name_as_str::<S>(&self.name)?; 151 | state.serialize_field("name", &name)?; 152 | if !self.data.is_empty() { 153 | state.serialize_field("data", &self.data)?; 154 | } 155 | if !self.is_file { 156 | state.serialize_field("children", &self.children)?; 157 | } 158 | 159 | state.end() 160 | } 161 | } 162 | 163 | #[cfg(test)] 164 | mod test { 165 | use super::*; 166 | use pretty_assertions::assert_eq; 167 | use serde_json::json; 168 | use std::path::Path; 169 | use test_shared::{assert_eq_json_str, assert_eq_json_value}; 170 | 171 | #[test] 172 | fn can_build_tree() { 173 | let mut root = FlareTreeNode::dir("root"); 174 | root.append_child(FlareTreeNode::file("child")); 175 | 176 | assert_eq!( 177 | root, 178 | FlareTreeNode { 179 | name: OsString::from("root"), 180 | is_file: false, 181 | children: vec![FlareTreeNode { 182 | name: OsString::from("child"), 183 | is_file: true, 184 | data: IndicatorData::default(), 185 | children: Vec::new(), 186 | }], 187 | 188 | data: IndicatorData::default(), 189 | } 190 | ); 191 | } 192 | 193 | fn build_test_tree() -> FlareTreeNode { 194 | let mut root = FlareTreeNode::dir("root"); 195 | root.append_child(FlareTreeNode::file("root_file_1.txt")); 196 | root.append_child(FlareTreeNode::file("root_file_2.txt")); 197 | let mut child1 = FlareTreeNode::dir("child1"); 198 | child1.append_child(FlareTreeNode::file("child1_file_1.txt")); 199 | let mut grand_child = FlareTreeNode::dir("grandchild"); 200 | grand_child.append_child(FlareTreeNode::file("grandchild_file.txt")); 201 | child1.append_child(grand_child); 202 | child1.append_child(FlareTreeNode::file("child1_file_2.txt")); 203 | let mut child2 = FlareTreeNode::dir("child2"); 204 | let child2_file = FlareTreeNode::file("child2_file.txt"); 205 | 
child2.append_child(child2_file); 206 | root.append_child(child1); 207 | root.append_child(child2); 208 | root 209 | } 210 | 211 | #[test] 212 | fn can_get_elements_from_tree() { 213 | let tree = build_test_tree(); 214 | 215 | let mut path = std::path::Path::new("child1/grandchild/grandchild_file.txt").components(); 216 | let grandchild = tree.get_in(&mut path); 217 | assert_eq!( 218 | grandchild.expect("Grandchild not found!").name(), 219 | "grandchild_file.txt" 220 | ); 221 | } 222 | 223 | #[test] 224 | fn can_get_top_level_element_from_tree() { 225 | let tree = build_test_tree(); 226 | 227 | let mut path = std::path::Path::new("child1").components(); 228 | let child1 = tree.get_in(&mut path); 229 | assert_eq!(child1.expect("child1 not found!").name(), "child1"); 230 | 231 | let mut path2 = std::path::Path::new("root_file_1.txt").components(); 232 | let child2 = tree.get_in(&mut path2); 233 | assert_eq!( 234 | child2.expect("root_file_1 not found!").name(), 235 | "root_file_1.txt" 236 | ); 237 | } 238 | 239 | #[test] 240 | fn getting_missing_elements_returns_none() { 241 | let tree = build_test_tree(); 242 | let mut path = std::path::Path::new("child1/grandchild/nonesuch").components(); 243 | let missing = tree.get_in(&mut path); 244 | assert!(missing.is_none()); 245 | 246 | let mut path2 = 247 | Path::new("child1/grandchild/grandchild_file.txt/files_have_no_kids").components(); 248 | let missing2 = tree.get_in(&mut path2); 249 | assert!(missing2.is_none()); 250 | 251 | let mut path3 = Path::new("no_file_at_root").components(); 252 | let missing3 = tree.get_in(&mut path3); 253 | assert!(missing3.is_none()); 254 | } 255 | 256 | #[test] 257 | fn can_get_mut_elements_from_tree() { 258 | let mut tree = build_test_tree(); 259 | let grandchild = tree 260 | .get_in_mut(&mut Path::new("child1/grandchild/grandchild_file.txt").components()) 261 | .expect("Grandchild not found!"); 262 | assert_eq!(grandchild.name(), "grandchild_file.txt"); 263 | grandchild.name = OsString::from("fish"); 264 | let grandchild2 = tree.get_in_mut(&mut Path::new("child1/grandchild/fish").components()); 265 | assert_eq!(grandchild2.expect("fish not found!").name(), "fish"); 266 | 267 | let grandchild_dir = tree 268 | .get_in_mut(&mut Path::new("child1/grandchild").components()) 269 | .expect("Grandchild dir not found!"); 270 | assert_eq!(grandchild_dir.name(), "grandchild"); 271 | grandchild_dir.append_child(FlareTreeNode::file("new_kid_on_the_block.txt")); 272 | let new_kid = tree 273 | .get_in_mut(&mut Path::new("child1/grandchild/new_kid_on_the_block.txt").components()) 274 | .expect("New kid not found!"); 275 | assert_eq!(new_kid.name(), "new_kid_on_the_block.txt"); 276 | } 277 | 278 | #[test] 279 | fn can_serialize_directory_to_json() { 280 | let root = FlareTreeNode::dir("root"); 281 | 282 | assert_eq_json_str( 283 | &root, 284 | r#"{ 285 | "name":"root", 286 | "children": [] 287 | }"#, 288 | ); 289 | } 290 | 291 | #[test] 292 | fn can_serialize_file_to_json() { 293 | let file = FlareTreeNode::file("foo.txt"); 294 | 295 | assert_eq_json_str( 296 | &file, 297 | r#"{ 298 | "name":"foo.txt" 299 | }"#, 300 | ); 301 | } 302 | 303 | #[test] 304 | fn can_serialize_simple_tree_to_json() { 305 | let mut root = FlareTreeNode::dir("root"); 306 | root.append_child(FlareTreeNode::file("child.txt")); 307 | root.append_child(FlareTreeNode::dir("child2")); 308 | 309 | assert_eq_json_value( 310 | &root, 311 | &json!({ 312 | "name":"root", 313 | "children":[ 314 | { 315 | "name": "child.txt" 316 | }, 317 | { 318 | "name":"child2", 
319 | "children":[] 320 | } 321 | ] 322 | }), 323 | ); 324 | } 325 | 326 | #[test] 327 | fn can_serialize_simple_polyglot_data_to_json() { 328 | let mut root = FlareTreeNode::dir("root"); 329 | root.append_child(FlareTreeNode::file("child.txt")); 330 | root.append_child(FlareTreeNode::dir("child2")); 331 | 332 | assert_eq_json_value( 333 | &root, 334 | &json!({ 335 | "name":"root", 336 | "children":[ 337 | { 338 | "name": "child.txt" 339 | }, 340 | { 341 | "name":"child2", 342 | "children":[] 343 | } 344 | ] 345 | }), 346 | ); 347 | } 348 | } 349 | -------------------------------------------------------------------------------- /src/git_file_history.rs: -------------------------------------------------------------------------------- 1 | #![warn(clippy::all)] 2 | use crate::git_logger::{CommitChange, FileChange, GitLog, GitLogEntry, User}; 3 | use anyhow::Error; 4 | use chrono::offset::TimeZone; 5 | use chrono::Utc; 6 | use git2::Oid; 7 | use indicatif::{ProgressBar, ProgressStyle}; 8 | use serde::Serialize; 9 | use std::collections::HashMap; 10 | use std::convert::TryInto; 11 | use std::path::Path; 12 | use std::path::PathBuf; 13 | 14 | /// For each file we just keep a simplified history - what the changes were, by whom, and when. 15 | #[derive(Debug, Serialize, Builder)] 16 | #[builder(setter(into), pattern = "owned")] 17 | pub struct FileHistoryEntry { 18 | pub id: String, 19 | pub committer: User, 20 | pub commit_time: u64, 21 | pub author: User, 22 | pub author_time: u64, 23 | pub co_authors: Vec<User>, 24 | pub change: CommitChange, 25 | pub lines_added: u64, 26 | pub lines_deleted: u64, 27 | } 28 | 29 | impl FileHistoryEntry { 30 | fn from(entry: &GitLogEntry, file_change: &FileChange) -> FileHistoryEntry { 31 | let entry = entry.clone(); 32 | let file_change = file_change.clone(); 33 | FileHistoryEntry { 34 | id: entry.id().clone(), 35 | committer: entry.committer().clone(), 36 | commit_time: *entry.commit_time(), 37 | author: entry.author().clone(), 38 | author_time: *entry.author_time(), 39 | co_authors: entry.co_authors().clone(), 40 | change: *file_change.change(), 41 | lines_added: *file_change.lines_added(), 42 | lines_deleted: *file_change.lines_deleted(), 43 | } 44 | } 45 | } 46 | 47 | #[cfg(test)] 48 | impl FileHistoryEntryBuilder { 49 | pub fn test_default() -> Self { 50 | FileHistoryEntryBuilder::default() 51 | .co_authors(Vec::new()) 52 | .change(CommitChange::Add) 53 | .lines_added(0u64) 54 | .lines_deleted(0u64) 55 | } 56 | pub fn emails(self, email: &str) -> Self { 57 | self.committer(User::new(None, Some(email))) 58 | .author(User::new(None, Some(email))) 59 | } 60 | 61 | pub fn times(self, time: u64) -> Self { 62 | self.commit_time(time).author_time(time) 63 | } 64 | } 65 | 66 | #[derive(Debug, Serialize)] 67 | pub struct GitFileHistory { 68 | /// repo work dir - always canonical 69 | workdir: PathBuf, 70 | history_by_file: HashMap<PathBuf, Vec<FileHistoryEntry>>, 71 | last_commit: u64, 72 | } 73 | 74 | impl GitFileHistory { 75 | pub fn new(log: &mut GitLog) -> Result<GitFileHistory, Error> { 76 | let mut last_commit: u64 = 0; 77 | let mut history_by_file = HashMap::<PathBuf, Vec<FileHistoryEntry>>::new(); 78 | info!("Loading git log"); 79 | let progress_bar = ProgressBar::new_spinner() 80 | .with_style(ProgressStyle::default_spinner().template("[{elapsed}] {msg}")?); 81 | progress_bar.tick(); 82 | // TODO: this was removed in indicatif 0.17 - do we need it? 83 | // see https://github.com/console-rs/indicatif/issues/393 84 | // progress_bar.set_draw_delta(100); 85 | 86 | // for handling renames, this needs to be a 2-pass process 87 | 88 | // This is ugly! 
I need to think about cleaning it up, probably in one of two ways: 89 | // 1. ditch the whole "expose an iterator" interface - if we're loading it all into memory anyway, there's no point, could make the code cleaner and maybe get rid of the ugly use of Rc<RefCell<...>> 90 | // 2. fully split the parsing into two passes, one to get parent/child info and one to get file summary. This would use less memory - but might be slower? YAGNI I think. 91 | 92 | let log_iterator = log.iterator()?; 93 | // I can't find a cleaner way for an iterator to have side effects 94 | let git_file_future_registry = log_iterator.git_file_future_registry(); 95 | let mut progress_last_updated: u64 = 0; 96 | let log_entries: Vec<Result<GitLogEntry, Error>> = log_iterator 97 | // .progress_with(progress_bar) 98 | .inspect(|entry| { 99 | if let Ok(entry) = entry { 100 | let commit_time = *entry.commit_time(); 101 | // eprintln!("plu {} ct {}", progress_last_updated, commit_time); 102 | if progress_last_updated == 0 // never shown 103 | || (commit_time > progress_last_updated) // time gone backwards 104 | || (progress_last_updated - commit_time) > 60 * 60 105 | // more than an hour change 106 | { 107 | let fmt_time = Utc.timestamp(commit_time as i64, 0).to_string(); 108 | progress_bar.set_message(fmt_time); 109 | progress_last_updated = commit_time; 110 | progress_bar.inc(1); 111 | } 112 | } 113 | }) 114 | .collect(); 115 | progress_bar.finish(); 116 | 117 | // safe to borrow this now as the iterator has gone and can't mutate any more 118 | let git_file_future_registry = git_file_future_registry.borrow(); 119 | 120 | info!("Processing git log with {} entries", log_entries.len()); 121 | let entrybar = ProgressBar::new(log_entries.len().try_into()?); 122 | for entry in log_entries { 123 | entrybar.tick(); 124 | match entry { 125 | Ok(entry) => { 126 | let commit_time = *entry.commit_time(); 127 | // let fmt_time = Utc.timestamp(commit_time as i64, 0).to_string(); 128 | // progress_bar.set_message(&fmt_time); 129 | if commit_time > last_commit { 130 | last_commit = commit_time; 131 | } 132 | for file_change in entry.clone().file_changes() { 133 | // TODO: use Oids so we don't need ugly conversion. 
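// `final_name` (below) resolves the path a file had at this commit to the name it
// ends up with at HEAD, following any later renames; it returns None if the file
// is eventually deleted, so history is only stored under each file's final name.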
134 | let final_filename = git_file_future_registry 135 | .final_name(&Oid::from_str(entry.id()).unwrap(), file_change.file()); 136 | if let Some(filename) = final_filename { 137 | let hash_entry = 138 | history_by_file.entry(filename).or_insert_with(Vec::new); 139 | let new_entry = FileHistoryEntry::from(&entry, file_change); 140 | hash_entry.push(new_entry); 141 | } else { 142 | trace!( 143 | "Not storing history for deleted file {:?}", 144 | file_change.file() 145 | ); 146 | } 147 | } 148 | } 149 | Err(e) => { 150 | warn!("Ignoring invalid git log entry: {:?}", e); 151 | } 152 | } 153 | } 154 | entrybar.finish(); 155 | 156 | Ok(GitFileHistory { 157 | workdir: log.workdir().to_owned(), 158 | history_by_file, 159 | last_commit, 160 | }) 161 | } 162 | 163 | /// true if this repo is valid for this file - file must exist (as we canonicalize it) 164 | pub fn is_repo_for(&self, file: &Path) -> Result<bool, Error> { 165 | let canonical_file = file.canonicalize()?; 166 | Ok(canonical_file.starts_with(&self.workdir)) 167 | } 168 | 169 | /// get git history for this file - file must exist (as we canonicalize it) 170 | pub fn history_for(&self, file: &Path) -> Result<Option<&Vec<FileHistoryEntry>>, Error> { 171 | let canonical_file = file.canonicalize()?; 172 | let relative_file = canonical_file.strip_prefix(&self.workdir)?; 173 | Ok(self.history_by_file.get(relative_file)) 174 | } 175 | 176 | pub fn last_commit(&self) -> u64 { 177 | self.last_commit 178 | } 179 | } 180 | 181 | #[cfg(test)] 182 | mod test { 183 | use super::*; 184 | use crate::git_logger::GitLogConfig; 185 | use pretty_assertions::assert_eq; 186 | use tempfile::tempdir; 187 | use test_shared::{assert_eq_json_file, unzip_test_sample}; 188 | 189 | #[test] 190 | fn can_get_log_by_filename() -> Result<(), Error> { 191 | let gitdir = tempdir()?; 192 | let git_root = unzip_test_sample("git_sample", gitdir.path())?; 193 | 194 | let mut git_log = GitLog::new(&git_root, GitLogConfig::default())?; 195 | 196 | let history = GitFileHistory::new(&mut git_log)?; 197 | 198 | assert_eq!(history.workdir.canonicalize()?, git_root.canonicalize()?); 199 | 200 | // assert_eq_json_str(&history.history_by_file, "{}"); 201 | assert_eq_json_file( 202 | &history.history_by_file, 203 | "./tests/expected/git/git_sample_by_filename.json", 204 | ); 205 | 206 | Ok(()) 207 | } 208 | 209 | #[test] 210 | fn can_tell_if_file_is_in_git_repo() -> Result<(), Error> { 211 | let gitdir = tempdir()?; 212 | let git_root = unzip_test_sample("git_sample", gitdir.path())?; 213 | 214 | let mut git_log = GitLog::new(&git_root, GitLogConfig::default())?; 215 | 216 | let history = GitFileHistory::new(&mut git_log)?; 217 | 218 | assert!(history.is_repo_for(&git_root.join("simple/parent.clj"))?); 219 | 220 | Ok(()) 221 | } 222 | 223 | #[test] 224 | fn can_get_history_for_file() -> Result<(), Error> { 225 | let gitdir = tempdir()?; 226 | let git_root = unzip_test_sample("git_sample", gitdir.path())?; 227 | 228 | let mut git_log = GitLog::new(&git_root, GitLogConfig::default())?; 229 | 230 | let history = GitFileHistory::new(&mut git_log)?; 231 | 232 | let file_history = history.history_for(&git_root.join("simple/parent.clj"))?; 233 | 234 | assert!(file_history.is_some()); 235 | 236 | let ids: Vec<_> = file_history.unwrap().iter().map(|h| &h.id).collect(); 237 | assert_eq!( 238 | ids, 239 | vec![ 240 | "0dbd54d4c524ecc776f381e660cce9b2dd92162c", 241 | "a0ae9997cfdf49fd0cbf54dacc72c778af337519", 242 | "ca239efb9b26db57ac9e2ec3e2df1c42578a46f8" 243 | ] 244 | ); 245 | 246 | assert_eq!(history.last_commit(), 1_558_533_240); 247 | 
248 | Ok(()) 249 | } 250 | 251 | #[test] 252 | fn no_history_for_files_not_known() -> Result<(), Error> { 253 | let gitdir = tempdir()?; 254 | let git_root = unzip_test_sample("git_sample", gitdir.path())?; 255 | 256 | let mut git_log = GitLog::new(&git_root, GitLogConfig::default())?; 257 | 258 | let history = GitFileHistory::new(&mut git_log)?; 259 | 260 | let new_file = git_root.join("simple/nonesuch.clj"); 261 | std::fs::File::create(&new_file)?; 262 | 263 | let file_history = history.history_for(&new_file)?; 264 | 265 | assert!(file_history.is_none()); 266 | 267 | Ok(()) 268 | } 269 | 270 | #[test] 271 | fn can_get_history_for_complex_renamed_files() -> Result<(), Error> { 272 | let gitdir = tempdir()?; 273 | let git_root = unzip_test_sample("rename_complex", gitdir.path())?; 274 | /* 275 | This is generated by the script in tests/data/builders/renaming/build_rename_complex.sh 276 | 277 | log is: 278 | 279 | * 3629e5a (HEAD -> master) restoring deleted z 280 | * 261e027 merging dave work with fixes 281 | |\ 282 | | * c3b47c3 (dave_work) rename bb to b, a2 back to a 283 | | * 500a621 rename a1 to a2, add bb, kill z 284 | * | fac9419 merging jay work 285 | |\ \ 286 | | * | 34b904b (jay_work) rename bee to b, aa back to a 287 | | * | 3bd2d90 rename a1 to aa, add bee 288 | | |/ 289 | * | 8be47df rename a1 back to a prep merging 290 | |/ 291 | * 388e644 rename a to a1 292 | * bd6d7df initial commit 293 | */ 294 | 295 | let mut git_log = GitLog::new(&git_root, GitLogConfig::default())?; 296 | 297 | let history = GitFileHistory::new(&mut git_log)?; 298 | 299 | let file_history = history.history_for(&git_root.join("a.txt"))?; 300 | 301 | let ids: Vec<_> = file_history.unwrap().iter().map(|h| &h.id).collect(); 302 | assert_eq!( 303 | ids, 304 | // all of these refs have a file that ends up being "a.txt" via renames and merges: 305 | vec![ 306 | "c3b47c335ebd9dbb9b0c9922bc258555a2cf71c9", 307 | "500a621e9e83612f51dbce15202cd7bef3c88f00", 308 | "34b904b010abf316167bba7a7ce2b4a5996cc0d1", 309 | "3bd2d9088ee5b051ada1bd30f07e7bcd390f6327", 310 | "8be47dfc0a25ec27941413619f632a1fa66e5ba5", 311 | "388e644e9240aa333fe669069bb00d418ffca500", 312 | "bd6d7dfa063ec95ebc3bad7bffd4262e3702b77c", 313 | ] 314 | ); 315 | 316 | Ok(()) 317 | } 318 | 319 | #[test] 320 | fn deleted_files_dont_have_history() -> Result<(), Error> { 321 | let gitdir = tempdir()?; 322 | let git_root = unzip_test_sample("rename_complex", gitdir.path())?; 323 | 324 | let mut git_log = GitLog::new(&git_root, GitLogConfig::default())?; 325 | 326 | let history = GitFileHistory::new(&mut git_log)?; 327 | 328 | let file_history = history.history_for(&git_root.join("z.txt"))?; 329 | 330 | assert!(file_history.is_some()); 331 | 332 | let ids: Vec<_> = file_history.unwrap().iter().map(|h| &h.id).collect(); 333 | assert_eq!( 334 | ids, 335 | // z.txt's history only includes the final commit that restored it, not the earlier file that was deleted. 
336 | vec!["3629e5a8d8d7547bac749530eb540d0f61535cd1",] 337 | ); 338 | 339 | Ok(()) 340 | } 341 | } 342 | -------------------------------------------------------------------------------- /src/git.rs: -------------------------------------------------------------------------------- 1 | use crate::flare::FlareTreeNode; 2 | use crate::git_file_history::{FileHistoryEntry, GitFileHistory}; 3 | use crate::git_logger::{CommitChange, GitLog, GitLogConfig, User}; 4 | use crate::git_user_dictionary::GitUserDictionary; 5 | use crate::polyglot_data::GitMetadata; 6 | use crate::toxicity_indicator_calculator::ToxicityIndicatorCalculator; 7 | use anyhow::{Context, Error}; 8 | use chrono::{NaiveDateTime, NaiveTime}; 9 | 10 | use serde::{Deserialize, Serialize}; 11 | 12 | use std::cmp::Ordering; 13 | use std::collections::HashSet; 14 | use std::collections::{BTreeSet, HashMap}; 15 | use std::iter::once; 16 | 17 | use std::path::Path; 18 | 19 | use git2::Repository; 20 | 21 | /// a struct representing git data for a file 22 | #[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone)] 23 | pub struct GitData { 24 | pub last_update: u64, 25 | pub age_in_days: u64, 26 | // we only have a creation date if there was an Add change in the dates scanned 27 | pub creation_date: Option<u64>, 28 | pub user_count: usize, 29 | pub users: Vec<usize>, // dictionary IDs 30 | pub details: Vec<GitDetails>, 31 | pub activity: Vec<GitActivity>, 32 | } 33 | 34 | /// Git information for a given day _and_ unique set of users, summarized 35 | /// New as of 0.3.3 - we now generate new `GitDetails` per user set - the file format hasn't changed but 36 | /// instead of a single `GitDetails` per day, there might be multiple. 37 | /// Also dates are summarized by "author date" - had to pick author or commit date, and 38 | /// author dates seem more reliable. But it's named "`commit_day`" as that's more understandable 39 | /// WIP: for better coupling data, I want individual commits, rather than summarizing per day. 40 | #[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone)] 41 | pub struct GitDetails { 42 | /// Note this is based on "author date" - commit dates can be all over the place with PRs, rebasing and the like. 
43 | pub commit_day: u64, 44 | pub users: BTreeSet<usize>, // dictionary IDs, ordered 45 | pub commits: u64, 46 | pub lines_added: u64, 47 | pub lines_deleted: u64, 48 | } 49 | 50 | impl Ord for GitDetails { 51 | fn cmp(&self, other: &Self) -> Ordering { 52 | let day_ordering = self.commit_day.cmp(&other.commit_day); 53 | if day_ordering != Ordering::Equal { 54 | return day_ordering; 55 | } 56 | self.users.cmp(&other.users) 57 | } 58 | } 59 | 60 | impl PartialOrd for GitDetails { 61 | fn partial_cmp(&self, other: &Self) -> Option<Ordering> { 62 | Some(self.cmp(other)) 63 | } 64 | } 65 | 66 | /// this is the key to keep details stored uniquely 67 | #[derive(Debug, PartialEq, Eq, Hash)] 68 | struct GitDetailsKey { 69 | pub commit_day: u64, 70 | pub users: BTreeSet<usize>, 71 | } 72 | 73 | /// Fine-grained git activity, for the fine-grained coupling calculations 74 | /// this is very verbose so probably shouldn't be kept in final JSON 75 | #[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone)] 76 | pub struct GitActivity { 77 | pub author_time: u64, 78 | pub commit_time: u64, 79 | pub users: BTreeSet<usize>, // dictionary IDs 80 | pub change: CommitChange, 81 | pub lines_added: u64, 82 | pub lines_deleted: u64, 83 | } 84 | impl Ord for GitActivity { 85 | fn cmp(&self, other: &Self) -> Ordering { 86 | self.commit_time.cmp(&other.commit_time) 87 | } 88 | } 89 | 90 | impl PartialOrd for GitActivity { 91 | fn partial_cmp(&self, other: &Self) -> Option<Ordering> { 92 | Some(self.cmp(other)) 93 | } 94 | } 95 | 96 | /// History of any git roots discovered by the calculator 97 | /// Split from `GitCalculator` as we need to mutate the dictionary while borrowing the history immutably 98 | #[derive(Debug)] 99 | pub struct GitHistories { 100 | git_file_histories: Vec<GitFileHistory>, 101 | /// config used to initialize any git histories 102 | git_log_config: GitLogConfig, 103 | } 104 | 105 | #[derive(Debug)] 106 | pub struct GitCalculator { 107 | histories: GitHistories, 108 | dictionary: GitUserDictionary, 109 | } 110 | 111 | // Git data for a directory - just remote git info 112 | #[derive(Debug, Clone, PartialEq, Eq, Serialize)] 113 | pub struct GitInfo { 114 | pub remote_url: Option<String>, 115 | pub head: Option<String>, 116 | } 117 | 118 | // Git data for a file _or_ a directory 119 | #[derive(Debug, PartialEq, Eq, Clone, Serialize)] 120 | #[serde(untagged)] 121 | pub enum GitNodeData { 122 | File { 123 | #[serde(flatten)] 124 | data: GitData, 125 | }, 126 | Dir { 127 | #[serde(flatten)] 128 | data: GitInfo, 129 | }, 130 | } 131 | 132 | fn repository_head(repository: &Repository) -> Result<String, Error> { 133 | let head = repository.head()?; 134 | let head_ref = head.resolve()?; 135 | Ok(head_ref.peel_to_commit()?.id().to_string()) 136 | } 137 | 138 | impl GitInfo { 139 | pub fn new(path: &Path, repository: &Repository) -> Self { 140 | let remote = repository.find_remote("origin"); 141 | let remote_url = match remote { 142 | Err(e) => { 143 | warn!("Error fetching origin for {:?}: {}", path, e); 144 | None 145 | } 146 | Ok(remote) => remote.url().map(str::to_owned), 147 | }; 148 | let head = match repository_head(repository) { 149 | Err(e) => { 150 | warn!("Error fetching head for {:?}: {}", path, e); 151 | None 152 | } 153 | Ok(head) => Some(head), 154 | }; 155 | GitInfo { remote_url, head } 156 | } 157 | } 158 | 159 | fn start_of_day(secs_since_epoch: u64) -> u64 { 160 | let date_time = NaiveDateTime::from_timestamp(secs_since_epoch as i64, 0); 161 | date_time 162 | .date() 163 | .and_time(NaiveTime::from_num_seconds_from_midnight(0, 0)) 164 | .timestamp() as u64 165 | }
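// A quick worked example (not from the original source) of what start_of_day
// computes: 90_000 seconds after the epoch is 01:00:00 UTC on 1970-01-02,
// which truncates to midnight of that day, i.e. 86_400 = 60 * 60 * 24:
//
//     assert_eq!(start_of_day(90_000), 86_400);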
166 | impl GitHistories { 167 | fn git_history(&self, filename: &Path) -> Option<&GitFileHistory> { 168 | self.git_file_histories 169 | .iter() 170 | .find(|h| h.is_repo_for(filename).unwrap()) 171 | // TODO can we get rid of unwrap here? 172 | // it's tricky as we can't return a Result. 173 | } 174 | 175 | fn add_history_for(&mut self, filename: &Path) -> Result<(), Error> { 176 | info!("Adding new git log for {:?}", &filename); 177 | let mut git_log = GitLog::new(filename, self.git_log_config)?; 178 | info!("Found working dir: {:?}", git_log.workdir()); 179 | let history = GitFileHistory::new(&mut git_log)?; 180 | self.git_file_histories.push(history); 181 | Ok(()) 182 | } 183 | fn unique_changers( 184 | history: &FileHistoryEntry, 185 | dictionary: &mut GitUserDictionary, 186 | ) -> BTreeSet<usize> { 187 | let mut users: Vec<&User> = history 188 | .co_authors 189 | .iter() 190 | .chain(once(&history.author)) 191 | .chain(once(&history.committer)) 192 | .collect(); 193 | users.sort(); 194 | users.dedup(); 195 | // this used to use a HashSet but I want deterministic ordering and so I want it in a vec anyway 196 | users.into_iter().map(|u| dictionary.register(u)).collect() 197 | } 198 | 199 | fn stats_from_history( 200 | dictionary: &mut GitUserDictionary, 201 | last_commit: u64, 202 | history: &[FileHistoryEntry], 203 | ) -> Option<GitData> { 204 | // for now, just get latest change - maybe non-trivial change? (i.e. ignore rename/copy) - or this could be configurable 205 | // and get set of all authors - maybe deduplicate by email. 206 | if history.is_empty() { 207 | return None; 208 | } 209 | let mut details: HashMap<GitDetailsKey, GitDetails> = HashMap::new(); 210 | 211 | let first_date = history.iter().map(|h| h.author_time).min(); 212 | 213 | let mut creation_date = history 214 | .iter() 215 | .filter(|h| h.change == CommitChange::Add) 216 | .map(|h| h.author_time) 217 | .min(); 218 | 219 | if let Some(creation) = creation_date { 220 | // TODO: test this! 
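// Sanity check (a reading of the code below, not an original comment): if any
// change predates the earliest Add we saw, that Add can't be the file's real
// creation - e.g. the true Add predates the scanned log window - so the
// creation date is treated as unknown.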
221 | if first_date.unwrap() < creation { 222 | debug!( 223 | "File has a git date {:?} before the first Add operation {:?}", 224 | first_date.unwrap(), 225 | creation 226 | ); 227 | creation_date = None; 228 | } 229 | } 230 | 231 | let last_update = history.iter().map(|h| h.commit_time).max()?; 232 | 233 | let age_in_days = (last_commit - last_update) / (60 * 60 * 24); 234 | 235 | let changers: HashSet<usize> = history 236 | .iter() 237 | .flat_map(|h| GitHistories::unique_changers(h, dictionary)) 238 | .collect(); 239 | 240 | let mut activity_vec: Vec<GitActivity> = Vec::new(); 241 | 242 | for entry in history { 243 | let author_day = start_of_day(entry.author_time); 244 | let unique_changers = GitHistories::unique_changers(entry, dictionary); 245 | let key = GitDetailsKey { 246 | commit_day: author_day, 247 | users: unique_changers.clone(), 248 | }; 249 | let daily_details = details.entry(key).or_insert(GitDetails { 250 | commit_day: author_day, 251 | users: unique_changers.clone(), 252 | commits: 0, 253 | lines_added: 0, 254 | lines_deleted: 0, 255 | }); 256 | daily_details.commits += 1; 257 | daily_details 258 | .users 259 | .extend(unique_changers.clone().into_iter()); 260 | daily_details.lines_added += entry.lines_added; 261 | daily_details.lines_deleted += entry.lines_deleted; 262 | 263 | let activity: GitActivity = GitActivity { 264 | commit_time: entry.commit_time, 265 | author_time: entry.author_time, 266 | users: unique_changers, 267 | change: entry.change, 268 | lines_added: entry.lines_added, 269 | lines_deleted: entry.lines_deleted, 270 | }; 271 | activity_vec.push(activity); 272 | } 273 | 274 | let mut changer_list: Vec<usize> = changers.into_iter().collect(); 275 | changer_list.sort_unstable(); 276 | 277 | let mut details_vec: Vec<GitDetails> = details 278 | .into_iter() 279 | .map(|(_k, v)| v) 280 | .collect::<Vec<GitDetails>>(); 281 | details_vec.sort(); 282 | 283 | Some(GitData { 284 | last_update, 285 | age_in_days, 286 | creation_date, 287 | user_count: changer_list.len(), 288 | users: changer_list, 289 | details: details_vec, 290 | activity: activity_vec, 291 | }) 292 | } 293 | } 294 | 295 | impl GitCalculator { 296 | pub fn new(config: GitLogConfig) -> Self { 297 | GitCalculator { 298 | histories: GitHistories { 299 | git_file_histories: Vec::new(), 300 | git_log_config: config, 301 | }, 302 | dictionary: GitUserDictionary::default(), 303 | } 304 | } 305 | } 306 | 307 | impl ToxicityIndicatorCalculator for GitCalculator { 308 | fn name(&self) -> String { 309 | "git".to_string() 310 | } 311 | fn visit_node(&mut self, node: &mut FlareTreeNode, path: &Path) -> Result<(), Error> { 312 | if path.is_file() { 313 | // TODO: refactor this into a method on histories (I tried this but got into a mess with mutable and immutable refs to self!) 
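// Histories are loaded lazily, one per discovered repository: reuse the first
// GitFileHistory whose working directory contains this file, otherwise build a
// new git log rooted at the file's repository and look the history up again.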
314 | let history = match self.histories.git_history(path) { 315 | Some(history) => history, 316 | None => { 317 | info!("Loading git history for {}", path.display()); 318 | self.histories 319 | .add_history_for(path) 320 | .with_context(|| format!("Loading git history based on {:?}", path))?; 321 | info!("history loaded."); 322 | self.histories.git_history(path).unwrap() 323 | } 324 | }; 325 | let last_commit = history.last_commit(); 326 | let file_history = history 327 | .history_for(path) 328 | .with_context(|| format!("getting git file history for {:?}", path))?; 329 | 330 | if let Some(file_history) = file_history { 331 | let stats = GitHistories::stats_from_history( 332 | &mut self.dictionary, 333 | last_commit, 334 | file_history, 335 | ); 336 | node.indicators_mut().git = stats.map(|stats| GitNodeData::File { data: stats }); 337 | } else { 338 | // probably outside date range 339 | debug!("No git history found for file: {:?}", path); 340 | } 341 | } else { 342 | let git_path = path.join(".git"); 343 | if git_path.is_dir() { 344 | match Repository::discover(path) { 345 | Ok(repository) => { 346 | let info = GitInfo::new(path, &repository); 347 | node.indicators_mut().git = Some(GitNodeData::Dir { data: info }); 348 | } 349 | Err(e) => { 350 | warn!( 351 | "Can't find git repository at {:?}, {} - ignoring .git directory", 352 | path, e 353 | ); 354 | } 355 | } 356 | } 357 | } 358 | Ok(()) 359 | } 360 | 361 | fn apply_metadata( 362 | &self, 363 | metadata: &mut crate::polyglot_data::IndicatorMetadata, 364 | ) -> Result<(), Error> { 365 | metadata.git = Some(GitMetadata { 366 | users: self.dictionary.clone(), 367 | }); 368 | Ok(()) 369 | } 370 | } 371 | 372 | // Hacky - I need this constructor for coupling tests, until I build better integration tests 373 | #[cfg(test)] 374 | impl GitData { 375 | pub fn fake_with_activity(activity: Vec<GitActivity>) -> Self { 376 | Self { 377 | last_update: 0, 378 | age_in_days: 0, 379 | creation_date: None, 380 | user_count: 0, 381 | users: Vec::new(), 382 | details: Vec::new(), 383 | activity, 384 | } 385 | } 386 | } 387 | 388 | #[cfg(test)] 389 | mod test { 390 | use super::*; 391 | use crate::git_file_history::FileHistoryEntryBuilder; 392 | use crate::git_logger::{CommitChange, User}; 393 | use pretty_assertions::assert_eq; 394 | 395 | lazy_static! 
{ 396 | static ref USER_JO: User = User::new(None, Some("jo@smith.com")); 397 | static ref USER_X: User = User::new(None, Some("x@smith.com")); 398 | static ref USER_Y: User = User::new(Some("Why"), Some("y@smith.com")); 399 | } 400 | 401 | #[test] 402 | fn gets_basic_stats_from_git_events() -> Result<(), Error> { 403 | let one_day_in_secs: u64 = 60 * 60 * 24; 404 | 405 | let first_day = one_day_in_secs; 406 | 407 | let events: Vec<FileHistoryEntry> = vec![ 408 | FileHistoryEntryBuilder::test_default() 409 | .emails("jo@smith.com") 410 | .times(first_day) 411 | .id("1111") 412 | .build() 413 | .map_err(Error::msg)?, 414 | FileHistoryEntryBuilder::test_default() 415 | .emails("x@smith.com") 416 | .times(first_day + 3 * one_day_in_secs) 417 | .author(User::new(Some("Why"), Some("y@smith.com"))) 418 | .id("2222") 419 | .build() 420 | .map_err(Error::msg)?, 421 | ]; 422 | let mut dictionary = GitUserDictionary::default(); 423 | 424 | let today = first_day + 5 * one_day_in_secs; 425 | 426 | let stats = GitHistories::stats_from_history(&mut dictionary, today, &events).unwrap(); 427 | 428 | assert_eq!(stats.last_update, first_day + 3 * one_day_in_secs); 429 | assert_eq!(stats.age_in_days, 2); 430 | assert_eq!(stats.creation_date, Some(86400)); 431 | assert_eq!(stats.user_count, 3); 432 | assert_eq!(stats.users, vec![0, 1, 2]); 433 | // don't assert details - details used to be optional, so it is tested in the next test. 434 | 435 | assert_eq!(dictionary.user_count(), 3); 436 | assert_eq!(dictionary.user_id(&USER_JO), Some(&0)); 437 | assert_eq!(dictionary.user_id(&USER_X), Some(&1)); 438 | assert_eq!(dictionary.user_id(&USER_Y), Some(&2)); 439 | 440 | Ok(()) 441 | } 442 | 443 | #[test] 444 | fn gets_detailed_stats_from_git_events() -> Result<(), Error> { 445 | let one_day_in_secs: u64 = 60 * 60 * 24; 446 | 447 | let first_day = one_day_in_secs; 448 | 449 | let events: Vec<FileHistoryEntry> = vec![ 450 | FileHistoryEntryBuilder::test_default() 451 | .emails("jo@smith.com") 452 | .times(first_day) 453 | .id("1111") 454 | .build() 455 | .map_err(Error::msg)?, 456 | FileHistoryEntryBuilder::test_default() 457 | .emails("jo@smith.com") 458 | .times(first_day) 459 | .author(User::new(Some("Why"), Some("y@smith.com"))) // second author so new stats 460 | .id("1111") 461 | .build() 462 | .map_err(Error::msg)?, 463 | FileHistoryEntryBuilder::test_default() 464 | .emails("x@smith.com") 465 | .times(first_day + 3 * one_day_in_secs) 466 | .author(User::new(Some("Why"), Some("y@smith.com"))) 467 | .id("2222") 468 | .build() 469 | .map_err(Error::msg)?, 470 | ]; 471 | 472 | let mut dictionary = GitUserDictionary::default(); 473 | 474 | let today = first_day + 5 * one_day_in_secs; 475 | 476 | let stats = GitHistories::stats_from_history(&mut dictionary, today, &events); 477 | 478 | let jo_set: BTreeSet<usize> = vec![0].into_iter().collect(); 479 | let xy_set: BTreeSet<usize> = vec![1, 2].into_iter().collect(); 480 | let jo_y_set: BTreeSet<usize> = vec![0, 1].into_iter().collect(); 481 | 482 | let expected_details: Vec<GitDetails> = vec![ 483 | GitDetails { 484 | commit_day: 86400, 485 | users: jo_set.clone(), 486 | commits: 1, 487 | lines_added: 0, 488 | lines_deleted: 0, 489 | }, 490 | GitDetails { 491 | commit_day: 86400, 492 | users: jo_y_set.clone(), 493 | commits: 1, 494 | lines_added: 0, 495 | lines_deleted: 0, 496 | }, 497 | GitDetails { 498 | commit_day: 345_600, 499 | users: xy_set.clone(), 500 | commits: 1, 501 | lines_added: 0, 502 | lines_deleted: 0, 503 | }, 504 | ]; 505 | 506 | let expected_activity: Vec<GitActivity> = vec![ 507 | GitActivity { 508 | author_time: 86400, 509 | 
commit_time: 86400, 510 | users: jo_set, 511 | change: CommitChange::Add, 512 | lines_added: 0, 513 | lines_deleted: 0, 514 | }, 515 | GitActivity { 516 | author_time: 86400, 517 | commit_time: 86400, 518 | users: jo_y_set, 519 | change: CommitChange::Add, 520 | lines_added: 0, 521 | lines_deleted: 0, 522 | }, 523 | GitActivity { 524 | author_time: 345_600, 525 | commit_time: 345_600, 526 | users: xy_set, 527 | change: CommitChange::Add, 528 | lines_added: 0, 529 | lines_deleted: 0, 530 | }, 531 | ]; 532 | 533 | assert_eq!( 534 | stats, 535 | Some(GitData { 536 | last_update: first_day + 3 * one_day_in_secs, 537 | age_in_days: 2, 538 | creation_date: Some(86400), 539 | user_count: 3, 540 | users: vec![0, 1, 2], 541 | details: expected_details, 542 | activity: expected_activity, 543 | }) 544 | ); 545 | 546 | assert_eq!(dictionary.user_count(), 3); 547 | assert_eq!(dictionary.user_id(&USER_JO), Some(&0)); 548 | assert_eq!(dictionary.user_id(&USER_Y), Some(&1)); 549 | assert_eq!(dictionary.user_id(&USER_X), Some(&2)); 550 | 551 | Ok(()) 552 | } 553 | } 554 | -------------------------------------------------------------------------------- /src/git_logger.rs: -------------------------------------------------------------------------------- 1 | #![warn(clippy::all)] 2 | use crate::git_file_future::{FileNameChange, GitFileFutureRegistry}; 3 | use anyhow::Error; 4 | use git2::Revwalk; 5 | use git2::{Commit, Delta, DiffDelta, ObjectType, Odb, Oid, Patch, Repository, Tree}; 6 | use regex::Regex; 7 | use serde::{Deserialize, Serialize}; 8 | use std::cell::RefCell; 9 | use std::path::{Path, PathBuf}; 10 | use std::rc::Rc; 11 | use std::time::{Duration, SystemTime}; 12 | 13 | #[derive(Debug, Clone, Copy)] 14 | pub struct GitLogConfig { 15 | /// include merge commits in file stats - usually excluded by `git log` - see https://stackoverflow.com/questions/37801342/using-git-log-to-display-files-changed-during-merge 16 | include_merges: bool, 17 | /// earliest commit for filtering 18 | earliest_time: Option<u64>, 19 | } 20 | 21 | impl GitLogConfig { 22 | pub fn default() -> GitLogConfig { 23 | GitLogConfig { 24 | include_merges: false, 25 | earliest_time: None, 26 | } 27 | } 28 | 29 | #[allow(dead_code)] 30 | pub fn include_merges(self, include_merges: bool) -> GitLogConfig { 31 | let mut config = self; 32 | config.include_merges = include_merges; 33 | config 34 | } 35 | /// filter log by unix timestamp 36 | pub fn since(self, earliest_time: Option<u64>) -> GitLogConfig { 37 | let mut config = self; 38 | config.earliest_time = earliest_time; 39 | config 40 | } 41 | /// filter log by number of years before now 42 | pub fn since_years(self, years: Option<u64>) -> GitLogConfig { 43 | if let Some(years) = years { 44 | let years_ago = SystemTime::now() - Duration::from_secs(60 * 60 * 24 * 365 * years); 45 | let years_ago_secs = years_ago 46 | .duration_since(SystemTime::UNIX_EPOCH) 47 | .unwrap() 48 | .as_secs(); 49 | self.since(Some(years_ago_secs)) 50 | } else { 51 | self.since(None) 52 | } 53 | } 54 | } 55 | 56 | pub struct GitLog { 57 | /// repo work dir - always canonical 58 | workdir: PathBuf, 59 | repo: Repository, 60 | config: GitLogConfig, 61 | } 62 | 63 | pub struct GitLogIterator<'a> { 64 | git_log: &'a GitLog, 65 | odb: Odb<'a>, 66 | revwalk: Revwalk<'a>, 67 | // this is an RC as we need to use it after the iterator has been consumed 68 | git_file_future_registry: Rc<RefCell<GitFileFutureRegistry>>, 69 | } 70 | 71 | /// simplified user info - based on `git2::Signature` 72 | /// everything is derived, seems to work OK as the structure is so 
simple 73 | #[derive(Debug, PartialEq, Eq, Hash, Clone, PartialOrd, Ord, Serialize)] 74 | pub struct User { 75 | name: Option<String>, 76 | email: Option<String>, 77 | } 78 | 79 | impl User { 80 | pub fn new(name: Option<&str>, email: Option<&str>) -> User { 81 | User { 82 | name: name.map(std::borrow::ToOwned::to_owned), 83 | email: email.map(std::borrow::ToOwned::to_owned), 84 | } 85 | } 86 | 87 | pub fn as_lower_case(&self) -> User { 88 | User { 89 | name: self.name.as_ref().map(|s| s.to_lowercase()), 90 | email: self.email.as_ref().map(|s| s.to_lowercase()), 91 | } 92 | } 93 | } 94 | 95 | /// simplified commit log entry 96 | #[derive(Debug, Serialize, Clone, Getters)] 97 | pub struct GitLogEntry { 98 | id: String, 99 | summary: String, 100 | parents: Vec<String>, 101 | committer: User, 102 | commit_time: u64, 103 | author: User, 104 | author_time: u64, 105 | co_authors: Vec<User>, 106 | file_changes: Vec<FileChange>, 107 | } 108 | 109 | /// the various kinds of git change we care about - a serializable subset of `git2::Delta` 110 | #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq, Copy)] 111 | pub enum CommitChange { 112 | Add, 113 | Rename, 114 | Delete, 115 | Modify, 116 | Copied, 117 | } 118 | 119 | /// Stats for file changes 120 | #[derive(Debug, Serialize, Clone, Getters)] 121 | pub struct FileChange { 122 | file: PathBuf, 123 | old_file: Option<PathBuf>, 124 | change: CommitChange, 125 | lines_added: u64, 126 | lines_deleted: u64, 127 | } 128 | 129 | impl GitLog { 130 | pub fn workdir(&self) -> &Path { 131 | &self.workdir 132 | } 133 | 134 | pub fn new(start_dir: &Path, config: GitLogConfig) -> Result<GitLog, Error> { 135 | let repo = Repository::discover(start_dir)?; 136 | 137 | let workdir = repo 138 | .workdir() 139 | .ok_or_else(|| anyhow!("bare repository - no workdir"))? 140 | .canonicalize()?; 141 | 142 | debug!("work dir: {:?}", workdir); 143 | 144 | Ok(GitLog { 145 | workdir, 146 | repo, 147 | config, 148 | }) 149 | } 150 | 151 | pub fn iterator(&self) -> Result<GitLogIterator<'_>, Error> { 152 | let odb = self.repo.odb()?; 153 | let mut revwalk = self.repo.revwalk()?; 154 | revwalk.set_sorting(git2::Sort::TOPOLOGICAL)?; 155 | revwalk.push_head()?; 156 | Ok(GitLogIterator { 157 | git_log: self, 158 | odb, 159 | revwalk, 160 | git_file_future_registry: Rc::new(RefCell::new(GitFileFutureRegistry::new())), 161 | }) 162 | } 163 | } 164 | 165 | impl<'a> Iterator for GitLogIterator<'a> { 166 | type Item = Result<GitLogEntry, Error>; 167 | 168 | fn next(&mut self) -> Option<Self::Item> { 169 | let mut next_item = self.revwalk.next(); 170 | while next_item.is_some() { 171 | let oid = next_item.unwrap(); 172 | // this is a bit ugly - revwalk iterates over Result types, so some entries aren't Oids at all 173 | // but I want an error context, and it's easier to create it here than in all the spots later that might 174 | // return errors. 175 | let error_context = if let Ok(valid_oid) = oid { 176 | format!("Processing oid {:?}", valid_oid) 177 | } else { 178 | "Processing unknown oid from revwalk".to_string() 179 | }; 180 | let c = self.summarise_commit(oid); 181 | match c { 182 | Ok(Some(c)) => { 183 | let commit_in_range = self 184 | .git_log 185 | .config 186 | .earliest_time 187 | .map_or(true, |earliest| c.commit_time >= earliest); 188 | 189 | if commit_in_range { 190 | self.register_file_futures(&c); 191 | return Some(Ok(c)); 192 | } else { 193 | return None; // short circuit! 
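// Note on the short circuit above: ending iteration (rather than skipping this
// commit and continuing) assumes the topological walk yields newer commits
// before older ones, so everything remaining would also be out of range.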
194 | } 195 | } 196 | Ok(None) => {} 197 | Err(e) => return Some(Err(e.context(error_context))), 198 | }; 199 | next_item = self.revwalk.next(); 200 | } 201 | None 202 | } 203 | } 204 | 205 | impl<'a> GitLogIterator<'a> { 206 | pub fn git_file_future_registry(&self) -> Rc<RefCell<GitFileFutureRegistry>> { 207 | self.git_file_future_registry.clone() 208 | } 209 | 210 | /// registers renames and deletes 211 | fn register_file_futures(&mut self, entry: &GitLogEntry) { 212 | // TODO: probably should be using Oid not String globally, then this would be simpler: 213 | let parents: Vec<Oid> = entry 214 | .parents 215 | .iter() 216 | .map(|id| Oid::from_str(id).unwrap()) 217 | .collect(); 218 | let mut file_changes: Vec<(PathBuf, FileNameChange)> = Vec::new(); 219 | for file_change in &entry.file_changes { 220 | match file_change.change { 221 | CommitChange::Rename => { 222 | let old_name = file_change.old_file.as_ref().unwrap().clone(); 223 | let new_name = file_change.file.clone(); 224 | file_changes.push((old_name, FileNameChange::Renamed(new_name))); 225 | } 226 | CommitChange::Delete => { 227 | let name = file_change.file.clone(); 228 | file_changes.push((name, FileNameChange::Deleted())); 229 | } 230 | _ => (), 231 | } 232 | } 233 | self.git_file_future_registry.borrow_mut().register( 234 | &Oid::from_str(&entry.id).unwrap(), 235 | &parents, 236 | &file_changes, 237 | ); 238 | } 239 | 240 | /// Summarises a git commit 241 | /// returns Error if error, Ok(None) if the id was not actually a commit, or Ok(Some(entry)) if valid 242 | fn summarise_commit( 243 | &self, 244 | oid: Result<Oid, git2::Error>, 245 | ) -> Result<Option<GitLogEntry>, Error> { 246 | let oid = oid?; 247 | let kind = self.odb.read(oid)?.kind(); 248 | match kind { 249 | ObjectType::Commit => { 250 | let commit = self.git_log.repo.find_commit(oid)?; 251 | debug!("processing {:?}", commit); 252 | let author = commit.author(); 253 | let committer = commit.committer(); 254 | let author_time = author.when().seconds() as u64; 255 | let commit_time = committer.when().seconds() as u64; 256 | let other_time = commit.time().seconds() as u64; 257 | if commit_time != other_time { 258 | error!( 259 | "Commit {:?} time {:?} != commit time {:?}", 260 | commit, other_time, commit_time 261 | ); 262 | } 263 | let co_authors = if let Some(message) = commit.message() { 264 | find_coauthors(message) 265 | } else { 266 | Vec::new() 267 | }; 268 | 269 | let commit_tree = commit.tree()?; 270 | let file_changes = commit_file_changes( 271 | &self.git_log.repo, 272 | &commit, 273 | &commit_tree, 274 | self.git_log.config, 275 | ); 276 | Ok(Some(GitLogEntry { 277 | id: oid.to_string(), 278 | summary: commit.summary().unwrap_or("[no message]").to_string(), 279 | parents: commit.parent_ids().map(|p| p.to_string()).collect(), 280 | committer: signature_to_user(&committer), 281 | commit_time, 282 | author: signature_to_user(&author), 283 | author_time, 284 | co_authors, 285 | file_changes, 286 | })) 287 | } 288 | _ => { 289 | info!("ignoring object type: {}", kind); 290 | Ok(None) 291 | } 292 | } 293 | } 294 | } 295 | 296 | fn signature_to_user(signature: &git2::Signature<'_>) -> User { 297 | User { 298 | name: signature.name().map(std::borrow::ToOwned::to_owned), 299 | email: signature.email().map(std::borrow::ToOwned::to_owned), 300 | } 301 | } 302 | 303 | fn trim_string(s: &str) -> Option<&str> { 304 | let trimmed = s.trim(); 305 | if trimmed.is_empty() { 306 | None 307 | } else { 308 | Some(trimmed) 309 | } 310 | } 311 | 312 | fn find_coauthors(message: &str) -> Vec<User> { 313 | lazy_static! 
{ 314 | static ref CO_AUTH_LINE: Regex = Regex::new(r"(?m)^\s*Co-authored-by:(.*)$").unwrap(); 315 | static ref CO_AUTH_ANGLE_BRACKETS: Regex = Regex::new(r"^(.*)<([^>]+)>\s*$").unwrap(); 316 | } 317 | 318 | CO_AUTH_LINE 319 | .captures_iter(message) 320 | .map(|capture_group| { 321 | let co_author_text = &capture_group[1]; 322 | if let Some(co_author_bits) = CO_AUTH_ANGLE_BRACKETS.captures(co_author_text) { 323 | User::new( 324 | trim_string(co_author_bits.get(1).unwrap().as_str()), 325 | trim_string(co_author_bits.get(2).unwrap().as_str()), 326 | ) 327 | } else if co_author_text.contains('@') { 328 | // no angle brackets, but an @ 329 | User::new(None, trim_string(co_author_text)) 330 | } else { 331 | User::new(trim_string(co_author_text), None) 332 | } 333 | }) 334 | .collect() 335 | } 336 | 337 | fn commit_file_changes( 338 | repo: &Repository, 339 | commit: &Commit<'_>, 340 | commit_tree: &Tree<'_>, 341 | config: GitLogConfig, 342 | ) -> Vec<FileChange> { 343 | if commit.parent_count() == 0 { 344 | info!("Commit {} has no parent", commit.id()); 345 | 346 | scan_diffs(repo, commit_tree, None, commit, None).expect("Can't scan for diffs") 347 | } else if commit.parent_count() > 1 && !config.include_merges { 348 | debug!( 349 | "Not showing file changes for merge commit {:?}", 350 | commit.id() 351 | ); 352 | Vec::new() 353 | } else { 354 | commit 355 | .parents() 356 | .flat_map(|parent| { 357 | debug!("Getting changes for parent {:?}:", parent); 358 | let parent_tree = parent.tree().expect("can't get parent tree"); 359 | scan_diffs(repo, commit_tree, Some(&parent_tree), commit, Some(&parent)) 360 | .expect("Can't scan for diffs") 361 | }) 362 | .collect() 363 | } 364 | } 365 | 366 | fn scan_diffs( 367 | repo: &Repository, 368 | commit_tree: &Tree<'_>, 369 | parent_tree: Option<&Tree<'_>>, 370 | commit: &Commit<'_>, 371 | parent: Option<&Commit<'_>>, 372 | ) -> Result<Vec<FileChange>, Error> { 373 | let mut diff = repo.diff_tree_to_tree(parent_tree, Some(commit_tree), None)?; 374 | // Identify renames, None means default settings - see https://libgit2.org/libgit2/#HEAD/group/diff/git_diff_find_similar 375 | diff.find_similar(None)?; 376 | let file_changes = diff 377 | .deltas() 378 | .enumerate() 379 | .filter_map(|(delta_index, delta)| { 380 | // can we / should we get bytes for binary changes? Adds show as 0 lines. 
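// Patch::from_diff returns Ok(None) when no text patch exists for a delta
// (binary content, for instance); the fallback below records zero line
// counts instead of failing the whole scan.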
381 | let patch = 382 | Patch::from_diff(&diff, delta_index).expect("can't get a patch from a diff"); 383 | let (_, lines_added, lines_deleted) = if let Some(patch) = patch { 384 | patch 385 | .line_stats() 386 | .expect("Couldn't get line stats from a patch") 387 | } else { 388 | warn!("No patch possible diffing {:?} -> {:?}", commit, parent); 389 | (0, 0, 0) 390 | }; 391 | summarise_delta(&delta, lines_added as u64, lines_deleted as u64) 392 | }); 393 | Ok(file_changes.collect()) 394 | } 395 | 396 | fn summarise_delta( 397 | delta: &DiffDelta<'_>, 398 | lines_added: u64, 399 | lines_deleted: u64, 400 | ) -> Option<FileChange> { 401 | match delta.status() { 402 | Delta::Added => { 403 | let name = delta.new_file().path().unwrap(); 404 | Some(FileChange { 405 | file: name.to_path_buf(), 406 | old_file: None, 407 | change: CommitChange::Add, 408 | lines_added, 409 | lines_deleted, 410 | }) 411 | } 412 | Delta::Renamed => { 413 | let old_name = delta.old_file().path().unwrap(); 414 | let new_name = delta.new_file().path().unwrap(); 415 | Some(FileChange { 416 | file: new_name.to_path_buf(), 417 | old_file: Some(old_name.to_path_buf()), 418 | change: CommitChange::Rename, 419 | lines_added, 420 | lines_deleted, 421 | }) 422 | } 423 | Delta::Deleted => { 424 | let name = delta.old_file().path().unwrap(); 425 | Some(FileChange { 426 | file: name.to_path_buf(), 427 | old_file: None, 428 | change: CommitChange::Delete, 429 | lines_added, 430 | lines_deleted, 431 | }) 432 | } 433 | Delta::Modified => { 434 | let name = delta.new_file().path().unwrap(); 435 | Some(FileChange { 436 | file: name.to_path_buf(), 437 | old_file: None, 438 | change: CommitChange::Modify, 439 | lines_added, 440 | lines_deleted, 441 | }) 442 | } 443 | Delta::Copied => { 444 | let old_name = delta.old_file().path().unwrap(); 445 | let new_name = delta.new_file().path().unwrap(); 446 | Some(FileChange { 447 | file: new_name.to_path_buf(), 448 | old_file: Some(old_name.to_path_buf()), 449 | change: CommitChange::Copied, 450 | lines_added, 451 | lines_deleted, 452 | }) 453 | } 454 | _ => { 455 | error!("Not able to handle delta of status {:?}", delta.status()); 456 | None 457 | } 458 | } 459 | } 460 | 461 | #[cfg(test)] 462 | mod test { 463 | use super::*; 464 | use pretty_assertions::assert_eq; 465 | use serde_json::json; 466 | use tempfile::tempdir; 467 | use test_shared::{assert_eq_json_file, assert_eq_json_value, unzip_test_sample}; 468 | 469 | #[test] 470 | fn users_can_be_lowercased() { 471 | assert_eq!( 472 | User::new(Some("Fred"), Some("Fred@Gmail.com")).as_lower_case(), 473 | User::new(Some("fred"), Some("fred@gmail.com")) 474 | ); 475 | assert_eq!( 476 | User::new(None, Some("Fred@Gmail.com")).as_lower_case(), 477 | User::new(None, Some("fred@gmail.com")) 478 | ); 479 | assert_eq!( 480 | User::new(Some("Fred"), None).as_lower_case(), 481 | User::new(Some("fred"), None) 482 | ); 483 | assert_eq!(User::new(None, None).as_lower_case(), User::new(None, None)); 484 | } 485 | 486 | #[test] 487 | fn authorless_message_has_no_coauthors() { 488 | assert_eq!(find_coauthors("do be do be do"), Vec::<User>::new()); 489 | } 490 | 491 | #[test] 492 | fn can_get_coauthors_from_message() { 493 | let message = "This is a commit message 494 | not valid: Co-authored-by: fred jones 495 | Co-authored-by: valid user <valid@thing.com> 496 | Co-authored-by: White Space <handles_trailing_whitespace@any-domain.com> \t\r 497 | Co-authored-by: <be.lenient@any-domain.com> 498 | Co-authored-by: bad@user <this isn't really trying to be clever> 499 | ignore random lines 500 | Co-authored-by: if there's no at it's a name 501 | Co-authored-by: if there's an @ it's email@thing.com 502 | ignore 
trailing lines 503 | "; 504 | 505 | let expected = vec![ 506 | User::new(Some("valid user"), Some("valid@thing.com")), 507 | User::new( 508 | Some("White Space"), 509 | Some("handles_trailing_whitespace@any-domain.com"), 510 | ), 511 | User::new(None, Some("be.lenient@any-domain.com")), 512 | User::new( 513 | Some("bad@user"), 514 | Some("this isn't really trying to be clever"), 515 | ), 516 | User::new(Some("if there's no at it's a name"), None), 517 | User::new(None, Some("if there's an @ it's email@thing.com")), 518 | ]; 519 | 520 | assert_eq!(find_coauthors(message), expected); 521 | } 522 | 523 | #[test] 524 | fn can_extract_basic_git_log() -> Result<(), Error> { 525 | let gitdir = tempdir()?; 526 | let git_root = unzip_test_sample("git_sample", gitdir.path())?; 527 | let git_log = GitLog::new(&git_root, GitLogConfig::default())?; 528 | 529 | assert_eq!(git_log.workdir.canonicalize()?, git_root.canonicalize()?); 530 | 531 | let err_count = git_log.iterator()?.filter(|x| Result::is_err(x)).count(); 532 | assert_eq!(err_count, 0); 533 | 534 | let entries: Vec<_> = git_log.iterator()?.filter_map(Result::ok).collect(); 535 | 536 | assert_eq_json_file(&entries, "./tests/expected/git/git_sample.json"); 537 | 538 | Ok(()) 539 | } 540 | 541 | #[test] 542 | fn git_log_can_include_merge_changes() -> Result<(), Error> { 543 | let gitdir = tempdir()?; 544 | let git_root = unzip_test_sample("git_sample", gitdir.path())?; 545 | 546 | let git_log = GitLog::new(&git_root, GitLogConfig::default().include_merges(true))?; 547 | 548 | let err_count = git_log.iterator()?.filter(Result::is_err).count(); 549 | assert_eq!(err_count, 0); 550 | 551 | let entries: Vec<_> = git_log.iterator()?.filter_map(Result::ok).collect(); 552 | 553 | assert_eq_json_file(&entries, "./tests/expected/git/git_sample_with_merges.json"); 554 | 555 | Ok(()) 556 | } 557 | 558 | #[allow(clippy::unreadable_literal)] 559 | #[test] 560 | fn git_log_can_limit_to_recent_history() -> Result<(), Error> { 561 | let gitdir = tempdir()?; 562 | let git_root = unzip_test_sample("git_sample", gitdir.path())?; 563 | 564 | let git_log = GitLog::new(&git_root, GitLogConfig::default().since(Some(1558521694)))?; 565 | 566 | let err_count = git_log.iterator()?.filter(Result::is_err).count(); 567 | assert_eq!(err_count, 0); 568 | 569 | let ids: Vec<_> = git_log 570 | .iterator()? 
571 | .filter_map(Result::ok) 572 | .map(|h| (h.summary.clone(), h.commit_time)) 573 | .collect(); 574 | assert_eq!( 575 | ids, 576 | vec![ 577 | ("renaming".to_owned(), 1558533240u64), 578 | ("just changed parent.clj".to_owned(), 1558524371u64), 579 | ("Merge branch \'fiddling\'".to_owned(), 1558521695u64) 580 | ] 581 | ); 582 | 583 | Ok(()) 584 | } 585 | 586 | #[test] 587 | fn git_log_tracks_renames() -> Result<(), Error> { 588 | let gitdir = tempdir()?; 589 | let git_root = unzip_test_sample("rename_simple", gitdir.path())?; 590 | 591 | let git_log = GitLog::new(&git_root, GitLogConfig::default())?; 592 | 593 | let err_count = git_log.iterator()?.filter(Result::is_err).count(); 594 | assert_eq!(err_count, 0); 595 | 596 | let mut entries: Vec<_> = git_log.iterator()?.filter_map(Result::ok).collect(); 597 | entries.sort_by(|a, b| a.author_time.cmp(&b.author_time)); 598 | 599 | let changes: Vec<String> = entries.iter().map(|entry| entry.summary.clone()).collect(); 600 | 601 | assert_eq!( 602 | changes, 603 | vec![ 604 | "initial commit", 605 | "unrelated commit", 606 | "moving a to c", 607 | "moving and renaming" 608 | ] 609 | ); 610 | 611 | let file_changes: Vec<Vec<FileChange>> = entries 612 | .iter() 613 | .map(|entry| { 614 | let mut entries = entry.file_changes.clone(); 615 | entries.sort_by(|a, b| a.file.cmp(&b.file)); 616 | entries 617 | }) 618 | .collect(); 619 | 620 | assert_eq_json_value( 621 | &file_changes, 622 | &json!([ 623 | [{"change":"Add", 624 | "file":"a.txt", 625 | "lines_added": 4, 626 | "lines_deleted": 0, 627 | "old_file": null} 628 | ], 629 | [{"change":"Add", 630 | "file":"b.txt", 631 | "lines_added": 1, 632 | "lines_deleted": 0, 633 | "old_file": null} 634 | ], 635 | [{"change":"Rename", 636 | "file":"c.txt", 637 | "lines_added": 0, 638 | "lines_deleted": 0, 639 | "old_file": "a.txt"} 640 | ], 641 | [{"change":"Rename", 642 | "file":"d.txt", 643 | "lines_added": 1, 644 | "lines_deleted": 0, 645 | "old_file": "c.txt"} 646 | ] 647 | ] 648 | ), 649 | ); 650 | 651 | Ok(()) 652 | } 653 | } 654 | /* 655 |