├── TODO ├── CODEOWNERS ├── shallow-explore ├── .gitignore ├── go.mod ├── LICENSE ├── utils ├── explore_test.go └── explore.go ├── go.sum ├── README.md └── main.go /TODO: -------------------------------------------------------------------------------- 1 | - Unit tests 2 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @tmickleydoyle 2 | -------------------------------------------------------------------------------- /shallow-explore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmickleydoyle/shallow-explore/HEAD/shallow-explore -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | 8 | # Test binary, built with `go test -c` 9 | *.test 10 | 11 | # Output of the go coverage tool, specifically when used with LiteIDE 12 | *.out 13 | 14 | # Dependency directories (remove the comment below to include it) 15 | # vendor/ 16 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/tmickleydoyle/shallow-explore 2 | 3 | go 1.17 4 | 5 | require ( 6 | github.com/charmbracelet/lipgloss v0.7.1 7 | github.com/guptarohit/asciigraph v0.5.5 8 | ) 9 | 10 | require ( 11 | github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect 12 | github.com/lucasb-eyer/go-colorful v1.2.0 // indirect 13 | github.com/mattn/go-isatty v0.0.19 // indirect 14 | github.com/mattn/go-runewidth v0.0.14 // indirect 15 | github.com/muesli/reflow v0.3.0 // indirect 16 | github.com/muesli/termenv v0.15.1 // indirect 17 | github.com/rivo/uniseg v0.4.4 // indirect 18 | 
golang.org/x/sys v0.9.0 // indirect 19 | ) 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Thomas Mickley-Doyle 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /utils/explore_test.go: -------------------------------------------------------------------------------- 1 | package explore 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestConvertStringToInt(t *testing.T) { 8 | intArray, origArray := ConvertStringToInt([]string{"1", "2", "3"}) 9 | 10 | if len(intArray) == 0 { 11 | t.Errorf("Array could not convert strings to numbers") 12 | } 13 | if len(origArray) == 0 { 14 | t.Errorf("Original array did not return: ") 15 | } 16 | 17 | stringArray, _ := ConvertStringToInt([]string{"one", "two", "three"}) 18 | 19 | if len(stringArray) != 0 { 20 | t.Errorf("Incorrectly converted string to float") 21 | } 22 | } 23 | 24 | func TestMinMaxValues(t *testing.T) { 25 | minValue, maxValue := MinMaxValues([]float64{1.0, 2.0, 3.0}) 26 | 27 | if minValue != 1 { 28 | t.Errorf("Min value is not calculated properly") 29 | } 30 | if maxValue != 3 { 31 | t.Errorf("Min value is not calculated properly") 32 | } 33 | } 34 | 35 | func TestMeanValue(t *testing.T) { 36 | meanValue := MeanValue([]float64{1.0, 2.0, 3.0}) 37 | 38 | if meanValue != 2 { 39 | t.Errorf("Mean value is not calculated properly") 40 | } 41 | } 42 | 43 | func TestMedianValue(t *testing.T) { 44 | medianValue := MedianValue([]float64{1.0, 2.0, 3.0}) 45 | 46 | if medianValue != 2 { 47 | t.Errorf("Mean value is not calculated properly") 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k= 2 | github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8= 3 | github.com/charmbracelet/lipgloss v0.4.0 h1:768h64EFkGUr8V5yAKV7/Ta0NiVceiPaV+PphaW1K9g= 4 | github.com/charmbracelet/lipgloss v0.4.0/go.mod 
h1:vmdkHvce7UzX6xkyf4cca8WlwdQ5RQr8fzta+xl7BOM= 5 | github.com/charmbracelet/lipgloss v0.7.1 h1:17WMwi7N1b1rVWOjMT+rCh7sQkvDU75B2hbZpc5Kc1E= 6 | github.com/charmbracelet/lipgloss v0.7.1/go.mod h1:yG0k3giv8Qj8edTCbbg6AlQ5e8KNWpFujkNawKNhE2c= 7 | github.com/guptarohit/asciigraph v0.5.2 h1:aG4kATuuyHQMdTi89KKVIRIcDSIHrsKIozo/UsUE5AM= 8 | github.com/guptarohit/asciigraph v0.5.2/go.mod h1:dYl5wwK4gNsnFf9Zp+l06rFiDZ5YtXM6x7SRWZ3KGag= 9 | github.com/guptarohit/asciigraph v0.5.5 h1:ccFnUF8xYIOUPPY3tmdvRyHqmn1MYI9iv1pLKX+/ZkQ= 10 | github.com/guptarohit/asciigraph v0.5.5/go.mod h1:dYl5wwK4gNsnFf9Zp+l06rFiDZ5YtXM6x7SRWZ3KGag= 11 | github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY= 12 | github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0= 13 | github.com/mattn/go-isatty v0.0.13 h1:qdl+GuBjcsKKDco5BsxPJlId98mSWNKqYA+Co0SC1yA= 14 | github.com/mattn/go-isatty v0.0.13/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU= 15 | github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA= 16 | github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= 17 | github.com/mattn/go-runewidth v0.0.10/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk= 18 | github.com/mattn/go-runewidth v0.0.12/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk= 19 | github.com/mattn/go-runewidth v0.0.13 h1:lTGmDsbAYt5DmK6OnoV7EuIF1wEIFAcxld6ypU4OSgU= 20 | github.com/mattn/go-runewidth v0.0.13/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= 21 | github.com/mattn/go-runewidth v0.0.14 h1:+xnbZSEeDbOIg5/mE6JF0w6n9duR1l3/WmbinWVwUuU= 22 | github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= 23 | github.com/muesli/reflow v0.2.1-0.20210115123740-9e1d0d53df68 h1:y1p/ycavWjGT9FnmSjdbWUlLGvcxrY0Rw3ATltrxOhk= 24 | github.com/muesli/reflow v0.2.1-0.20210115123740-9e1d0d53df68/go.mod h1:Xk+z4oIWdQqJzsxyjgl3P22oYZnHdZ8FFTHAQQt5BMQ= 
25 | github.com/muesli/reflow v0.3.0 h1:IFsN6K9NfGtjeggFP+68I4chLZV2yIKsXJFNZ+eWh6s= 26 | github.com/muesli/reflow v0.3.0/go.mod h1:pbwTDkVPibjO2kyvBQRBxTWEEGDGq0FlB1BIKtnHY/8= 27 | github.com/muesli/termenv v0.9.0 h1:wnbOaGz+LUR3jNT0zOzinPnyDaCZUQRZj9GxK8eRVl8= 28 | github.com/muesli/termenv v0.9.0/go.mod h1:R/LzAKf+suGs4IsO95y7+7DpFHO0KABgnZqtlyx2mBw= 29 | github.com/muesli/termenv v0.15.1 h1:UzuTb/+hhlBugQz28rpzey4ZuKcZ03MeKsoG7IJZIxs= 30 | github.com/muesli/termenv v0.15.1/go.mod h1:HeAQPTzpfs016yGtA4g00CsdYnVLJvxsS4ANqrZs2sQ= 31 | github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= 32 | github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY= 33 | github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= 34 | github.com/rivo/uniseg v0.4.4 h1:8TfxU8dW6PdqD27gjM8MVNuicgxIjxpm4K7x4jp8sis= 35 | github.com/rivo/uniseg v0.4.4/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= 36 | golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 37 | golang.org/x/sys v0.0.0-20210119212857-b64e53b001e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 38 | golang.org/x/sys v0.1.0 h1:kunALQeHf1/185U1i0GOB/fy1IPRDDpuoOOqRReG57U= 39 | golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 40 | golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 41 | golang.org/x/sys v0.9.0 h1:KS/R3tvhPqvJvwcKfnBHJwwthS11LRhmM5D59eEXa0s= 42 | golang.org/x/sys v0.9.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 43 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # shallow-explore 2 | From the command line, quickly explore data from a CSV file. 3 | 4 | `shallow-explore` is a [Golang](https://go.dev/) backed command-line tool for iterating over columns from a CSV file. 
This is a gut check tool to make sure the assumptions about the data are within the expected range of normal. 5 | 6 | ## How-To 7 | 8 | After installation, run the following command to start analyzing data: 9 | 10 | ```bash 11 | # Style (default): light mode 12 | shallow-explore -csv ~/complete/path/to/file/sample.csv 13 | 14 | # Style: dark mode 15 | shallow-explore -csv ~/complete/path/to/file/sample.csv -style dark 16 | 17 | # Style: light mode 18 | shallow-explore -csv ~/complete/path/to/file/sample.csv -style light 19 | ``` 20 | 21 | Note: The complete path of the file is required to load the data into the program. 22 | 23 | ### New Features 24 | 25 | #### Data Summary 26 | Generate a comprehensive summary of your CSV file with statistics about each column: 27 | 28 | ```bash 29 | shallow-explore -csv ~/path/to/file/sample.csv -summary 30 | ``` 31 | 32 | #### Data Filtering 33 | Filter data based on column values: 34 | 35 | ```bash 36 | # Filter rows where the "Age" column equals "30" 37 | shallow-explore -csv ~/path/to/file/sample.csv -filter true -column "Age" -condition equals -value "30" 38 | 39 | # Available conditions: equals, contains, greater_than, less_than, starts_with, ends_with 40 | shallow-explore -csv ~/path/to/file/sample.csv -filter true -column "Name" -condition contains -value "Smith" 41 | ``` 42 | 43 | #### Export to JSON 44 | Export your CSV data to JSON format: 45 | 46 | ```bash 47 | shallow-explore -csv ~/path/to/file/sample.csv -export-json -export-path "output.json" 48 | 49 | # The export-path is optional. 
If not provided, a timestamped filename will be generated 50 | shallow-explore -csv ~/path/to/file/sample.csv -export-json 51 | ``` 52 | 53 | #### Data Correlation 54 | Calculate the correlation between two numeric columns: 55 | 56 | ```bash 57 | shallow-explore -csv ~/path/to/file/sample.csv -correlate -col1 "Height" -col2 "Weight" 58 | ``` 59 | 60 | #### Anomaly Detection 61 | Detect anomalies in numeric columns using the Z-score method: 62 | 63 | ```bash 64 | # Default threshold is 3.0 65 | shallow-explore -csv ~/path/to/file/sample.csv -anomalies 66 | 67 | # Custom threshold 68 | shallow-explore -csv ~/path/to/file/sample.csv -anomalies -threshold 2.5 69 | ``` 70 | 71 | ### Output 72 | 73 | `shallow-explore` supports three types of data: integers, floats, and strings. 74 | 75 | The following output is an example of an integer or float column. The column name appears at the top of the frame, followed by a summary line graph of the items and some quick statistics about the data. 76 | 77 | Screen Shot 2022-01-11 at 8 31 11 PM 78 | 79 | For string-based data, the column name is still at the top of the output. Below the column name lives a horizontal histogram and a count of unique entities found in the column. 80 | 81 | Screen Shot 2022-01-11 at 8 30 47 PM 82 | 83 | ## Installation 84 | 85 | If Golang is installed, run the following command: 86 | 87 | ```bash 88 | go install github.com/tmickleydoyle/shallow-explore@latest 89 | ``` 90 | 91 | ## Instructions for Installing Go 92 | 93 | [Go docs](https://go.dev/) 94 | 95 | ### Installation with Homebrew 96 | 97 | ```bash 98 | brew install go 99 | ``` 100 | 101 | ## Why I Built This Tool 102 | 103 | I find myself running and rerunning the same basic statistical analysis on data to get an understanding of how trends are moving. I figured why not make it easier and share it with everyone else! 
I hope this speeds up your decision making :heart: 104 | 105 | ## Feature Overview 106 | 107 | - **CSV Data Exploration**: Visualize and analyze CSV data with automatic recognition of data types 108 | - **Data Summary**: Get a comprehensive overview of your CSV data with column types and completeness percentages 109 | - **Data Filtering**: Filter CSV data based on various conditions to focus on specific subsets 110 | - **JSON Export**: Export your CSV data to JSON format for use in other applications 111 | - **Data Correlation**: Calculate Pearson correlation coefficients between numeric columns 112 | - **Anomaly Detection**: Find outliers in numeric data using Z-score method 113 | - **Customizable Display**: Choose between light and dark mode for better visibility 114 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "log" 7 | "path/filepath" 8 | "strings" 9 | "time" 10 | 11 | "github.com/guptarohit/asciigraph" 12 | "github.com/charmbracelet/lipgloss" 13 | explore "github.com/tmickleydoyle/shallow-explore/utils" 14 | ) 15 | 16 | var ( 17 | file string 18 | path string 19 | style string 20 | filter string 21 | column string 22 | condition string 23 | value string 24 | exportJson bool 25 | exportPath string 26 | correlate bool 27 | col1 string 28 | col2 string 29 | summary bool 30 | anomalies bool 31 | threshold float64 32 | ) 33 | 34 | var styleDark = lipgloss.NewStyle(). 35 | Bold(true). 36 | Foreground(lipgloss.Color("#FAFAFA")). 37 | Background(lipgloss.Color("#808080")). 38 | PaddingTop(1). 39 | PaddingBottom(1). 40 | PaddingLeft(2). 41 | PaddingRight(2). 42 | BorderStyle(lipgloss.NormalBorder()). 43 | BorderForeground(lipgloss.Color("#FAFAFA")) 44 | 45 | var styleLight = lipgloss.NewStyle(). 46 | Bold(true). 47 | Foreground(lipgloss.Color("#808080")). 
48 | Background(lipgloss.Color("#FAFAFA")). 49 | PaddingTop(1). 50 | PaddingBottom(1). 51 | PaddingLeft(2). 52 | PaddingRight(2). 53 | BorderStyle(lipgloss.NormalBorder()). 54 | BorderForeground(lipgloss.Color("#808080")) 55 | 56 | func main() { 57 | // Basic flags 58 | flag.StringVar(&file, "csv", "", "path to CSV file") 59 | flag.StringVar(&style, "style", "", "output style (dark or light)") 60 | 61 | // Data filtering flags 62 | flag.StringVar(&filter, "filter", "", "enable filtering mode") 63 | flag.StringVar(&column, "column", "", "column to filter on") 64 | flag.StringVar(&condition, "condition", "equals", "filter condition (equals, contains, greater_than, less_than, starts_with, ends_with)") 65 | flag.StringVar(&value, "value", "", "filter value to compare against") 66 | 67 | // JSON export flags 68 | flag.BoolVar(&exportJson, "export-json", false, "export data to JSON") 69 | flag.StringVar(&exportPath, "export-path", "", "path for exported JSON file") 70 | 71 | // Correlation flags 72 | flag.BoolVar(&correlate, "correlate", false, "calculate correlation between two columns") 73 | flag.StringVar(&col1, "col1", "", "first column for correlation") 74 | flag.StringVar(&col2, "col2", "", "second column for correlation") 75 | 76 | // Summary flag 77 | flag.BoolVar(&summary, "summary", false, "generate summary of CSV file") 78 | 79 | // Anomaly detection 80 | flag.BoolVar(&anomalies, "anomalies", false, "detect anomalies in numeric columns") 81 | flag.Float64Var(&threshold, "threshold", 3.0, "z-score threshold for anomaly detection (default: 3.0)") 82 | 83 | flag.Parse() 84 | 85 | if file == "" { 86 | log.Fatal("Could not find the path to the CSV file") 87 | } 88 | path = file 89 | records := explore.ReadCSVFile(path) 90 | 91 | // Set default style 92 | var selectedStyle lipgloss.Style 93 | if style == "dark" { 94 | selectedStyle = styleDark 95 | } else { 96 | selectedStyle = styleLight 97 | } 98 | 99 | // Handle summary mode 100 | if summary { 101 | summaryData 
:= explore.CSVSummary(records) 102 | fmt.Println(selectedStyle.Render(fmt.Sprintf("CSV Summary for: %s\n", path))) 103 | 104 | // Display summary information 105 | fmt.Println(selectedStyle.Render(fmt.Sprintf("Total Rows: %d\n", summaryData["total_rows"]))) 106 | fmt.Println(selectedStyle.Render(fmt.Sprintf("Total Columns: %d\n", summaryData["total_columns"]))) 107 | 108 | // Display column names and types 109 | fmt.Println(selectedStyle.Render("Column Information:")) 110 | columnTypes := summaryData["column_types"].(map[string]string) 111 | completeness := summaryData["completeness"].(map[string]float64) 112 | 113 | for _, colName := range summaryData["column_names"].([]string) { 114 | fmt.Println(selectedStyle.Render(fmt.Sprintf(" %s (Type: %s, Completeness: %.1f%%)", 115 | colName, columnTypes[colName], completeness[colName]))) 116 | } 117 | return 118 | } 119 | 120 | // Handle filtering 121 | if filter != "" && column != "" && value != "" { 122 | records = explore.FilterCSVData(records, column, condition, value) 123 | fmt.Println(selectedStyle.Render(fmt.Sprintf("Filtered data where %s %s %s (Found %d records)\n", 124 | column, condition, value, len(records)-1))) 125 | } 126 | 127 | // Handle JSON export 128 | if exportJson { 129 | if exportPath == "" { 130 | // Generate default filename based on input file if not provided 131 | base := filepath.Base(path) 132 | baseName := strings.TrimSuffix(base, filepath.Ext(base)) 133 | exportPath = baseName + "_" + time.Now().Format("20060102_150405") + ".json" 134 | } 135 | 136 | err := explore.ExportToJSON(records, exportPath) 137 | if err != nil { 138 | log.Fatalf("Failed to export to JSON: %v", err) 139 | } 140 | 141 | fmt.Println(selectedStyle.Render(fmt.Sprintf("Data exported to JSON: %s\n", exportPath))) 142 | return 143 | } 144 | 145 | // Handle correlation calculation 146 | if correlate && col1 != "" && col2 != "" { 147 | correlation, err := explore.CalculateCorrelation(records, col1, col2) 148 | if err != nil { 
149 | log.Fatalf("Failed to calculate correlation: %v", err) 150 | } 151 | 152 | fmt.Println(selectedStyle.Render(fmt.Sprintf("Correlation between '%s' and '%s': %.4f\n", 153 | col1, col2, correlation))) 154 | 155 | // Interpretation guide 156 | var interpretation string 157 | corrAbs := math.Abs(correlation) 158 | 159 | if corrAbs >= 0.9 { 160 | interpretation = "Very strong relationship" 161 | } else if corrAbs >= 0.7 { 162 | interpretation = "Strong relationship" 163 | } else if corrAbs >= 0.5 { 164 | interpretation = "Moderate relationship" 165 | } else if corrAbs >= 0.3 { 166 | interpretation = "Weak relationship" 167 | } else { 168 | interpretation = "Little to no relationship" 169 | } 170 | 171 | if correlation < 0 { 172 | interpretation += " (negative/inverse)" 173 | } else { 174 | interpretation += " (positive/direct)" 175 | } 176 | 177 | fmt.Println(selectedStyle.Render("Interpretation: " + interpretation)) 178 | return 179 | } 180 | 181 | // Handle anomaly detection 182 | if anomalies { 183 | anomalousRows := explore.DetectAnomalies(records, threshold) 184 | 185 | if len(anomalousRows) == 0 { 186 | fmt.Println(selectedStyle.Render(fmt.Sprintf("No anomalies detected with threshold %.1f\n", threshold))) 187 | } else { 188 | fmt.Println(selectedStyle.Render(fmt.Sprintf("Anomalies detected with threshold %.1f:\n", threshold))) 189 | 190 | for colName, rows := range anomalousRows { 191 | fmt.Println(selectedStyle.Render(fmt.Sprintf("Column '%s': %d anomalies found", colName, len(rows)))) 192 | 193 | // Show first 5 anomalies at most 194 | displayRows := rows 195 | if len(rows) > 5 { 196 | displayRows = rows[:5] 197 | } 198 | 199 | // Get column index 200 | colIndex := -1 201 | for i, col := range records[0] { 202 | if col == colName { 203 | colIndex = i 204 | break 205 | } 206 | } 207 | 208 | for _, rowIdx := range displayRows { 209 | if colIndex < len(records[rowIdx]) { 210 | fmt.Println(selectedStyle.Render(fmt.Sprintf(" Row %d: %s", rowIdx, 
records[rowIdx][colIndex]))) 211 | } 212 | } 213 | 214 | if len(rows) > 5 { 215 | fmt.Println(selectedStyle.Render(fmt.Sprintf(" ... and %d more anomalies", len(rows)-5))) 216 | } 217 | } 218 | } 219 | return 220 | } 221 | 222 | // Default behavior: explore each column 223 | for column := range records[0] { 224 | colValues := []string{} 225 | 226 | for i := 1; i < len(records); i++ { 227 | colValues = append(colValues, records[i][column]) 228 | } 229 | 230 | transformedArray, _ := explore.ConvertStringToInt(colValues) 231 | plotArray, stringValues := explore.ConvertStringToInt(colValues) 232 | column := fmt.Sprintf("Column: %s\n\n", records[0][column]) 233 | 234 | if len(transformedArray) > 0 { 235 | min, max := explore.MinMaxValues(transformedArray) 236 | mean := explore.MeanValue(transformedArray) 237 | median := explore.MedianValue(transformedArray) 238 | statsOutput := explore.FloatOutput(min, max, mean, median) 239 | graph := asciigraph.Plot(plotArray, asciigraph.Height(20), asciigraph.Width(90), asciigraph.Caption(statsOutput)) 240 | fmt.Println(selectedStyle.Render(column + graph)) 241 | } else { 242 | valuesMap := explore.CountValues(stringValues) 243 | sortedMap := explore.SortMapByValue(valuesMap) 244 | histogram := explore.HistTopTen(sortedMap, column) 245 | fmt.Println(selectedStyle.Render(column + histogram)) 246 | } 247 | } 248 | } 249 | -------------------------------------------------------------------------------- /utils/explore.go: -------------------------------------------------------------------------------- 1 | package explore 2 | 3 | import ( 4 | "encoding/csv" 5 | "fmt" 6 | "log" 7 | "math" 8 | "sort" 9 | "strconv" 10 | "strings" 11 | "bytes" 12 | "encoding/json" 13 | "io" 14 | "io/ioutil" 15 | "net/http" 16 | "os" 17 | "time" 18 | ) 19 | 20 | type Sorted struct { 21 | Key string 22 | Value int 23 | } 24 | 25 | type SortedList []Sorted 26 | 27 | func (p SortedList) Len() int { return len(p) } 28 | func (p SortedList) Swap(i, j int) { p[i], 
// ReadCSVFile opens the file at filePath and returns every record parsed by
// encoding/csv, header row included. If the file cannot be opened or parsed
// the process is terminated through log.Fatalf.
func ReadCSVFile(filePath string) [][]string {
	f, err := os.Open(filePath)
	if err != nil {
		// Fatalf with an explicit separator: log.Fatal joined the path and
		// the error with nothing in between.
		log.Fatalf("Unable to read input file %s: %v", filePath, err)
	}
	defer f.Close()

	records, err := csv.NewReader(f).ReadAll()
	if err != nil {
		log.Fatalf("Unable to parse file as CSV for %s: %v", filePath, err)
	}

	return records
}

// ConvertStringToInt parses every element of stringArray as a float64.
//
// The first result holds the values that parsed, in input order; elements
// that do not parse are skipped, so it may be shorter than the input and is
// nil when nothing parses. The second result is stringArray itself.
func ConvertStringToInt(stringArray []string) ([]float64, []string) {
	var intArray []float64

	for _, s := range stringArray {
		// ParseFloat already accepts every base-10 integer literal, so a
		// separate ParseInt fallback could never succeed where this fails.
		if num, err := strconv.ParseFloat(s, 64); err == nil {
			intArray = append(intArray, num)
		}
	}

	return intArray, stringArray
}

// MinMaxValues returns the smallest and largest value of intArray, in that
// order. An empty slice yields 0, 0.
func MinMaxValues(intArray []float64) (float64, float64) {
	if len(intArray) == 0 {
		return 0, 0
	}

	lowest, highest := intArray[0], intArray[0]
	for _, v := range intArray[1:] {
		if v < lowest {
			lowest = v
		}
		if v > highest {
			highest = v
		}
	}
	return lowest, highest
}

// MeanValue returns the arithmetic mean of intArray, or 0 for an empty slice.
// The result is not rounded, so the mean of 1 and 2 is 1.5.
func MeanValue(intArray []float64) float64 {
	if len(intArray) == 0 {
		return 0
	}

	total := 0.0
	for _, v := range intArray {
		total += v
	}

	return total / float64(len(intArray))
}

// MedianValue returns the median of intArray: the middle element of the
// sorted values, or the average of the two middle elements when the length
// is even. An empty slice yields 0. The caller's slice is left untouched.
func MedianValue(intArray []float64) float64 {
	if len(intArray) == 0 {
		return 0
	}

	// Sort a copy so the caller keeps its original ordering.
	sorted := make([]float64, len(intArray))
	copy(sorted, intArray)
	sort.Float64s(sorted)

	mid := len(sorted) / 2
	if len(sorted)%2 != 0 {
		return sorted[mid]
	}

	return (sorted[mid-1] + sorted[mid]) / 2
}
| finalOutput := fmt.Sprintf(outputString, min, max, mean, median) 107 | return finalOutput 108 | } 109 | 110 | func CountValues(stringArray []string) map[string]int { 111 | valuesMap := make(map[string]int) 112 | 113 | for _, v := range stringArray { 114 | v := string(v) 115 | if _, ok := valuesMap[v]; ok { 116 | valuesMap[v] = valuesMap[v] + 1 117 | } else { 118 | valuesMap[v] = 1 119 | } 120 | } 121 | 122 | return valuesMap 123 | } 124 | 125 | func SortMapByValue(valueMap map[string]int) SortedList { 126 | sortedMap := make(SortedList, len(valueMap)) 127 | 128 | i := 0 129 | for k, v := range valueMap { 130 | sortedMap[i] = Sorted{k, v} 131 | i++ 132 | } 133 | 134 | sort.Sort(sortedMap) 135 | 136 | return sortedMap 137 | } 138 | 139 | func HistTopTen(sortedList SortedList, column string) string { 140 | var key string 141 | var barValue int 142 | max := float64(sortedList[0].Value) 143 | histString := "Horizontal Histogram" 144 | 145 | if len(sortedList) > 10 { 146 | histString = histString + " - Top Ten\n\n" 147 | } else { 148 | histString = histString + "\n\n" 149 | } 150 | 151 | i := 0 152 | for d := range sortedList { 153 | if i < 10 { 154 | if max > 75 { 155 | barValue = int((float64(sortedList[d].Value) / max) * 75) 156 | } else { 157 | barValue = sortedList[d].Value 158 | } 159 | bar := strings.Repeat("☐", barValue) 160 | if len(sortedList[d].Key) > 20 { 161 | key = sortedList[d].Key[:17] + "..." 
// ChatGPTRequest is the JSON body sent by CallChatGPTAPI.
type ChatGPTRequest struct {
	Message string `json:"message"`
}

// ChatGPTResponse is the JSON body CallChatGPTAPI decodes the reply from.
type ChatGPTResponse struct {
	Reply string `json:"reply"`
}

// CSVSummary generates a summary of the entire CSV file.
//
// records[0] is treated as the header row. The returned map holds:
//
//	"total_rows"    int                 number of rows after the header
//	"total_columns" int                 number of header fields
//	"column_names"  []string            the header row
//	"column_types"  map[string]string   "integer", "float" or "string",
//	                                    inferred from the first 10 data rows
//	"completeness"  map[string]float64  percentage of non-empty cells
//
// Empty input yields zero counts and empty collections; a file with only a
// header yields 0 completeness for every column. The maps are keyed by column
// name, so columns that share a name share one entry.
func CSVSummary(records [][]string) map[string]interface{} {
	summary := make(map[string]interface{})
	columnTypes := make(map[string]string)
	completeness := make(map[string]float64)

	if len(records) == 0 {
		// No header row: report an empty file with correctly typed values,
		// since callers type-assert each entry.
		summary["total_rows"] = 0
		summary["total_columns"] = 0
		summary["column_names"] = []string{}
		summary["column_types"] = columnTypes
		summary["completeness"] = completeness
		return summary
	}

	header := records[0]
	dataRows := records[1:]

	// Basic file info
	summary["total_rows"] = len(dataRows) // Excluding header
	summary["total_columns"] = len(header)
	summary["column_names"] = header

	// Only the first 10 data rows (or fewer) feed type inference.
	sampleCount := min(len(dataRows), 10)

	for i, colName := range header {
		samples := make([]string, 0, sampleCount)
		filledCount := 0

		for j, row := range dataRows {
			if i >= len(row) {
				continue // Row is too short to have this column
			}
			if j < sampleCount {
				samples = append(samples, row[i])
			}
			if row[i] != "" {
				filledCount++
			}
		}

		columnTypes[colName] = inferColumnType(samples)

		// Guard the division: with no data rows 0/0 would be NaN.
		if len(dataRows) > 0 {
			completeness[colName] = float64(filledCount) / float64(len(dataRows)) * 100
		} else {
			completeness[colName] = 0
		}
	}
	summary["column_types"] = columnTypes
	summary["completeness"] = completeness

	return summary
}

// min returns the smaller of a and b.
func min(a, b int) int {
	if a < b {
		return a
	}
	return b
}

// inferColumnType classifies samples as "integer" when every non-empty
// sample parses as a base-10 int64, "float" when every non-empty sample
// parses as a float64, and "string" otherwise. Empty samples are ignored;
// when there is no non-empty sample at all the type is "string".
func inferColumnType(samples []string) string {
	isInt := true
	isFloat := true
	sawValue := false

	for _, sample := range samples {
		if sample == "" {
			continue // Skip empty values for type inference
		}
		sawValue = true

		if _, err := strconv.ParseInt(sample, 10, 64); err != nil {
			isInt = false
		}
		if _, err := strconv.ParseFloat(sample, 64); err != nil {
			isFloat = false
		}

		if !isInt && !isFloat {
			break
		}
	}

	switch {
	case !sawValue:
		// Nothing to inspect: do not claim the column is numeric.
		return "string"
	case isInt:
		return "integer"
	case isFloat:
		return "float"
	default:
		return "string"
	}
}
// ExportToJSON exports CSV data to a JSON file.
//
// records[0] is the header row; every later row becomes one JSON object that
// maps each header to the row's cell, with "" for cells the row lacks. The
// array is written to outputPath with mode 0644. An error is returned when
// records has no header row, or when encoding or writing fails.
func ExportToJSON(records [][]string, outputPath string) error {
	if len(records) == 0 {
		return fmt.Errorf("no header row to export")
	}

	headers := records[0]
	// Non-nil so a header-only file encodes as [] rather than null.
	result := make([]map[string]string, 0, len(records)-1)

	// Convert each row to a map
	for _, record := range records[1:] {
		row := make(map[string]string, len(headers))
		for j, header := range headers {
			if j < len(record) {
				row[header] = record[j]
			} else {
				row[header] = ""
			}
		}
		result = append(result, row)
	}

	// Marshal to JSON
	jsonData, err := json.MarshalIndent(result, "", " ")
	if err != nil {
		return err
	}

	// Write to file
	return ioutil.WriteFile(outputPath, jsonData, 0644)
}

// CalculateCorrelation calculates the Pearson correlation between two
// numeric columns, identified by their header names in records[0].
//
// Only rows in which both cells parse as floats take part. An error is
// returned when records is empty, when either column is missing, or when
// fewer than two usable rows remain. When either column has no variation
// the correlation is reported as 0.
func CalculateCorrelation(records [][]string, column1 string, column2 string) (float64, error) {
	if len(records) == 0 {
		return 0, fmt.Errorf("column not found: no header row")
	}

	// Find column indices
	col1Index := -1
	col2Index := -1
	for i, col := range records[0] {
		if col == column1 {
			col1Index = i
		}
		if col == column2 {
			col2Index = i
		}
	}

	if col1Index == -1 {
		return 0, fmt.Errorf("column not found: %q", column1)
	}
	if col2Index == -1 {
		return 0, fmt.Errorf("column not found: %q", column2)
	}

	// Extract numeric values, keeping x and y aligned row by row
	var x []float64
	var y []float64

	for _, row := range records[1:] {
		if col1Index >= len(row) || col2Index >= len(row) {
			continue
		}
		val1, err1 := strconv.ParseFloat(row[col1Index], 64)
		val2, err2 := strconv.ParseFloat(row[col2Index], 64)
		if err1 == nil && err2 == nil {
			x = append(x, val1)
			y = append(y, val2)
		}
	}

	if len(x) < 2 {
		return 0, fmt.Errorf("insufficient numeric data for correlation")
	}

	// Calculate means
	xMean := 0.0
	yMean := 0.0
	for i := range x {
		xMean += x[i]
		yMean += y[i]
	}
	xMean /= float64(len(x))
	yMean /= float64(len(y))

	// Accumulate the covariance and both sums of squared deviations
	numerator := 0.0
	xDenom := 0.0
	yDenom := 0.0

	for i := range x {
		xDiff := x[i] - xMean
		yDiff := y[i] - yMean
		numerator += xDiff * yDiff
		xDenom += xDiff * xDiff
		yDenom += yDiff * yDiff
	}

	if xDenom == 0 || yDenom == 0 {
		return 0, nil // No variation in at least one variable
	}

	// Take the roots separately: the product xDenom*yDenom can overflow to
	// +Inf for large values even when each factor is finite.
	return numerator / (math.Sqrt(xDenom) * math.Sqrt(yDenom)), nil
}
| 458 | if stdDev == 0 { 459 | continue // Skip columns with no variation 460 | } 461 | 462 | // Find anomalies (values with Z-score above threshold) 463 | anomalousRows := []int{} 464 | for i, val := range values { 465 | zScore := math.Abs(val - mean) / stdDev 466 | if zScore > threshold { 467 | anomalousRows = append(anomalousRows, valueIndices[i]) 468 | } 469 | } 470 | 471 | if len(anomalousRows) > 0 { 472 | anomolies[colName] = anomalousRows 473 | } 474 | } 475 | 476 | return anomolies 477 | } 478 | 479 | func CallChatGPTAPI(message string, authToken string) (string, error) { 480 | payload := ChatGPTRequest{ 481 | Message: message, 482 | } 483 | 484 | payloadBytes, err := json.Marshal(payload) 485 | if err != nil { 486 | return "", err 487 | } 488 | 489 | client := &http.Client{} 490 | 491 | req, err := http.NewRequest("POST", "https://api.chatgpt.com/v1/chat/completions", bytes.NewBuffer(payloadBytes)) 492 | if err != nil { 493 | return "", err 494 | } 495 | 496 | req.Header.Set("Content-Type", "application/json") 497 | req.Header.Set("Authorization", "Bearer "+authToken) 498 | 499 | resp, err := client.Do(req) 500 | if err != nil { 501 | return "", err 502 | } 503 | defer resp.Body.Close() 504 | 505 | respBody, err := ioutil.ReadAll(resp.Body) 506 | if err != nil { 507 | return "", err 508 | } 509 | 510 | var apiResponse ChatGPTResponse 511 | err = json.Unmarshal(respBody, &apiResponse) 512 | if err != nil { 513 | return "", err 514 | } 515 | 516 | return apiResponse.Reply, nil 517 | } --------------------------------------------------------------------------------