├── TODO
├── CODEOWNERS
├── shallow-explore
├── .gitignore
├── go.mod
├── LICENSE
├── utils
├── explore_test.go
└── explore.go
├── go.sum
├── README.md
└── main.go
/TODO:
--------------------------------------------------------------------------------
1 | - Unit tests
2 |
--------------------------------------------------------------------------------
/CODEOWNERS:
--------------------------------------------------------------------------------
1 | * @tmickleydoyle
2 |
--------------------------------------------------------------------------------
/shallow-explore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tmickleydoyle/shallow-explore/HEAD/shallow-explore
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Binaries for programs and plugins
2 | *.exe
3 | *.exe~
4 | *.dll
5 | *.so
6 | *.dylib
7 |
8 | # Test binary, built with `go test -c`
9 | *.test
10 |
11 | # Output of the go coverage tool, specifically when used with LiteIDE
12 | *.out
13 |
14 | # Dependency directories (remove the comment below to include it)
15 | # vendor/
16 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/tmickleydoyle/shallow-explore
2 |
3 | go 1.17
4 |
5 | require (
6 | github.com/charmbracelet/lipgloss v0.7.1
7 | github.com/guptarohit/asciigraph v0.5.5
8 | )
9 |
10 | require (
11 | github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect
12 | github.com/lucasb-eyer/go-colorful v1.2.0 // indirect
13 | github.com/mattn/go-isatty v0.0.19 // indirect
14 | github.com/mattn/go-runewidth v0.0.14 // indirect
15 | github.com/muesli/reflow v0.3.0 // indirect
16 | github.com/muesli/termenv v0.15.1 // indirect
17 | github.com/rivo/uniseg v0.4.4 // indirect
18 | golang.org/x/sys v0.9.0 // indirect
19 | )
20 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Thomas Mickley-Doyle
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/utils/explore_test.go:
--------------------------------------------------------------------------------
1 | package explore
2 |
3 | import (
4 | "testing"
5 | )
6 |
7 | func TestConvertStringToInt(t *testing.T) {
8 | intArray, origArray := ConvertStringToInt([]string{"1", "2", "3"})
9 |
10 | if len(intArray) == 0 {
11 | t.Errorf("Array could not convert strings to numbers")
12 | }
13 | if len(origArray) == 0 {
14 | t.Errorf("Original array did not return: ")
15 | }
16 |
17 | stringArray, _ := ConvertStringToInt([]string{"one", "two", "three"})
18 |
19 | if len(stringArray) != 0 {
20 | t.Errorf("Incorrectly converted string to float")
21 | }
22 | }
23 |
24 | func TestMinMaxValues(t *testing.T) {
25 | minValue, maxValue := MinMaxValues([]float64{1.0, 2.0, 3.0})
26 |
27 | if minValue != 1 {
28 | t.Errorf("Min value is not calculated properly")
29 | }
30 | if maxValue != 3 {
31 | t.Errorf("Min value is not calculated properly")
32 | }
33 | }
34 |
35 | func TestMeanValue(t *testing.T) {
36 | meanValue := MeanValue([]float64{1.0, 2.0, 3.0})
37 |
38 | if meanValue != 2 {
39 | t.Errorf("Mean value is not calculated properly")
40 | }
41 | }
42 |
43 | func TestMedianValue(t *testing.T) {
44 | medianValue := MedianValue([]float64{1.0, 2.0, 3.0})
45 |
46 | if medianValue != 2 {
47 | t.Errorf("Mean value is not calculated properly")
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k=
2 | github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8=
3 | github.com/charmbracelet/lipgloss v0.4.0 h1:768h64EFkGUr8V5yAKV7/Ta0NiVceiPaV+PphaW1K9g=
4 | github.com/charmbracelet/lipgloss v0.4.0/go.mod h1:vmdkHvce7UzX6xkyf4cca8WlwdQ5RQr8fzta+xl7BOM=
5 | github.com/charmbracelet/lipgloss v0.7.1 h1:17WMwi7N1b1rVWOjMT+rCh7sQkvDU75B2hbZpc5Kc1E=
6 | github.com/charmbracelet/lipgloss v0.7.1/go.mod h1:yG0k3giv8Qj8edTCbbg6AlQ5e8KNWpFujkNawKNhE2c=
7 | github.com/guptarohit/asciigraph v0.5.2 h1:aG4kATuuyHQMdTi89KKVIRIcDSIHrsKIozo/UsUE5AM=
8 | github.com/guptarohit/asciigraph v0.5.2/go.mod h1:dYl5wwK4gNsnFf9Zp+l06rFiDZ5YtXM6x7SRWZ3KGag=
9 | github.com/guptarohit/asciigraph v0.5.5 h1:ccFnUF8xYIOUPPY3tmdvRyHqmn1MYI9iv1pLKX+/ZkQ=
10 | github.com/guptarohit/asciigraph v0.5.5/go.mod h1:dYl5wwK4gNsnFf9Zp+l06rFiDZ5YtXM6x7SRWZ3KGag=
11 | github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY=
12 | github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0=
13 | github.com/mattn/go-isatty v0.0.13 h1:qdl+GuBjcsKKDco5BsxPJlId98mSWNKqYA+Co0SC1yA=
14 | github.com/mattn/go-isatty v0.0.13/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU=
15 | github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA=
16 | github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
17 | github.com/mattn/go-runewidth v0.0.10/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk=
18 | github.com/mattn/go-runewidth v0.0.12/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk=
19 | github.com/mattn/go-runewidth v0.0.13 h1:lTGmDsbAYt5DmK6OnoV7EuIF1wEIFAcxld6ypU4OSgU=
20 | github.com/mattn/go-runewidth v0.0.13/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
21 | github.com/mattn/go-runewidth v0.0.14 h1:+xnbZSEeDbOIg5/mE6JF0w6n9duR1l3/WmbinWVwUuU=
22 | github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
23 | github.com/muesli/reflow v0.2.1-0.20210115123740-9e1d0d53df68 h1:y1p/ycavWjGT9FnmSjdbWUlLGvcxrY0Rw3ATltrxOhk=
24 | github.com/muesli/reflow v0.2.1-0.20210115123740-9e1d0d53df68/go.mod h1:Xk+z4oIWdQqJzsxyjgl3P22oYZnHdZ8FFTHAQQt5BMQ=
25 | github.com/muesli/reflow v0.3.0 h1:IFsN6K9NfGtjeggFP+68I4chLZV2yIKsXJFNZ+eWh6s=
26 | github.com/muesli/reflow v0.3.0/go.mod h1:pbwTDkVPibjO2kyvBQRBxTWEEGDGq0FlB1BIKtnHY/8=
27 | github.com/muesli/termenv v0.9.0 h1:wnbOaGz+LUR3jNT0zOzinPnyDaCZUQRZj9GxK8eRVl8=
28 | github.com/muesli/termenv v0.9.0/go.mod h1:R/LzAKf+suGs4IsO95y7+7DpFHO0KABgnZqtlyx2mBw=
29 | github.com/muesli/termenv v0.15.1 h1:UzuTb/+hhlBugQz28rpzey4ZuKcZ03MeKsoG7IJZIxs=
30 | github.com/muesli/termenv v0.15.1/go.mod h1:HeAQPTzpfs016yGtA4g00CsdYnVLJvxsS4ANqrZs2sQ=
31 | github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
32 | github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
33 | github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
34 | github.com/rivo/uniseg v0.4.4 h1:8TfxU8dW6PdqD27gjM8MVNuicgxIjxpm4K7x4jp8sis=
35 | github.com/rivo/uniseg v0.4.4/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
36 | golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
37 | golang.org/x/sys v0.0.0-20210119212857-b64e53b001e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
38 | golang.org/x/sys v0.1.0 h1:kunALQeHf1/185U1i0GOB/fy1IPRDDpuoOOqRReG57U=
39 | golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
40 | golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
41 | golang.org/x/sys v0.9.0 h1:KS/R3tvhPqvJvwcKfnBHJwwthS11LRhmM5D59eEXa0s=
42 | golang.org/x/sys v0.9.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
43 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # shallow-explore
2 | From the command line, quickly explore data from a CSV file.
3 |
4 | `shallow-explore` is a [Golang](https://go.dev/) backed command-line tool for iterating over columns from a CSV file. This is a gut check tool to make sure the assumptions about the data are within the expected range of normal.
5 |
6 | ## How-To
7 |
8 | After installation, run the following command to start analyzing data:
9 |
10 | ```bash
11 | # Style (default): light mode
12 | shallow-explore -csv ~/complete/path/to/file/sample.csv
13 |
14 | # Style: dark mode
15 | shallow-explore -csv ~/complete/path/to/file/sample.csv -style dark
16 |
17 | # Style: light mode
18 | shallow-explore -csv ~/complete/path/to/file/sample.csv -style light
19 | ```
20 |
21 | Note: The complete path of the file is required to load the data into the program.
22 |
23 | ### New Features
24 |
25 | #### Data Summary
26 | Generate a comprehensive summary of your CSV file with statistics about each column:
27 |
28 | ```bash
29 | shallow-explore -csv ~/path/to/file/sample.csv -summary
30 | ```
31 |
32 | #### Data Filtering
33 | Filter data based on column values:
34 |
35 | ```bash
36 | # Filter rows where the "Age" column equals "30"
37 | shallow-explore -csv ~/path/to/file/sample.csv -filter true -column "Age" -condition equals -value "30"
38 |
39 | # Available conditions: equals, contains, greater_than, less_than, starts_with, ends_with
40 | shallow-explore -csv ~/path/to/file/sample.csv -filter true -column "Name" -condition contains -value "Smith"
41 | ```
42 |
43 | #### Export to JSON
44 | Export your CSV data to JSON format:
45 |
46 | ```bash
47 | shallow-explore -csv ~/path/to/file/sample.csv -export-json -export-path "output.json"
48 |
49 | # The export-path is optional. If not provided, a timestamped filename will be generated
50 | shallow-explore -csv ~/path/to/file/sample.csv -export-json
51 | ```
52 |
53 | #### Data Correlation
54 | Calculate the correlation between two numeric columns:
55 |
56 | ```bash
57 | shallow-explore -csv ~/path/to/file/sample.csv -correlate -col1 "Height" -col2 "Weight"
58 | ```
59 |
60 | #### Anomaly Detection
61 | Detect anomalies in numeric columns using Z-score method:
62 |
63 | ```bash
64 | # Default threshold is 3.0
65 | shallow-explore -csv ~/path/to/file/sample.csv -anomalies
66 |
67 | # Custom threshold
68 | shallow-explore -csv ~/path/to/file/sample.csv -anomalies -threshold 2.5
69 | ```
70 |
71 | ### Output
72 |
73 | `shallow-explore` supports three types of data: integers, floats, and strings.
74 |
75 | The following output is an example of an integer or float column. The column name is at the top of the frame, followed by a summary line graph of the items and some quick statistics about the data.
76 |
77 |
78 |
79 | For string-based data, the column name is still at the top of the output. Below the column name lives a horizontal histogram and a count of unique entities found in the column.
80 |
81 |
82 |
83 | ## Installation
84 |
85 | If Golang is installed, run the following command:
86 |
87 | ```bash
88 | go install github.com/tmickleydoyle/shallow-explore@latest
89 | ```
90 |
91 | ## Instructions for Installing Go
92 |
93 | [Go docs](https://go.dev/)
94 |
95 | ### Installation with Homebrew
96 |
97 | ```bash
98 | brew install go
99 | ```
100 |
101 | ## Why I Built This Tool
102 |
103 | I find myself running and rerunning the same basic statistical analysis on data to get an understanding of how trends are moving. I figured why not make it easier and share it with everyone else! I hope this speeds up your decision making :heart:
104 |
105 | ## Feature Overview
106 |
107 | - **CSV Data Exploration**: Visualize and analyze CSV data with automatic recognition of data types
108 | - **Data Summary**: Get a comprehensive overview of your CSV data with column types and completeness percentages
109 | - **Data Filtering**: Filter CSV data based on various conditions to focus on specific subsets
110 | - **JSON Export**: Export your CSV data to JSON format for use in other applications
111 | - **Data Correlation**: Calculate Pearson correlation coefficients between numeric columns
112 | - **Anomaly Detection**: Find outliers in numeric data using Z-score method
113 | - **Customizable Display**: Choose between light and dark mode for better visibility
114 |
--------------------------------------------------------------------------------
/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
import (
	"flag"
	"fmt"
	"log"
	"math"
	"path/filepath"
	"strings"
	"time"

	"github.com/charmbracelet/lipgloss"
	"github.com/guptarohit/asciigraph"
	explore "github.com/tmickleydoyle/shallow-explore/utils"
)
15 |
// Command-line options. Each variable except path is bound to a flag in main.
var (
	file       string  // -csv: path to the CSV file; main exits when it is empty
	path       string  // copy of file made in main and used for reading and messages
	style      string  // -style: "dark" selects styleDark, anything else styleLight
	filter     string  // -filter: filtering runs only when this, column and value are all non-empty
	column     string  // -column: header name of the column to filter on
	condition  string  // -condition: comparison passed to FilterCSVData (default "equals")
	value      string  // -value: value the filtered column is compared against
	exportJson bool    // -export-json: write the (possibly filtered) records as JSON and exit
	exportPath string  // -export-path: JSON output file; a timestamped name is generated when empty
	correlate  bool    // -correlate: compute the correlation between col1 and col2
	col1       string  // -col1: first column for the correlation
	col2       string  // -col2: second column for the correlation
	summary    bool    // -summary: print the CSV summary and exit
	anomalies  bool    // -anomalies: run anomaly detection on numeric columns
	threshold  float64 // -threshold: z-score threshold for anomaly detection (default 3.0)
)
33 |
34 | var styleDark = lipgloss.NewStyle().
35 | Bold(true).
36 | Foreground(lipgloss.Color("#FAFAFA")).
37 | Background(lipgloss.Color("#808080")).
38 | PaddingTop(1).
39 | PaddingBottom(1).
40 | PaddingLeft(2).
41 | PaddingRight(2).
42 | BorderStyle(lipgloss.NormalBorder()).
43 | BorderForeground(lipgloss.Color("#FAFAFA"))
44 |
45 | var styleLight = lipgloss.NewStyle().
46 | Bold(true).
47 | Foreground(lipgloss.Color("#808080")).
48 | Background(lipgloss.Color("#FAFAFA")).
49 | PaddingTop(1).
50 | PaddingBottom(1).
51 | PaddingLeft(2).
52 | PaddingRight(2).
53 | BorderStyle(lipgloss.NormalBorder()).
54 | BorderForeground(lipgloss.Color("#808080"))
55 |
56 | func main() {
57 | // Basic flags
58 | flag.StringVar(&file, "csv", "", "path to CSV file")
59 | flag.StringVar(&style, "style", "", "output style (dark or light)")
60 |
61 | // Data filtering flags
62 | flag.StringVar(&filter, "filter", "", "enable filtering mode")
63 | flag.StringVar(&column, "column", "", "column to filter on")
64 | flag.StringVar(&condition, "condition", "equals", "filter condition (equals, contains, greater_than, less_than, starts_with, ends_with)")
65 | flag.StringVar(&value, "value", "", "filter value to compare against")
66 |
67 | // JSON export flags
68 | flag.BoolVar(&exportJson, "export-json", false, "export data to JSON")
69 | flag.StringVar(&exportPath, "export-path", "", "path for exported JSON file")
70 |
71 | // Correlation flags
72 | flag.BoolVar(&correlate, "correlate", false, "calculate correlation between two columns")
73 | flag.StringVar(&col1, "col1", "", "first column for correlation")
74 | flag.StringVar(&col2, "col2", "", "second column for correlation")
75 |
76 | // Summary flag
77 | flag.BoolVar(&summary, "summary", false, "generate summary of CSV file")
78 |
79 | // Anomaly detection
80 | flag.BoolVar(&anomalies, "anomalies", false, "detect anomalies in numeric columns")
81 | flag.Float64Var(&threshold, "threshold", 3.0, "z-score threshold for anomaly detection (default: 3.0)")
82 |
83 | flag.Parse()
84 |
85 | if file == "" {
86 | log.Fatal("Could not find the path to the CSV file")
87 | }
88 | path = file
89 | records := explore.ReadCSVFile(path)
90 |
91 | // Set default style
92 | var selectedStyle lipgloss.Style
93 | if style == "dark" {
94 | selectedStyle = styleDark
95 | } else {
96 | selectedStyle = styleLight
97 | }
98 |
99 | // Handle summary mode
100 | if summary {
101 | summaryData := explore.CSVSummary(records)
102 | fmt.Println(selectedStyle.Render(fmt.Sprintf("CSV Summary for: %s\n", path)))
103 |
104 | // Display summary information
105 | fmt.Println(selectedStyle.Render(fmt.Sprintf("Total Rows: %d\n", summaryData["total_rows"])))
106 | fmt.Println(selectedStyle.Render(fmt.Sprintf("Total Columns: %d\n", summaryData["total_columns"])))
107 |
108 | // Display column names and types
109 | fmt.Println(selectedStyle.Render("Column Information:"))
110 | columnTypes := summaryData["column_types"].(map[string]string)
111 | completeness := summaryData["completeness"].(map[string]float64)
112 |
113 | for _, colName := range summaryData["column_names"].([]string) {
114 | fmt.Println(selectedStyle.Render(fmt.Sprintf(" %s (Type: %s, Completeness: %.1f%%)",
115 | colName, columnTypes[colName], completeness[colName])))
116 | }
117 | return
118 | }
119 |
120 | // Handle filtering
121 | if filter != "" && column != "" && value != "" {
122 | records = explore.FilterCSVData(records, column, condition, value)
123 | fmt.Println(selectedStyle.Render(fmt.Sprintf("Filtered data where %s %s %s (Found %d records)\n",
124 | column, condition, value, len(records)-1)))
125 | }
126 |
127 | // Handle JSON export
128 | if exportJson {
129 | if exportPath == "" {
130 | // Generate default filename based on input file if not provided
131 | base := filepath.Base(path)
132 | baseName := strings.TrimSuffix(base, filepath.Ext(base))
133 | exportPath = baseName + "_" + time.Now().Format("20060102_150405") + ".json"
134 | }
135 |
136 | err := explore.ExportToJSON(records, exportPath)
137 | if err != nil {
138 | log.Fatalf("Failed to export to JSON: %v", err)
139 | }
140 |
141 | fmt.Println(selectedStyle.Render(fmt.Sprintf("Data exported to JSON: %s\n", exportPath)))
142 | return
143 | }
144 |
145 | // Handle correlation calculation
146 | if correlate && col1 != "" && col2 != "" {
147 | correlation, err := explore.CalculateCorrelation(records, col1, col2)
148 | if err != nil {
149 | log.Fatalf("Failed to calculate correlation: %v", err)
150 | }
151 |
152 | fmt.Println(selectedStyle.Render(fmt.Sprintf("Correlation between '%s' and '%s': %.4f\n",
153 | col1, col2, correlation)))
154 |
155 | // Interpretation guide
156 | var interpretation string
157 | corrAbs := math.Abs(correlation)
158 |
159 | if corrAbs >= 0.9 {
160 | interpretation = "Very strong relationship"
161 | } else if corrAbs >= 0.7 {
162 | interpretation = "Strong relationship"
163 | } else if corrAbs >= 0.5 {
164 | interpretation = "Moderate relationship"
165 | } else if corrAbs >= 0.3 {
166 | interpretation = "Weak relationship"
167 | } else {
168 | interpretation = "Little to no relationship"
169 | }
170 |
171 | if correlation < 0 {
172 | interpretation += " (negative/inverse)"
173 | } else {
174 | interpretation += " (positive/direct)"
175 | }
176 |
177 | fmt.Println(selectedStyle.Render("Interpretation: " + interpretation))
178 | return
179 | }
180 |
181 | // Handle anomaly detection
182 | if anomalies {
183 | anomalousRows := explore.DetectAnomalies(records, threshold)
184 |
185 | if len(anomalousRows) == 0 {
186 | fmt.Println(selectedStyle.Render(fmt.Sprintf("No anomalies detected with threshold %.1f\n", threshold)))
187 | } else {
188 | fmt.Println(selectedStyle.Render(fmt.Sprintf("Anomalies detected with threshold %.1f:\n", threshold)))
189 |
190 | for colName, rows := range anomalousRows {
191 | fmt.Println(selectedStyle.Render(fmt.Sprintf("Column '%s': %d anomalies found", colName, len(rows))))
192 |
193 | // Show first 5 anomalies at most
194 | displayRows := rows
195 | if len(rows) > 5 {
196 | displayRows = rows[:5]
197 | }
198 |
199 | // Get column index
200 | colIndex := -1
201 | for i, col := range records[0] {
202 | if col == colName {
203 | colIndex = i
204 | break
205 | }
206 | }
207 |
208 | for _, rowIdx := range displayRows {
209 | if colIndex < len(records[rowIdx]) {
210 | fmt.Println(selectedStyle.Render(fmt.Sprintf(" Row %d: %s", rowIdx, records[rowIdx][colIndex])))
211 | }
212 | }
213 |
214 | if len(rows) > 5 {
215 | fmt.Println(selectedStyle.Render(fmt.Sprintf(" ... and %d more anomalies", len(rows)-5)))
216 | }
217 | }
218 | }
219 | return
220 | }
221 |
222 | // Default behavior: explore each column
223 | for column := range records[0] {
224 | colValues := []string{}
225 |
226 | for i := 1; i < len(records); i++ {
227 | colValues = append(colValues, records[i][column])
228 | }
229 |
230 | transformedArray, _ := explore.ConvertStringToInt(colValues)
231 | plotArray, stringValues := explore.ConvertStringToInt(colValues)
232 | column := fmt.Sprintf("Column: %s\n\n", records[0][column])
233 |
234 | if len(transformedArray) > 0 {
235 | min, max := explore.MinMaxValues(transformedArray)
236 | mean := explore.MeanValue(transformedArray)
237 | median := explore.MedianValue(transformedArray)
238 | statsOutput := explore.FloatOutput(min, max, mean, median)
239 | graph := asciigraph.Plot(plotArray, asciigraph.Height(20), asciigraph.Width(90), asciigraph.Caption(statsOutput))
240 | fmt.Println(selectedStyle.Render(column + graph))
241 | } else {
242 | valuesMap := explore.CountValues(stringValues)
243 | sortedMap := explore.SortMapByValue(valuesMap)
244 | histogram := explore.HistTopTen(sortedMap, column)
245 | fmt.Println(selectedStyle.Render(column + histogram))
246 | }
247 | }
248 | }
249 |
--------------------------------------------------------------------------------
/utils/explore.go:
--------------------------------------------------------------------------------
1 | package explore
2 |
3 | import (
4 | "encoding/csv"
5 | "fmt"
6 | "log"
7 | "math"
8 | "sort"
9 | "strconv"
10 | "strings"
11 | "bytes"
12 | "encoding/json"
13 | "io"
14 | "io/ioutil"
15 | "net/http"
16 | "os"
17 | "time"
18 | )
19 |
// Sorted pairs a string with an integer count.
type Sorted struct {
	Key   string
	Value int
}

// SortedList is a slice of Sorted entries that provides the Len, Swap and
// Less methods needed by sort.Sort; Less orders entries by descending Value.
type SortedList []Sorted

// Len reports the number of entries in the list.
func (s SortedList) Len() int {
	return len(s)
}

// Swap exchanges the entries at positions i and j.
func (s SortedList) Swap(i, j int) {
	s[j], s[i] = s[i], s[j]
}

// Less reports whether entry i has a larger Value than entry j, which places
// larger counts first when the list is sorted.
func (s SortedList) Less(i, j int) bool {
	return s[j].Value < s[i].Value
}
30 |
// ReadCSVFile opens filePath, parses its whole content as CSV and returns the
// records, header row included. Any failure to open or parse the file is
// fatal: the error is logged and the process exits.
func ReadCSVFile(filePath string) [][]string {
	f, err := os.Open(filePath)
	if err != nil {
		// Fatalf with explicit verbs keeps the path and the error apart;
		// log.Fatal would print them run together without a separator.
		log.Fatalf("Unable to read input file %s: %v", filePath, err)
	}
	defer f.Close()

	records, err := csv.NewReader(f).ReadAll()
	if err != nil {
		log.Fatalf("Unable to parse file as CSV for %s: %v", filePath, err)
	}

	return records
}
46 |
// ConvertStringToInt parses every element of stringArray that is a valid
// floating-point number and returns the parsed values in input order,
// together with the untouched input slice. Elements that do not parse are
// skipped, so the first result may be shorter than the input (and is nil when
// nothing parses).
func ConvertStringToInt(stringArray []string) ([]float64, []string) {
	var intArray []float64

	for _, s := range stringArray {
		// ParseFloat already accepts every base-10 integer, so a separate
		// integer fallback could never succeed where this fails.
		if num, err := strconv.ParseFloat(s, 64); err == nil {
			intArray = append(intArray, num)
		}
	}

	return intArray, stringArray
}
61 |
// MinMaxValues returns the smallest and the largest value of intArray, in
// that order. An empty slice yields 0, 0 instead of panicking.
func MinMaxValues(intArray []float64) (float64, float64) {
	if len(intArray) == 0 {
		return 0, 0
	}

	lo, hi := intArray[0], intArray[0]
	for _, v := range intArray[1:] {
		if v > hi {
			hi = v
		}
		if v < lo {
			lo = v
		}
	}
	return lo, hi
}
75 |
// MeanValue returns the arithmetic mean of intArray, or 0 for an empty slice.
//
// The result is not rounded: rounding to a whole number here would make the
// two-decimal "Mean" figure printed by FloatOutput always end in .00.
func MeanValue(intArray []float64) float64 {
	if len(intArray) == 0 {
		return 0
	}

	total := 0.0
	for _, v := range intArray {
		total += v
	}

	return total / float64(len(intArray))
}
85 |
// MedianValue returns the median of intArray: the middle value for an odd
// number of elements, the average of the two middle values for an even
// number, and 0 for an empty slice.
//
// The input is copied before sorting so the caller's slice keeps its order.
func MedianValue(intArray []float64) float64 {
	n := len(intArray)
	if n == 0 {
		return 0
	}

	sorted := make([]float64, n)
	copy(sorted, intArray)
	sort.Float64s(sorted)

	mid := n / 2
	if n%2 != 0 {
		return sorted[mid]
	}

	return (sorted[mid-1] + sorted[mid]) / 2
}
96 |
// FloatOutput formats the four statistics, each with two decimals, beneath a
// note that the plot shows a general trend, and returns the resulting text.
func FloatOutput(min float64, max float64, mean float64, median float64) string {
	// Raw string literal: its line breaks are part of the output.
	outputString :=
		`(The plot is a general trend of all points)

Min: %.2f
Max: %.2f
Mean: %.2f
Median: %.2f
`
	finalOutput := fmt.Sprintf(outputString, min, max, mean, median)
	return finalOutput
}
109 |
// CountValues tallies how many times each distinct string occurs in
// stringArray and returns the tally keyed by the string.
func CountValues(stringArray []string) map[string]int {
	counts := make(map[string]int)

	for _, s := range stringArray {
		// A missing key reads as zero, so incrementing covers first sightings too.
		counts[s]++
	}

	return counts
}
124 |
125 | func SortMapByValue(valueMap map[string]int) SortedList {
126 | sortedMap := make(SortedList, len(valueMap))
127 |
128 | i := 0
129 | for k, v := range valueMap {
130 | sortedMap[i] = Sorted{k, v}
131 | i++
132 | }
133 |
134 | sort.Sort(sortedMap)
135 |
136 | return sortedMap
137 | }
138 |
139 | func HistTopTen(sortedList SortedList, column string) string {
140 | var key string
141 | var barValue int
142 | max := float64(sortedList[0].Value)
143 | histString := "Horizontal Histogram"
144 |
145 | if len(sortedList) > 10 {
146 | histString = histString + " - Top Ten\n\n"
147 | } else {
148 | histString = histString + "\n\n"
149 | }
150 |
151 | i := 0
152 | for d := range sortedList {
153 | if i < 10 {
154 | if max > 75 {
155 | barValue = int((float64(sortedList[d].Value) / max) * 75)
156 | } else {
157 | barValue = sortedList[d].Value
158 | }
159 | bar := strings.Repeat("☐", barValue)
160 | if len(sortedList[d].Key) > 20 {
161 | key = sortedList[d].Key[:17] + "..."
162 | } else {
163 | key = sortedList[d].Key
164 | }
165 | histString = histString + fmt.Sprintf("%20v: %s (%d)\n", key, bar, sortedList[d].Value)
166 | }
167 | i++
168 | }
169 |
170 | histString = histString + fmt.Sprintf("%20v: %d\n", "Unique Strings", len(sortedList))
171 |
172 | return histString
173 | }
174 |
// JSON payloads exchanged by CallChatGPTAPI.
type (
	// ChatGPTRequest is the request body; it is encoded as {"message": ...}.
	ChatGPTRequest struct {
		Message string `json:"message"`
	}

	// ChatGPTResponse is the response body; the "reply" field is decoded
	// into Reply.
	ChatGPTResponse struct {
		Reply string `json:"reply"`
	}
)
182 |
183 | // CSVSummary generates a summary of the entire CSV file
184 | func CSVSummary(records [][]string) map[string]interface{} {
185 | summary := make(map[string]interface{})
186 |
187 | // Basic file info
188 | summary["total_rows"] = len(records) - 1 // Excluding header
189 | summary["total_columns"] = len(records[0])
190 | summary["column_names"] = records[0]
191 |
192 | // Column type inference
193 | columnTypes := make(map[string]string)
194 | for i, colName := range records[0] {
195 | // Check first 10 rows (or fewer if less data) to infer type
196 | samples := []string{}
197 | for j := 1; j < min(len(records), 11); j++ {
198 | if i < len(records[j]) {
199 | samples = append(samples, records[j][i])
200 | }
201 | }
202 | columnTypes[colName] = inferColumnType(samples)
203 | }
204 | summary["column_types"] = columnTypes
205 |
206 | // Data completeness
207 | completeness := make(map[string]float64)
208 | for i, colName := range records[0] {
209 | filledCount := 0
210 | for j := 1; j < len(records); j++ {
211 | if i < len(records[j]) && records[j][i] != "" {
212 | filledCount++
213 | }
214 | }
215 | completeness[colName] = float64(filledCount) / float64(len(records)-1) * 100
216 | }
217 | summary["completeness"] = completeness
218 |
219 | return summary
220 | }
221 |
// min returns the smaller of two ints; CSVSummary uses it to cap the number
// of sampled rows.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}
229 |
// inferColumnType classifies a column from sample cell values. It returns
// "integer" when every non-empty sample parses as a base-10 integer, "float"
// when every non-empty sample parses as a floating-point number, and "string"
// otherwise. Empty samples are ignored; when there is no non-empty sample at
// all the result is "string", because nothing supports a numeric claim.
func inferColumnType(samples []string) string {
	isInt, isFloat := true, true
	sawValue := false

	for _, sample := range samples {
		if sample == "" {
			continue // Skip empty values for type inference
		}
		sawValue = true

		if _, err := strconv.ParseInt(sample, 10, 64); err != nil {
			isInt = false
		}
		if _, err := strconv.ParseFloat(sample, 64); err != nil {
			isFloat = false
		}

		// Once both numeric readings are ruled out the answer is final.
		if !isInt && !isFloat {
			break
		}
	}

	switch {
	case !sawValue:
		return "string"
	case isInt:
		return "integer"
	case isFloat:
		return "float"
	default:
		return "string"
	}
}
264 |
// FilterCSVData filters CSV data based on column and condition.
//
// records[0] is treated as the header row and is always the first row of the
// result. A data row is kept when the cell in the column named columnName
// satisfies condition against value. Supported conditions:
//
//	equals, contains, starts_with, ends_with  string comparisons
//	greater_than, less_than                   numeric comparisons; a row matches
//	                                          only when both the cell and value
//	                                          parse as floating-point numbers
//
// An unknown column or condition yields the header row alone, rows too short
// to contain the column are skipped, and empty input yields an empty result.
func FilterCSVData(records [][]string, columnName string, condition string, value string) [][]string {
	if len(records) == 0 {
		return [][]string{}
	}

	filtered := [][]string{records[0]} // Start with header row

	// Find column index (first header cell with a matching name)
	colIndex := -1
	for i, col := range records[0] {
		if col == columnName {
			colIndex = i
			break
		}
	}
	if colIndex == -1 {
		return filtered // Column not found
	}

	// The comparison value is the same for every row, so parse it once.
	valueFloat, valueErr := strconv.ParseFloat(value, 64)

	var matches func(cell string) bool
	switch condition {
	case "equals":
		matches = func(cell string) bool { return cell == value }
	case "contains":
		matches = func(cell string) bool { return strings.Contains(cell, value) }
	case "greater_than":
		matches = func(cell string) bool {
			cellFloat, err := strconv.ParseFloat(cell, 64)
			return err == nil && valueErr == nil && cellFloat > valueFloat
		}
	case "less_than":
		matches = func(cell string) bool {
			cellFloat, err := strconv.ParseFloat(cell, 64)
			return err == nil && valueErr == nil && cellFloat < valueFloat
		}
	case "starts_with":
		matches = func(cell string) bool { return strings.HasPrefix(cell, value) }
	case "ends_with":
		matches = func(cell string) bool { return strings.HasSuffix(cell, value) }
	default:
		return filtered // Unknown condition matches nothing
	}

	for _, row := range records[1:] {
		if colIndex >= len(row) {
			continue // Skip rows with insufficient columns
		}
		if matches(row[colIndex]) {
			filtered = append(filtered, row)
		}
	}

	return filtered
}
324 |
// ExportToJSON exports CSV data to a JSON file.
//
// records[0] is treated as the header row. Each following row becomes one
// JSON object keyed by header name; cells missing from a short row are
// written as empty strings. The objects are written to outputPath as an
// indented JSON array with file mode 0644. An error is returned when records
// is empty (there is no header to name the fields) or when encoding or
// writing fails.
func ExportToJSON(records [][]string, outputPath string) error {
	if len(records) == 0 {
		return fmt.Errorf("no header row to export")
	}

	headers := records[0]
	result := make([]map[string]string, 0, len(records)-1)

	// Convert each row to a map
	for _, record := range records[1:] {
		row := make(map[string]string, len(headers))
		for j, header := range headers {
			if j < len(record) {
				row[header] = record[j]
			} else {
				row[header] = ""
			}
		}
		result = append(result, row)
	}

	// Marshal to JSON
	jsonData, err := json.MarshalIndent(result, "", " ")
	if err != nil {
		return err
	}

	// Write to file
	return ioutil.WriteFile(outputPath, jsonData, 0644)
}
352 |
// CalculateCorrelation calculates Pearson correlation between two numeric columns.
//
// records[0] is treated as the header row; column1 and column2 are header
// names, and the first header cell matching each name is used. Only rows in
// which both cells parse as floating-point numbers take part.
//
// It returns an error when records is empty, when either column is missing,
// or when fewer than two rows have numeric values in both columns. When one
// of the columns has no variation the correlation is undefined and 0 is
// returned without an error.
func CalculateCorrelation(records [][]string, column1 string, column2 string) (float64, error) {
	if len(records) == 0 {
		return 0, fmt.Errorf("no records to correlate")
	}

	// Find column indices
	col1Index := -1
	col2Index := -1
	for i, col := range records[0] {
		if col == column1 && col1Index == -1 {
			col1Index = i
		}
		if col == column2 && col2Index == -1 {
			col2Index = i
		}
	}

	if col1Index == -1 {
		return 0, fmt.Errorf("column %q not found", column1)
	}
	if col2Index == -1 {
		return 0, fmt.Errorf("column %q not found", column2)
	}

	// Extract numeric values, keeping the two series paired row by row
	var x, y []float64
	for _, row := range records[1:] {
		if col1Index >= len(row) || col2Index >= len(row) {
			continue
		}
		val1, err1 := strconv.ParseFloat(row[col1Index], 64)
		val2, err2 := strconv.ParseFloat(row[col2Index], 64)
		if err1 == nil && err2 == nil {
			x = append(x, val1)
			y = append(y, val2)
		}
	}

	if len(x) < 2 {
		return 0, fmt.Errorf("insufficient numeric data for correlation")
	}

	// Calculate means
	xMean := 0.0
	yMean := 0.0
	for i := range x {
		xMean += x[i]
		yMean += y[i]
	}
	xMean /= float64(len(x))
	yMean /= float64(len(y))

	// Calculate correlation
	numerator := 0.0
	xDenom := 0.0
	yDenom := 0.0
	for i := range x {
		xDiff := x[i] - xMean
		yDiff := y[i] - yMean
		numerator += xDiff * yDiff
		xDenom += xDiff * xDiff
		yDenom += yDiff * yDiff
	}

	if xDenom == 0 || yDenom == 0 {
		return 0, nil // No variation in at least one variable
	}

	return numerator / math.Sqrt(xDenom*yDenom), nil
}
420 |
// DetectAnomalies detects anomalies in numeric columns using Z-score method.
//
// records[0] is treated as the header row. For every column, the cells that
// parse as finite floating-point numbers are collected; columns with fewer
// than five such values, or with no variation, are skipped. A value is
// anomalous when its distance from the column mean, divided by the population
// standard deviation, is greater than threshold.
//
// The result maps a column name to the indices into records (so the header is
// index 0 and the first data row is 1) of its anomalous rows. Columns without
// anomalies are absent; empty input yields an empty map.
func DetectAnomalies(records [][]string, threshold float64) map[string][]int {
	anomalies := make(map[string][]int)
	if len(records) == 0 {
		return anomalies
	}

	// Process each column
	for colIndex, colName := range records[0] {
		// Extract numeric values together with the row each one came from
		var values []float64
		var valueIndices []int

		for i := 1; i < len(records); i++ {
			if colIndex >= len(records[i]) {
				continue
			}
			val, err := strconv.ParseFloat(records[i][colIndex], 64)
			if err != nil {
				continue
			}
			// ParseFloat accepts "NaN" and "Inf"; a single such cell would
			// turn the mean and deviation into NaN and hide every anomaly.
			if math.IsNaN(val) || math.IsInf(val, 0) {
				continue
			}
			values = append(values, val)
			valueIndices = append(valueIndices, i)
		}

		// Need enough data points for meaningful anomaly detection
		if len(values) < 5 {
			continue
		}

		// Calculate mean and standard deviation
		mean := 0.0
		for _, val := range values {
			mean += val
		}
		mean /= float64(len(values))

		sumSquares := 0.0
		for _, val := range values {
			diff := val - mean
			sumSquares += diff * diff
		}
		stdDev := math.Sqrt(sumSquares / float64(len(values)))

		if stdDev == 0 {
			continue // Skip columns with no variation
		}

		// Find anomalies (values with Z-score above threshold)
		var anomalousRows []int
		for i, val := range values {
			if math.Abs(val-mean)/stdDev > threshold {
				anomalousRows = append(anomalousRows, valueIndices[i])
			}
		}

		if len(anomalousRows) > 0 {
			anomalies[colName] = anomalousRows
		}
	}

	return anomalies
}
478 |
479 | func CallChatGPTAPI(message string, authToken string) (string, error) {
480 | payload := ChatGPTRequest{
481 | Message: message,
482 | }
483 |
484 | payloadBytes, err := json.Marshal(payload)
485 | if err != nil {
486 | return "", err
487 | }
488 |
489 | client := &http.Client{}
490 |
491 | req, err := http.NewRequest("POST", "https://api.chatgpt.com/v1/chat/completions", bytes.NewBuffer(payloadBytes))
492 | if err != nil {
493 | return "", err
494 | }
495 |
496 | req.Header.Set("Content-Type", "application/json")
497 | req.Header.Set("Authorization", "Bearer "+authToken)
498 |
499 | resp, err := client.Do(req)
500 | if err != nil {
501 | return "", err
502 | }
503 | defer resp.Body.Close()
504 |
505 | respBody, err := ioutil.ReadAll(resp.Body)
506 | if err != nil {
507 | return "", err
508 | }
509 |
510 | var apiResponse ChatGPTResponse
511 | err = json.Unmarshal(respBody, &apiResponse)
512 | if err != nil {
513 | return "", err
514 | }
515 |
516 | return apiResponse.Reply, nil
517 | }
--------------------------------------------------------------------------------