├── .gitignore
├── README.md
├── csvql.nimble
├── demo.svg
├── logo.png
└── src
└── csvql.nim
/.gitignore:
--------------------------------------------------------------------------------
/nimcache/*
csvql
src/nimcache/*
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

# CSVql - Nim
Query your CSV data like a boss.

```bash
Options(opt-arg sep :|=|spc):
  -h, --help                       write this help to stdout
  -q=, --query=  string  REQUIRED  The query is ANSI SQL; replace each table
                                   name with the path to the CSV instead, e.g.:
                                   1) SELECT name, day, hour FROM '/path/to/csv/test.csv' as
                                      t1 LIMIT 10
                                   2) SELECT name, lastname, birthday FROM '/path/to/csv/test.csv' as
                                      t1 LEFT JOIN '/path/to/csv/test2.csv' as t2 ON t1.name = t2.name
```
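
For example, a typical invocation (assuming the `csvql` binary is on your PATH; the CSV path is a placeholder):

```bash
csvql --query "SELECT name, day, hour FROM '/path/to/csv/test.csv' as t1 LIMIT 10"
```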

* Delimiters are guessed, so there is no need to specify one.

Query over CSVs with simple ANSI SQL.

# Limitations:

## 1. Header must be present.
If your header contains spaces or special characters, they will be rewritten.

Spaces are replaced with underscores (`_`) and special characters are removed.

The ``#`` char is written as `no` (`# of drivers` -> `no_of_drivers`).
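
This mirrors the `multiReplace` call in `parseCsv` (`src/csvql.nim`); a minimal sketch, with a made-up header value:

```nim
import strutils

# Hypothetical header; the replacement pairs are the ones csvql applies.
echo "# of Drivers (%)".toLowerAscii.multiReplace(
  [(" ", "_"), ("'", ""), (".", ""), ("(", ""), (")", ""), ("%", ""), ("#", "no")]
)  # -> no_of_drivers_
```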

## 2. The delimiter is guessed in-app, so delimiters are limited to the following:
```nim
const possibleDelimiters = [';', ',', ':', '~', '*', '$', '#', '@', '/', '%', '^', '\t']
```
Please make sure your file uses one of these delimiters.
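
Guessing works by sampling the first rows and counting how many fields each candidate delimiter produces; the candidate that consistently yields the most fields wins (see `guessCsvDelimiter` in `src/csvql.nim`). A simplified sketch of the idea, applied to a single line:

```nim
import strutils

# Pick the candidate that splits the sample line into the most fields.
proc guessDelimiter(sampleLine: string): char =
  const candidates = [';', ',', ':', '~', '*', '$', '#', '@', '/', '%', '^', '\t']
  result = ','
  var best = 0
  for d in candidates:
    let fields = sampleLine.split(d).len
    if fields > best:
      best = fields
      result = d

echo guessDelimiter("name;day;hour")  # -> ;
```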

## 3. Data must fit in memory.
As you may already know, SQLite is used to store your data in-memory, so the CSV(s) must fit in RAM.
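
Under the hood this is simply an in-memory SQLite handle, exactly what `openConnection` in `src/csvql.nim` does:

```nim
import db_sqlite

let db = open(":memory:", "", "", "")  # all tables live in RAM
```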

## 4. SQLite should be installed.

# TODOs:
1. Tests.
--------------------------------------------------------------------------------
/csvql.nimble:
--------------------------------------------------------------------------------
# Package
version = "3.0.2"
author = "Benny Elgazar"
description = "csvql - query CSVs using ANSI SQL."
license = "MIT"
bin = @["csvql"]
srcDir = "src"

requires "nim >= 1.0.2"
requires "cligen >= 0.9.41"
--------------------------------------------------------------------------------
/demo.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bennyelg/csvql/171cf3dd68beeb15112a4264b7f23cd09ee7f1c5/logo.png
--------------------------------------------------------------------------------
/src/csvql.nim:
--------------------------------------------------------------------------------
import
  os,
  parsecsv,
  streams,
  sequtils,
  strutils,
  db_sqlite,
  strformat,
  re,
  times,
  cligen,
  terminal


type
  Csv = ref object
    path: string
    alias: string
    hasHeader: bool
    columns: seq[string]
    types: seq[string]
    delimiter: char

  Database = object
    connection: DbConn


template guessType(s: string, t: untyped): bool =
  ## Evaluates to true if parser `t` accepts `s` without raising.
  var result = false
  try:
    discard t(s)
    result = true
  except Exception:
    discard
  result

proc guessCsvDelimiter(filePath: string): char =
  ## Samples the first rows of the file and returns the delimiter that
  ## most consistently yields the highest field count; defaults to ','.
  const maxSampleCountOfRows = 10
  const possibleDelimiters = [';', ',', ':', '~', '*', '$', '#', '@', '/', '%', '^', '\t']
  var fs = newFileStream(filePath, fmRead)
  var line = ""
  var rowNO = 0
  var results: seq[tuple[rowNo: int, deli: char, count: int]] = @[]
  while fs.readLine(line):
    if rowNO < maxSampleCountOfRows:
      for delimiter in possibleDelimiters:
        results.add((rowNO, delimiter, line.split(delimiter).len))
      inc rowNO
    else:
      break
  fs.close()
  if results.len == 0:
    return ','
  let highestCount = max(results.mapIt(it.count))
  var mostPossibleDelimiter: seq[char] = @[]
  for delimiter in possibleDelimiters:
    for row in results.filterIt(it.deli == delimiter):
      if row.count == highestCount:
        mostPossibleDelimiter.add(row.deli)

  let mostPossibleDelimiterClean = deduplicate(mostPossibleDelimiter)
  if mostPossibleDelimiterClean.len == 1:
    return mostPossibleDelimiterClean[0]
  elif mostPossibleDelimiterClean.len > 1:
    # Tie-break: prefer the candidate that reached the highest count most often.
    var highestCntDelimiter = 0
    var highestPossibleDelimiterPosition = 0
    for idx, d in mostPossibleDelimiterClean:
      let countOfPossibleDelimiter = count(mostPossibleDelimiter, d)
      if highestCntDelimiter < countOfPossibleDelimiter:
        highestCntDelimiter = countOfPossibleDelimiter
        highestPossibleDelimiterPosition = idx
    return mostPossibleDelimiterClean[highestPossibleDelimiterPosition]
  else:
    return ','

proc appendCsv*(path, alias: string, hasHeader: bool = false): Csv {.inline.} =
  ## Returns a new Csv object, with its delimiter guessed from the file.
  Csv(
    path: path,
    alias: alias,
    hasHeader: hasHeader,
    delimiter: guessCsvDelimiter(path)
  )

func getTypesWithMostProbability(types: seq[seq[string]]): seq[string] =
  ## Returns the row type-signature that occurs most often in the sample.
  var totalTimeIsThere = 0
  var pos = 0
  for idx, row in types:
    let rowRecurrentCount = count(types, row)
    if totalTimeIsThere < rowRecurrentCount:
      pos = idx
      totalTimeIsThere = rowRecurrentCount

  types[pos]

func figureColumnsTypes(rowsSamples: seq[seq[string]]): seq[string] =
  ## Guesses a SQLite column type (int/float/string) for every column.
  var types: seq[seq[string]] = @[]
  for row in rowsSamples:
    var rowTypes: seq[string] = @[]
    for item in row:
      if guessType(item, parseInt):
        rowTypes.add("int")
      elif guessType(item, parseFloat):
        rowTypes.add("float")
      else:
        rowTypes.add("string")
    types.add(rowTypes)

  return getTypesWithMostProbability(types)

proc parseCsv*(csv: Csv) =
  ## Samples the CSV to discover its column names and column types.
  const numOfSamplingRows = 50
  var csvStream = newFileStream(csv.path, fmRead)
  var parser: CsvParser
  var rowsSampleCount: seq[seq[string]]
  parser.open(csvStream, csv.path, csv.delimiter)

  if csv.hasHeader:
    parser.readHeaderRow()
    # Sanitize header names so they are valid SQLite identifiers.
    csv.columns = parser.headers.mapIt(
      it.toLowerAscii.multiReplace(
        [(" ", "_"), ("'", ""), (".", ""), ("(", ""), (")", ""), ("%", ""), ("#", "no")]
      ))
  var samplingCounter = 0
  while parser.readRow():
    let row = parser.row
    if numOfSamplingRows > samplingCounter:
      rowsSampleCount.add(row)
      inc samplingCounter
    else:
      break
  parser.close()

  if not csv.hasHeader:
    # No header row: synthesize column names c_1..c_N.
    var columns: seq[string] = @[]
    for idx in countup(1, rowsSampleCount[0].len):
      columns.add(fmt"c_{idx}")
    csv.columns = columns

  csv.types = figureColumnsTypes(rowsSampleCount)

template openConnection*(): Database =
  ## Opens an in-memory SQLite connection.
  Database(
    connection: open(":memory:", "", "", "")
  )

proc createTableUsingCsvProperties(db: Database, csv: Csv) =
  ## Creates a table named after the CSV's alias, using the guessed types.
  var columnsWithTypes: seq[string] = @[]
  for idx, column in csv.columns:
    columnsWithTypes.add(fmt"{column} {csv.types[idx]}")

  let statement = fmt"""
    CREATE TABLE {csv.alias} (
      {columnsWithTypes.join(",")}
    );
  """
  db.connection.exec(SqlQuery(statement))

proc executeChunk(args: tuple[db: Database, tableName: string, columns: seq[string], rows: seq[seq[string]]]) =
  ## Inserts a chunk of rows using a single multi-row INSERT statement.
  var statement = fmt"""
    INSERT INTO {args.tableName}({args.columns.join(",")})
    VALUES
  """
  var insertableRows: seq[string] = newSeqOfCap[string](args.rows.len)
  for row in args.rows:
    # Drop "?" so db_sqlite's formatter doesn't treat it as a bind
    # placeholder, and escape single quotes by doubling them.
    let insertableRow = "(" & row.mapIt("'" & it.replace("?", "").replace("'", "''") & "'").join(",") & ")"
    insertableRows.add(insertableRow)
  let executableStatement = statement & insertableRows.join(",") & ";"
  discard args.db.connection.tryExec(SqlQuery(executableStatement))

proc insertCsvRowsIntoTable(db: Database, csv: Csv) =
  ## Streams the CSV and bulk-inserts its rows inside a single transaction.
  db.connection.exec(sql"PRAGMA synchronous=OFF")
  db.connection.exec(sql"BEGIN TRANSACTION;")

  const defaultChunkSize = 500  # rows per multi-row INSERT; stays within SQLite's compound limit
  var rowsChunk: seq[seq[string]] = @[]
  var csvStream = newFileStream(csv.path, fmRead)
  var parser: CsvParser
  parser.open(csvStream, csv.path, csv.delimiter)
  if csv.hasHeader:
    parser.readHeaderRow()
  while parser.readRow():
    rowsChunk.add(parser.row)
    if rowsChunk.len == defaultChunkSize:
      executeChunk((db: db, tableName: csv.alias, columns: csv.columns, rows: rowsChunk))
      rowsChunk = @[]
  if rowsChunk.len > 0:
    executeChunk((db: db, tableName: csv.alias, columns: csv.columns, rows: rowsChunk))
  parser.close()

  db.connection.exec(sql"COMMIT;")

func `*`(size: int, del: string): string {.inline.} =
  ## Builds a horizontal rule: `size + 1` copies of `del`, fenced by "+".
  result = "+"
  for i in countup(0, size):
    result &= del
  return result & "+"

func getQueryColumns(csvs: seq[Csv], query: string): seq[string] =
  ## Extracts the projected column names from the SELECT clause.
  let columnsRequested = query.toLowerAscii().split("from")[0].replace("select", "").strip().split(",")
  var columns: seq[string] = @[]
  if columnsRequested[0] == "*":
    # SELECT *: expand to every column of every CSV.
    for csv in csvs:
      columns.add(csv.columns.join(","))
    return deduplicate(columns).join(",").split(",").mapIt(it.strip())
  else:
    for t in columnsRequested:
      if ".*" in t:
        # tN.*: expand to every column of the N-th CSV.
        let tableName = t.strip().split(".*")[0]
        let tableRequestedPosition = parseInt(tableName.replace("t", "")) - 1
        columns.add(csvs[tableRequestedPosition].columns.mapIt(tableName & "." & it.strip()))
      else:
        columns.add(t.strip())

    return columns

func getLongestWordsByPosition(rs: seq[tuple[r: Row, length: int]]): seq[int] =
  ## Returns, per column, the length of the longest value in the result set.
  var lengths: seq[int] = newSeq[int](rs[0].r.len)
  for row in rs:
    for idx, word in row.r:
      if word.len > lengths[idx]:
        lengths[idx] = word.len
  return lengths

proc exportResults(columns: seq[string], resultSet: seq[seq[string]]): string =
  ## Writes the result set to a timestamped CSV in the temp dir; returns the path.
  # "MM" is the month; "mm" would be minutes in Nim's time-format strings.
  let dt = format(now(), "yyyy-MM-ddHH:mm:ss").replace("-", "_").replace(":", "_")
  let generatedFilePath = getTempDir() & dt & ".csv"
  var fs = newFileStream(generatedFilePath, fmWrite)
  fs.writeLine(columns.join(","))
  for row in resultSet:
    fs.writeLine(row.join(","))
  fs.close()

  return generatedFilePath

proc displayResults(db: Database, csvs: seq[Csv], query: string, exportResult: bool = false) =
  ## Runs the query and pretty-prints the result set as an ASCII table.
  var queryColumns = getQueryColumns(csvs, query)
  var rows: seq[tuple[r: Row, length: int]] = @[]
  for row in db.connection.fastRows(SqlQuery(query)):
    rows.add((r: row, length: ("|" & row.join(",")).len))
  if rows.len == 0:
    echo "No results."
    return
  var maxLengthOfWordsByPosition = getLongestWordsByPosition(rows)
  var fin: seq[seq[string]] = @[]
  var columns: seq[string] = @[]
  for row in rows:
    var words: seq[string] = @[]
    for idx, word in row.r:
      # Pad every cell to the widest value (or header) in its column.
      if maxLengthOfWordsByPosition[idx] > queryColumns[idx].len:
        words.add(center(word, maxLengthOfWordsByPosition[idx]))
      else:
        words.add(center(word, queryColumns[idx].len))
    fin.add(words)
  for idx, column in queryColumns:
    columns.add(center(column, max(maxLengthOfWordsByPosition[idx], column.len)))
  echo(fin[0].join("|").len * "-")
  echo("|" & columns.join("|") & " |")
  echo(fin[0].join("|").len * "-")
  for row in fin:
    echo("|" & row.join("|") & " |")
    echo(row.join("|").len * "-")

  if exportResult:
    const exportResultHeader = """
    ----------
    ::Export::
    ----------
    """.unindent
    styledWriteLine(stdout, fgRed, exportResultHeader, resetStyle)
    let generatedCsvPath = exportResults(queryColumns, rows.mapIt(it.r))
    styledWriteLine(stdout, fgGreen, "File is ready & can be located in: " & generatedCsvPath, resetStyle)

proc parseQuery(query: string): (seq[Csv], string) =
  ## Finds quoted CSV paths in the query, registers each as alias tN,
  ## and rewrites the query to reference those aliases.
  let csvsPaths = re.findAll(query, re"'(.*?).csv'")
  var csvs = newSeqOfCap[Csv](csvsPaths.len)
  var newQuery = query
  const propertiesHeader = """
  ----------------------
  ::Parsing Properties::
  ----------------------""".unindent
  styledWriteLine(stdout, fgRed, propertiesHeader, resetStyle)

  for idx, csvPath in csvsPaths:
    doAssert fileExists(csvPath.replace("'", "")), "\nCSV file not found: " & csvPath
    styledWriteLine(stdout, fgGreen, fmt"t{idx + 1} = {csvPath}", resetStyle)
    let csv = appendCsv(csvPath.replace("'", ""), fmt"t{idx + 1}", true)
    csvs.add(csv)
    # Replace the quoted path (quotes included) with the alias.
    newQuery = newQuery.replace(csvPath, fmt"t{idx + 1}")

  return (csvs, newQuery)

proc csvQL(query: string, exportResult: bool = false) =
  ## Entry point: load every referenced CSV into in-memory SQLite,
  ## run the rewritten query, then display (and optionally export) the results.
  doAssert query.len > 0, "\nSQL query must not be an empty string."
  let startTime = cpuTime()
  let db = openConnection()
  let (csvs, adjustedQuery) = parseQuery(query)
  const generatedQueryHeader = """
  -------------------
  ::Generated Query::
  -------------------""".unindent
  styledWriteLine(stdout, fgRed, generatedQueryHeader, resetStyle)
  styledWriteLine(stdout, fgGreen, adjustedQuery, resetStyle)

  for csv in csvs:
    parseCsv(csv)
    db.createTableUsingCsvProperties(csv)
    db.insertCsvRowsIntoTable(csv)
  const queryResultHeader = """
  ----------
  ::Result::
  ----------""".unindent
  styledWriteLine(stdout, fgRed, queryResultHeader, resetStyle)
  displayResults(db, csvs, adjustedQuery, exportResult)
  styledWriteLine(stdout, fgYellow, fmt"* Total Duration: {cpuTime() - startTime:.6f} seconds.", resetStyle)

when isMainModule:
  setControlCHook((proc {.noconv.} = quit " CTRL+C pressed, shutting down, bye!"))
  dispatch(csvQL, help = {
    "query": """The query is ANSI SQL; replace each table name with the path to the CSV instead, e.g.:
    1) SELECT name, day, hour FROM '/path/to/csv/test.csv' as t1 LIMIT 10
    2) SELECT name, lastname, birthday FROM '/path/to/csv/test.csv' as t1 LEFT JOIN '/path/to/csv/test2.csv' as t2 ON t1.name = t2.name
    """,
    "exportResult": "set to true if you want to export the result set."})
--------------------------------------------------------------------------------