├── test ├── resources │ ├── test-empty.csv │ ├── test-1.csv │ ├── test-2.csv │ ├── test-error-short-buffer.csv │ ├── test-4.csv │ ├── test-read-required-for-field.csv │ ├── test-error-short-buffer-quoted.csv │ └── test-error-short-buffer-quoted-with-double.csv └── csv_tokenizer.zig ├── zig.mod ├── .gitignore ├── .envrc ├── docs └── performance.md ├── .github └── workflows │ └── ci.yml ├── LICENSE ├── flake.nix ├── README.md ├── flake.lock └── src └── main.zig /test/resources/test-empty.csv: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/resources/test-1.csv: -------------------------------------------------------------------------------- 1 | 1,abc 2 | -------------------------------------------------------------------------------- /test/resources/test-2.csv: -------------------------------------------------------------------------------- 1 | 1,abc 2 | 2,def ghc 3 | -------------------------------------------------------------------------------- /test/resources/test-error-short-buffer.csv: -------------------------------------------------------------------------------- 1 | 1234567890 2 | -------------------------------------------------------------------------------- /test/resources/test-4.csv: -------------------------------------------------------------------------------- 1 | 1,"def ghc" 2 | 2,"abc ""def""" 3 | -------------------------------------------------------------------------------- /test/resources/test-read-required-for-field.csv: -------------------------------------------------------------------------------- 1 | 12345,67890 2 | -------------------------------------------------------------------------------- /test/resources/test-error-short-buffer-quoted.csv: -------------------------------------------------------------------------------- 1 | "1234567890" 2 | 
-------------------------------------------------------------------------------- /test/resources/test-error-short-buffer-quoted-with-double.csv: -------------------------------------------------------------------------------- 1 | "1234567890""" 2 | -------------------------------------------------------------------------------- /zig.mod: -------------------------------------------------------------------------------- 1 | id: m2fmjf9o3txtpyqs5i3108poalp4vjy369irlh9upgvm7w6y 2 | name: csv 3 | main: src/main.zig 4 | dependencies: 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | zig-cache/ 2 | zig-out/ 3 | /release/ 4 | /debug/ 5 | /build/ 6 | /build-*/ 7 | /docgen_tmp/ 8 | /.direnv/ 9 | /.zig-cache/ 10 | -------------------------------------------------------------------------------- /.envrc: -------------------------------------------------------------------------------- 1 | # If we are a computer with nix available, then use that to setup 2 | # the build environment with exactly what we need. 3 | if has nix; then 4 | use flake 5 | else 6 | echo "nix not found" 7 | fi 8 | -------------------------------------------------------------------------------- /docs/performance.md: -------------------------------------------------------------------------------- 1 | # Performance 2 | 3 | Following command can be used to generate some regular testing data: 4 | 5 | ```bash 6 | ruby -e '(1..100000).each {|i| puts sprintf("%d,abcdefghijkl,%010d,mnopqrs,%020d,tuvwx,\"...\"\"z\"\"...\"", i, i, i)}' 7 | ``` -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | # This is a basic workflow to help you get started with Actions 2 | 3 | name: CI 4 | 5 | # Controls when the action will run. 
6 | on: 7 | # Triggers the workflow on push or pull request events but only for the main branch 8 | push: 9 | branches: [ main ] 10 | pull_request: 11 | branches: [ main ] 12 | 13 | # Allows you to run this workflow manually from the Actions tab 14 | workflow_dispatch: 15 | 16 | env: 17 | zig_version: 0.13.0 18 | 19 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 20 | jobs: 21 | test: 22 | runs-on: ubuntu-latest 23 | steps: 24 | - uses: actions/checkout@v3 25 | - uses: goto-bus-stop/setup-zig@v2 26 | with: 27 | version: ${{ env.zig_version }} 28 | - run: zig build test 29 | lint: 30 | runs-on: ubuntu-latest 31 | steps: 32 | - uses: actions/checkout@v3 33 | - uses: goto-bus-stop/setup-zig@v2 34 | with: 35 | version: ${{ env.zig_version }} 36 | - run: zig fmt --check . 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 @_beho 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /flake.nix: -------------------------------------------------------------------------------- 1 | { 2 | description = "zig-csv"; 3 | # based on https://github.com/mitchellh/zig-overlay/blob/2c86c36e7fe65faac08bdf85d041cf7b798f8ee8/templates/init/flake.nix 4 | 5 | inputs = { 6 | # zls at 0.11.0 7 | nixpkgs.url = "github:nixos/nixpkgs/ff1a94e523ae9fb272e0581f068baee5d1068476"; 8 | flake-utils.url = "github:numtide/flake-utils"; 9 | zig.url = "github:mitchellh/zig-overlay"; 10 | zls.url = "github:zigtools/zls?ref=refs/tags/0.13.0"; 11 | }; 12 | 13 | outputs = { 14 | self, 15 | nixpkgs, 16 | flake-utils, 17 | ... 18 | } @ inputs : 19 | let 20 | overlays = [ 21 | (final: prev: { 22 | zigpkgs = inputs.zig.packages.${prev.system}; 23 | zlspkgs = inputs.zls.packages.${prev.system}; 24 | }) 25 | ]; 26 | # Our supported systems are the same supported systems as the Zig binaries 27 | systems = builtins.attrNames inputs.zig.packages; 28 | in 29 | flake-utils.lib.eachSystem systems ( 30 | system: let 31 | pkgs = import nixpkgs { inherit overlays system; }; 32 | in rec { 33 | devShells.default = pkgs.mkShell { 34 | nativeBuildInputs = [ 35 | pkgs.zigpkgs."0.13.0" 36 | ]; 37 | buildInputs = [ 38 | pkgs.zlspkgs.zls 39 | ]; 40 | }; 41 | 42 | # For compatibility with older versions of the `nix` binary 43 | devShell = self.devShells.${system}.default; 44 | } 45 | ); 46 | } 47 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # zig-csv ![CI](https://github.com/beho/zig-csv/workflows/CI/badge.svg) 2 | 
3 | Low-level CSV parser library for [Zig language](https://github.com/ziglang/zig). Each non-empty line in input is parsed as one or more tokens of type `field`, followed by `row_end`. 4 | 5 | _This library was conceived as Zig learning project and it was not used by me in production software._ 6 | 7 | ## Features 8 | 9 | - Reads UTF-8 files. 10 | - Provides iterator interface to stream of tokens. 11 | - Handles quoted fields in which column/row separator can be used. Quote itself can be used in field by doubling it (e.g. `"This is quote: ""."`) 12 | - Configurable column separator (default `,`), row separator (`\n`) and quote (`"`). 13 | - **Currently only single byte characters.** 14 | - Parser does not allocate – caller provides a buffer that parser operates in. **Buffer must be longer than a longest field in input.** 15 | 16 | ## Example 17 | 18 | Following code reads CSV tokens from a file while very naively printing them as table to standard output. 19 | 20 | ```zig 21 | const std = @import("std"); 22 | const csv = @import("csv"); 23 | 24 | pub fn main() anyerror!void { 25 | var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); 26 | defer arena.deinit(); 27 | 28 | const allocator = &arena.allocator; 29 | var buffer = try allocator.alloc(u8, 4096); 30 | 31 | const args = try std.process.argsAlloc(allocator); 32 | defer std.process.argsFree(allocator, args); 33 | 34 | if (args.len != 2) { 35 | std.log.warn("Single arg is expected", .{}); 36 | std.process.exit(1); 37 | } 38 | 39 | const file = try std.fs.cwd().openFile(args[1], .{}); 40 | defer file.close(); 41 | 42 | var csv_tokenizer = try csv.CsvTokenizer(std.fs.File.Reader).init(file.reader(), buffer, .{}); 43 | const stdout = std.io.getStdOut().writer(); 44 | 45 | while (try csv_tokenizer.next()) |token| { 46 | switch (token) { 47 | .field => |val| { 48 | try stdout.writeAll(val); 49 | try stdout.writeAll("\t"); 50 | }, 51 | .row_end => { 52 | try stdout.writeAll("\n"); 53 | }, 54 | } 55 | } 
56 | } 57 | ``` 58 | -------------------------------------------------------------------------------- /test/csv_tokenizer.zig: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021 beho 2 | // 3 | // This library is free software; you can redistribute it and/or modify it 4 | // under the terms of the MIT license. See LICENSE for details. 5 | 6 | const std = @import("std"); 7 | const testing = std.testing; 8 | const expect = testing.expect; 9 | const csv_mod = @import("csv"); 10 | 11 | var default_buffer = [_]u8{0} ** 1024; 12 | 13 | fn getTokenizer(file: std.fs.File, buffer: []u8, config: csv_mod.CsvConfig) !csv_mod.CsvTokenizer(std.fs.File.Reader) { 14 | const reader = file.reader(); 15 | const csv = try csv_mod.CsvTokenizer(std.fs.File.Reader).init(reader, buffer, config); 16 | return csv; 17 | } 18 | 19 | fn expectToken(comptime expected: csv_mod.CsvToken, maybe_actual: ?csv_mod.CsvToken) !void { 20 | if (maybe_actual) |actual| { 21 | if (@intFromEnum(expected) != @intFromEnum(actual)) { 22 | std.log.warn("Expected {?} but is {?}\n", .{ expected, actual }); 23 | return error.TestFailed; 24 | } 25 | 26 | switch (expected) { 27 | .field => { 28 | try testing.expectEqualStrings(expected.field, actual.field); 29 | }, 30 | else => {}, 31 | } 32 | } else { 33 | std.log.warn("Expected {?} but is {?}\n", .{ expected, maybe_actual }); 34 | return error.TestFailed; 35 | } 36 | } 37 | 38 | test "Create iterator for file reader" { 39 | const file = try std.fs.cwd().openFile("test/resources/test-1.csv", .{}); 40 | defer file.close(); 41 | 42 | _ = try getTokenizer(file, &default_buffer, .{}); 43 | } 44 | 45 | test "Read single simple record from file" { 46 | const file = try std.fs.cwd().openFile("test/resources/test-1.csv", .{}); 47 | defer file.close(); 48 | var csv = try getTokenizer(file, &default_buffer, .{}); 49 | 50 | try expectToken(csv_mod.CsvToken{ .field = "1" }, try csv.next()); 51 | try 
expectToken(csv_mod.CsvToken{ .field = "abc" }, try csv.next()); 52 | try expectToken(csv_mod.CsvToken{ .row_end = {} }, try csv.next()); 53 | 54 | const next = csv.next() catch unreachable; 55 | 56 | try expect(next == null); 57 | } 58 | 59 | test "Read multiple simple records from file" { 60 | const file = try std.fs.cwd().openFile("test/resources/test-2.csv", .{}); 61 | defer file.close(); 62 | var csv = try getTokenizer(file, &default_buffer, .{}); 63 | 64 | try expectToken(csv_mod.CsvToken{ .field = "1" }, try csv.next()); 65 | try expectToken(csv_mod.CsvToken{ .field = "abc" }, try csv.next()); 66 | try expectToken(csv_mod.CsvToken{ .row_end = {} }, try csv.next()); 67 | 68 | try expectToken(csv_mod.CsvToken{ .field = "2" }, try csv.next()); 69 | try expectToken(csv_mod.CsvToken{ .field = "def ghc" }, try csv.next()); 70 | try expectToken(csv_mod.CsvToken{ .row_end = {} }, try csv.next()); 71 | 72 | const next = csv.next() catch unreachable; 73 | 74 | try expect(next == null); 75 | } 76 | 77 | test "Read quoted fields" { 78 | const file = try std.fs.cwd().openFile("test/resources/test-4.csv", .{}); 79 | defer file.close(); 80 | var csv = try getTokenizer(file, &default_buffer, .{}); 81 | 82 | try expectToken(csv_mod.CsvToken{ .field = "1" }, try csv.next()); 83 | try expectToken(csv_mod.CsvToken{ .field = "def ghc" }, try csv.next()); 84 | try expectToken(csv_mod.CsvToken{ .row_end = {} }, try csv.next()); 85 | 86 | try expectToken(csv_mod.CsvToken{ .field = "2" }, try csv.next()); 87 | try expectToken(csv_mod.CsvToken{ .field = "abc \"def\"" }, try csv.next()); 88 | try expectToken(csv_mod.CsvToken{ .row_end = {} }, try csv.next()); 89 | 90 | const next = csv.next() catch unreachable; 91 | 92 | try expect(next == null); 93 | } 94 | 95 | test "Second read is necessary to obtain field" { 96 | const file = try std.fs.cwd().openFile("test/resources/test-read-required-for-field.csv", .{}); 97 | defer file.close(); 98 | var csv = try getTokenizer(file, 
default_buffer[0..6], .{}); 99 | 100 | try expectToken(csv_mod.CsvToken{ .field = "12345" }, try csv.next()); 101 | try expectToken(csv_mod.CsvToken{ .field = "67890" }, try csv.next()); 102 | try expectToken(csv_mod.CsvToken{ .row_end = {} }, try csv.next()); 103 | 104 | const next = csv.next() catch unreachable; 105 | 106 | try expect(next == null); 107 | } 108 | 109 | test "File is empty" { 110 | const file = try std.fs.cwd().openFile("test/resources/test-empty.csv", .{}); 111 | defer file.close(); 112 | var csv = try getTokenizer(file, &default_buffer, .{}); 113 | 114 | const next = csv.next() catch unreachable; 115 | 116 | try expect(next == null); 117 | } 118 | 119 | test "Field is longer than buffer" { 120 | const file = try std.fs.cwd().openFile("test/resources/test-error-short-buffer.csv", .{}); 121 | defer file.close(); 122 | var csv = try getTokenizer(file, default_buffer[0..9], .{}); 123 | 124 | const next = csv.next(); 125 | try std.testing.expectError(csv_mod.CsvError.ShortBuffer, next); 126 | } 127 | 128 | test "Quoted field is longer than buffer" { 129 | const file = try std.fs.cwd().openFile("test/resources/test-error-short-buffer-quoted.csv", .{}); 130 | defer file.close(); 131 | var csv = try getTokenizer(file, default_buffer[0..10], .{}); 132 | 133 | const next = csv.next(); 134 | try std.testing.expectError(csv_mod.CsvError.ShortBuffer, next); 135 | } 136 | 137 | test "Quoted field with double quotes is longer than buffer" { 138 | const file = try std.fs.cwd().openFile("test/resources/test-error-short-buffer-quoted-with-double.csv", .{}); 139 | defer file.close(); 140 | var csv = try getTokenizer(file, default_buffer[0..11], .{}); 141 | 142 | const next = csv.next(); 143 | try std.testing.expectError(csv_mod.CsvError.ShortBuffer, next); 144 | } 145 | 146 | test "Quoted field with double quotes can be read on retry" { 147 | const file = try std.fs.cwd().openFile("test/resources/test-error-short-buffer-quoted-with-double.csv", .{}); 148 | defer 
file.close(); 149 | var csv = try getTokenizer(file, default_buffer[0..14], .{}); 150 | 151 | try expectToken(csv_mod.CsvToken{ .field = "1234567890\"" }, try csv.next()); 152 | try expectToken(csv_mod.CsvToken{ .row_end = {} }, try csv.next()); 153 | 154 | const next = csv.next() catch unreachable; 155 | 156 | try expect(next == null); 157 | } 158 | 159 | // TODO test last line with new line and without 160 | -------------------------------------------------------------------------------- /flake.lock: -------------------------------------------------------------------------------- 1 | { 2 | "nodes": { 3 | "flake-compat": { 4 | "flake": false, 5 | "locked": { 6 | "lastModified": 1696426674, 7 | "narHash": "sha256-kvjfFW7WAETZlt09AgDn1MrtKzP7t90Vf7vypd3OL1U=", 8 | "owner": "edolstra", 9 | "repo": "flake-compat", 10 | "rev": "0f9255e01c2351cc7d116c072cb317785dd33b33", 11 | "type": "github" 12 | }, 13 | "original": { 14 | "owner": "edolstra", 15 | "repo": "flake-compat", 16 | "type": "github" 17 | } 18 | }, 19 | "flake-compat_2": { 20 | "flake": false, 21 | "locked": { 22 | "lastModified": 1696426674, 23 | "narHash": "sha256-kvjfFW7WAETZlt09AgDn1MrtKzP7t90Vf7vypd3OL1U=", 24 | "owner": "edolstra", 25 | "repo": "flake-compat", 26 | "rev": "0f9255e01c2351cc7d116c072cb317785dd33b33", 27 | "type": "github" 28 | }, 29 | "original": { 30 | "owner": "edolstra", 31 | "repo": "flake-compat", 32 | "type": "github" 33 | } 34 | }, 35 | "flake-utils": { 36 | "inputs": { 37 | "systems": "systems" 38 | }, 39 | "locked": { 40 | "lastModified": 1710146030, 41 | "narHash": "sha256-SZ5L6eA7HJ/nmkzGG7/ISclqe6oZdOZTNoesiInkXPQ=", 42 | "owner": "numtide", 43 | "repo": "flake-utils", 44 | "rev": "b1d9ab70662946ef0850d488da1c9019f3a9752a", 45 | "type": "github" 46 | }, 47 | "original": { 48 | "owner": "numtide", 49 | "repo": "flake-utils", 50 | "type": "github" 51 | } 52 | }, 53 | "flake-utils_2": { 54 | "inputs": { 55 | "systems": "systems_2" 56 | }, 57 | "locked": { 58 | "lastModified": 
1705309234, 59 | "narHash": "sha256-uNRRNRKmJyCRC/8y1RqBkqWBLM034y4qN7EprSdmgyA=", 60 | "owner": "numtide", 61 | "repo": "flake-utils", 62 | "rev": "1ef2e671c3b0c19053962c07dbda38332dcebf26", 63 | "type": "github" 64 | }, 65 | "original": { 66 | "owner": "numtide", 67 | "repo": "flake-utils", 68 | "type": "github" 69 | } 70 | }, 71 | "flake-utils_3": { 72 | "inputs": { 73 | "systems": "systems_3" 74 | }, 75 | "locked": { 76 | "lastModified": 1710146030, 77 | "narHash": "sha256-SZ5L6eA7HJ/nmkzGG7/ISclqe6oZdOZTNoesiInkXPQ=", 78 | "owner": "numtide", 79 | "repo": "flake-utils", 80 | "rev": "b1d9ab70662946ef0850d488da1c9019f3a9752a", 81 | "type": "github" 82 | }, 83 | "original": { 84 | "owner": "numtide", 85 | "repo": "flake-utils", 86 | "type": "github" 87 | } 88 | }, 89 | "flake-utils_4": { 90 | "inputs": { 91 | "systems": "systems_4" 92 | }, 93 | "locked": { 94 | "lastModified": 1705309234, 95 | "narHash": "sha256-uNRRNRKmJyCRC/8y1RqBkqWBLM034y4qN7EprSdmgyA=", 96 | "owner": "numtide", 97 | "repo": "flake-utils", 98 | "rev": "1ef2e671c3b0c19053962c07dbda38332dcebf26", 99 | "type": "github" 100 | }, 101 | "original": { 102 | "owner": "numtide", 103 | "repo": "flake-utils", 104 | "type": "github" 105 | } 106 | }, 107 | "gitignore": { 108 | "inputs": { 109 | "nixpkgs": [ 110 | "zls", 111 | "nixpkgs" 112 | ] 113 | }, 114 | "locked": { 115 | "lastModified": 1709087332, 116 | "narHash": "sha256-HG2cCnktfHsKV0s4XW83gU3F57gaTljL9KNSuG6bnQs=", 117 | "owner": "hercules-ci", 118 | "repo": "gitignore.nix", 119 | "rev": "637db329424fd7e46cf4185293b9cc8c88c95394", 120 | "type": "github" 121 | }, 122 | "original": { 123 | "owner": "hercules-ci", 124 | "repo": "gitignore.nix", 125 | "type": "github" 126 | } 127 | }, 128 | "langref": { 129 | "flake": false, 130 | "locked": { 131 | "narHash": "sha256-O6p2tiKD8ZMhSX+DeA/o5hhAvcPkU2J9lFys/r11peY=", 132 | "type": "file", 133 | "url": 
"https://raw.githubusercontent.com/ziglang/zig/0fb2015fd3422fc1df364995f9782dfe7255eccd/doc/langref.html.in" 134 | }, 135 | "original": { 136 | "type": "file", 137 | "url": "https://raw.githubusercontent.com/ziglang/zig/0fb2015fd3422fc1df364995f9782dfe7255eccd/doc/langref.html.in" 138 | } 139 | }, 140 | "nixpkgs": { 141 | "locked": { 142 | "lastModified": 1710814491, 143 | "narHash": "sha256-LyI5tOn0F7NM3WyZj3ULRp8i44IqarbY6WTxalD0n4A=", 144 | "owner": "nixos", 145 | "repo": "nixpkgs", 146 | "rev": "ff1a94e523ae9fb272e0581f068baee5d1068476", 147 | "type": "github" 148 | }, 149 | "original": { 150 | "owner": "nixos", 151 | "repo": "nixpkgs", 152 | "rev": "ff1a94e523ae9fb272e0581f068baee5d1068476", 153 | "type": "github" 154 | } 155 | }, 156 | "nixpkgs_2": { 157 | "locked": { 158 | "lastModified": 1708161998, 159 | "narHash": "sha256-6KnemmUorCvlcAvGziFosAVkrlWZGIc6UNT9GUYr0jQ=", 160 | "owner": "NixOS", 161 | "repo": "nixpkgs", 162 | "rev": "84d981bae8b5e783b3b548de505b22880559515f", 163 | "type": "github" 164 | }, 165 | "original": { 166 | "owner": "NixOS", 167 | "ref": "nixos-23.11", 168 | "repo": "nixpkgs", 169 | "type": "github" 170 | } 171 | }, 172 | "nixpkgs_3": { 173 | "locked": { 174 | "lastModified": 1717696253, 175 | "narHash": "sha256-1+ua0ggXlYYPLTmMl3YeYYsBXDSCqT+Gw3u6l4gvMhA=", 176 | "owner": "NixOS", 177 | "repo": "nixpkgs", 178 | "rev": "9b5328b7f761a7bbdc0e332ac4cf076a3eedb89b", 179 | "type": "github" 180 | }, 181 | "original": { 182 | "owner": "NixOS", 183 | "ref": "nixos-24.05", 184 | "repo": "nixpkgs", 185 | "type": "github" 186 | } 187 | }, 188 | "root": { 189 | "inputs": { 190 | "flake-utils": "flake-utils", 191 | "nixpkgs": "nixpkgs", 192 | "zig": "zig", 193 | "zls": "zls" 194 | } 195 | }, 196 | "systems": { 197 | "locked": { 198 | "lastModified": 1681028828, 199 | "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", 200 | "owner": "nix-systems", 201 | "repo": "default", 202 | "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", 203 
| "type": "github" 204 | }, 205 | "original": { 206 | "owner": "nix-systems", 207 | "repo": "default", 208 | "type": "github" 209 | } 210 | }, 211 | "systems_2": { 212 | "locked": { 213 | "lastModified": 1681028828, 214 | "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", 215 | "owner": "nix-systems", 216 | "repo": "default", 217 | "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", 218 | "type": "github" 219 | }, 220 | "original": { 221 | "owner": "nix-systems", 222 | "repo": "default", 223 | "type": "github" 224 | } 225 | }, 226 | "systems_3": { 227 | "locked": { 228 | "lastModified": 1681028828, 229 | "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", 230 | "owner": "nix-systems", 231 | "repo": "default", 232 | "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", 233 | "type": "github" 234 | }, 235 | "original": { 236 | "owner": "nix-systems", 237 | "repo": "default", 238 | "type": "github" 239 | } 240 | }, 241 | "systems_4": { 242 | "locked": { 243 | "lastModified": 1681028828, 244 | "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", 245 | "owner": "nix-systems", 246 | "repo": "default", 247 | "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", 248 | "type": "github" 249 | }, 250 | "original": { 251 | "owner": "nix-systems", 252 | "repo": "default", 253 | "type": "github" 254 | } 255 | }, 256 | "zig": { 257 | "inputs": { 258 | "flake-compat": "flake-compat", 259 | "flake-utils": "flake-utils_2", 260 | "nixpkgs": "nixpkgs_2" 261 | }, 262 | "locked": { 263 | "lastModified": 1717848532, 264 | "narHash": "sha256-d+xIUvSTreHl8pAmU1fnmkfDTGQYCn2Rb/zOwByxS2M=", 265 | "owner": "mitchellh", 266 | "repo": "zig-overlay", 267 | "rev": "02fc5cc555fc14fda40c42d7c3250efa43812b43", 268 | "type": "github" 269 | }, 270 | "original": { 271 | "owner": "mitchellh", 272 | "repo": "zig-overlay", 273 | "type": "github" 274 | } 275 | }, 276 | "zig-overlay": { 277 | "inputs": { 278 | "flake-compat": "flake-compat_2", 279 | "flake-utils": 
"flake-utils_4", 280 | "nixpkgs": [ 281 | "zls", 282 | "nixpkgs" 283 | ] 284 | }, 285 | "locked": { 286 | "lastModified": 1717848532, 287 | "narHash": "sha256-d+xIUvSTreHl8pAmU1fnmkfDTGQYCn2Rb/zOwByxS2M=", 288 | "owner": "mitchellh", 289 | "repo": "zig-overlay", 290 | "rev": "02fc5cc555fc14fda40c42d7c3250efa43812b43", 291 | "type": "github" 292 | }, 293 | "original": { 294 | "owner": "mitchellh", 295 | "repo": "zig-overlay", 296 | "type": "github" 297 | } 298 | }, 299 | "zls": { 300 | "inputs": { 301 | "flake-utils": "flake-utils_3", 302 | "gitignore": "gitignore", 303 | "langref": "langref", 304 | "nixpkgs": "nixpkgs_3", 305 | "zig-overlay": "zig-overlay" 306 | }, 307 | "locked": { 308 | "lastModified": 1717891507, 309 | "narHash": "sha256-l/Zo1OwdB3js3wXOpgFLozKwq+bdsPySZtKbEbYBb7U=", 310 | "owner": "zigtools", 311 | "repo": "zls", 312 | "rev": "a26718049a8657d4da04c331aeced1697bc7652b", 313 | "type": "github" 314 | }, 315 | "original": { 316 | "owner": "zigtools", 317 | "ref": "refs/tags/0.13.0", 318 | "repo": "zls", 319 | "type": "github" 320 | } 321 | } 322 | }, 323 | "root": "root", 324 | "version": 7 325 | } 326 | -------------------------------------------------------------------------------- /src/main.zig: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021 beho 2 | // 3 | // This library is free software; you can redistribute it and/or modify it 4 | // under the terms of the MIT license. See LICENSE for details. 
const std = @import("std");
const mem = std.mem;
const print = std.debug.print;
const assert = std.debug.assert;

/// Discriminator for `CsvToken`.
pub const CsvTokenType = enum {
    field,
    row_end,
};

/// A single token produced by `CsvTokenizer`: either one field value or an
/// end-of-row marker. `field` slices point into the caller-provided buffer
/// and are only valid until the next call to `next()`.
pub const CsvToken = union(CsvTokenType) {
    field: []const u8,
    row_end: void,
};

pub const CsvError = error{
    /// The caller-provided buffer is too small to hold a complete field.
    ShortBuffer,
    /// A quote character appeared inside an unquoted field.
    MisplacedQuote,
    /// A quoted field was not followed by a column or row separator.
    NoSeparatorAfterField,
};

/// Parser configuration. Separators and quote are single bytes.
pub const CsvConfig = struct {
    col_sep: u8 = ',',
    row_sep: u8 = '\n',
    quote: u8 = '"',
};

const QuoteFieldReadResult = struct {
    value: []u8,
    /// True when `value` still contains doubled quote characters that must
    /// be collapsed before the field is handed to the caller.
    contains_quotes: bool,
};

/// Buffered scanning layer on top of `Reader`. Maintains a window
/// (`current`) into the caller-provided `buffer` and refills on demand.
fn CsvReader(comptime Reader: type) type {
    return struct {
        buffer: []u8,
        /// Unconsumed portion of `buffer`.
        current: []u8,

        reader: Reader,
        /// Set once the underlying reader reports end of stream.
        all_read: bool = false,

        const Self = @This();

        pub fn init(reader: Reader, buffer: []u8) Self {
            return .{
                .buffer = buffer,
                .current = buffer[0..0],
                .reader = reader,
            };
        }

        inline fn empty(self: *Self) bool {
            return self.current.len == 0;
        }

        /// Consumes and returns the next byte, or null at end of input.
        pub fn char(self: *Self) !?u8 {
            if (!try self.ensureData()) {
                return null;
            }

            const c = self.current[0];
            self.current = self.current[1..];

            return c;
        }

        /// Returns the next byte without consuming it, or null at end of input.
        pub inline fn peek(self: *Self) !?u8 {
            if (!try self.ensureData()) {
                return null;
            }

            return self.current[0];
        }

        /// Returns the slice up to (excluding) the first terminator byte,
        /// consuming it but leaving the terminator itself unconsumed.
        /// Returns null when no terminator is present in the buffered data;
        /// the caller may refill via `read()` and retry.
        pub fn until(self: *Self, terminators: []const u8) !?[]u8 {
            if (!try self.ensureData()) {
                return null;
            }

            for (self.current, 0..) |c, pos| {
                for (terminators) |ct| {
                    if (c == ct) {
                        const s = self.current[0..pos];
                        self.current = self.current[pos..];
                        return s;
                    }
                }
            }

            return null;
        }

        /// Scans a quoted field body up to (excluding) its closing quote,
        /// treating a doubled quote as an escaped quote that stays in the
        /// output. Returns null when the closing quote is not yet in the
        /// buffered data (caller may refill and retry).
        pub fn untilClosingQuote(self: *Self, quote: u8) !?QuoteFieldReadResult {
            if (!try self.ensureData()) {
                return null;
            }

            var idx: usize = 0;
            var contains_quotes: bool = false;
            while (idx < self.current.len) : (idx += 1) {
                const c = self.current[idx];
                if (c == quote) {
                    // Doubled quote is an escape - skip over the second one.
                    // Was hardcoded '"' previously; honor the configured quote.
                    if (idx < self.current.len - 1 and self.current[idx + 1] == quote) {
                        contains_quotes = true;
                        idx += 1;
                    } else {
                        // A quote at the very end of buffered data may be the
                        // first half of a doubled quote - request a refill to
                        // find out (unless input is fully read).
                        if (!self.all_read and idx == self.current.len - 1) {
                            return null;
                        }

                        const s = self.current[0..idx];
                        self.current = self.current[idx..];

                        return QuoteFieldReadResult{ .value = s, .contains_quotes = contains_quotes };
                    }
                }
            }

            return null;
        }

        /// Tries to read more data from the underlying reader if the buffer
        /// is not already full. Unconsumed bytes are shifted to the start of
        /// the buffer first. Returns true if anything was read.
        pub fn read(self: *Self) !bool {
            const current_len = self.current.len;

            if (current_len == self.buffer.len) {
                return false;
            }

            if (current_len > 0) {
                // Overlapping copy towards the start of the buffer.
                mem.copyForwards(u8, self.buffer, self.current);
            }

            const read_len = try self.reader.read(self.buffer[current_len..]);

            self.current = self.buffer[0 .. current_len + read_len];
            self.all_read = read_len == 0;

            return read_len > 0;
        }

        /// Ensures there is at least one unconsumed byte in the buffer.
        /// Returns false when the input is exhausted.
        pub inline fn ensureData(self: *Self) !bool {
            if (!self.empty()) {
                return true;
            }

            if (self.all_read) {
                return false;
            }

            return self.read();
        }
    };
}

/// Tokenizes input from `reader` into a stream of `CsvToken`s.
/// The caller-provided buffer must be longer than the longest field.
pub fn CsvTokenizer(comptime Reader: type) type {
    const Status = enum {
        initial,
        row_start,
        field,
        quoted_field_end,
        row_end,
        eof,
    };

    return struct {
        const Self = @This();

        config: CsvConfig,
        /// Bytes that terminate an unquoted field: col_sep, row_sep, quote.
        terminal_chars: [3]u8 = undefined,

        reader: CsvReader(Reader),

        status: Status = .initial,

        pub fn init(reader: Reader, buffer: []u8, config: CsvConfig) !Self {
            return Self{
                .config = config,
                // Was hardcoded '"' previously; honor a custom `config.quote`.
                .terminal_chars = [_]u8{ config.col_sep, config.row_sep, config.quote },
                .reader = CsvReader(Reader).init(reader, buffer),
            };
        }

        /// Returns the next token, or null once the input is exhausted.
        pub fn next(self: *Self) !?CsvToken {
            var next_status: ?Status = self.status;

            // Cannot use anonymous enum literals for Status
            // https://github.com/ziglang/zig/issues/4255
            while (next_status) |status| {
                next_status = switch (status) {
                    .initial => if (try self.reader.read()) Status.row_start else Status.eof,
                    .row_start => if (!try self.reader.ensureData()) Status.eof else Status.field,
                    .field => blk: {
                        if (!try self.reader.ensureData()) {
                            break :blk .row_end;
                        }

                        return try self.parseField();
                    },
                    .quoted_field_end => blk: {
                        // Consume the closing quote.
                        const quote = try self.reader.char();
                        assert(quote == self.config.quote);

                        if (!try self.reader.ensureData()) {
                            break :blk Status.row_end;
                        }

                        const c = (try self.reader.peek());

                        if (c) |value| {
                            if (value == self.config.col_sep) {
                                const col_sep = (try self.reader.char()).?;
                                assert(col_sep == self.config.col_sep);

                                break :blk Status.field;
                            }

                            if (value == self.config.row_sep) {
                                break :blk Status.row_end;
                            }

                            // A quote here means the field did not fit into the
                            // buffer, so it could not be analyzed as a doubled
                            // quote.
                            if (value == self.config.quote) {
                                return CsvError.ShortBuffer;
                            }
                        } else {
                            break :blk Status.eof;
                        }

                        return CsvError.NoSeparatorAfterField;
                    },
                    .row_end => {
                        if (!try self.reader.ensureData()) {
                            self.status = Status.eof;
                            return CsvToken{ .row_end = {} };
                        }

                        const rowSep = try self.reader.char();
                        assert(rowSep == self.config.row_sep);

                        self.status = Status.row_start;

                        return CsvToken{ .row_end = {} };
                    },
                    .eof => {
                        return null;
                    },
                };

                // Make the transition and ensure next_status is set here.
                self.status = next_status.?;
            }

            unreachable;
        }

        /// Parses one field (quoted or unquoted) starting at the current
        /// buffer position. Sets `status` for the follow-up state.
        fn parseField(self: *Self) !CsvToken {
            const first = (try self.reader.peek()).?;

            // Was hardcoded '"' previously; honor a custom `config.quote`.
            if (first != self.config.quote) {
                var field = try self.reader.until(&self.terminal_chars);
                if (field == null) {
                    // Force a read - maybe the separator was not read yet.
                    const hasData = try self.reader.read();
                    if (!hasData) {
                        return CsvError.ShortBuffer;
                    }

                    field = try self.reader.until(&self.terminal_chars);
                    if (field == null) {
                        return CsvError.ShortBuffer;
                    }
                }

                const terminator = (try self.reader.peek()).?;

                if (terminator == self.config.col_sep) {
                    _ = try self.reader.char();
                    return CsvToken{ .field = field.? };
                }

                if (terminator == self.config.row_sep) {
                    self.status = .row_end;
                    return CsvToken{ .field = field.? };
                }

                if (terminator == self.config.quote) {
                    return CsvError.MisplacedQuote;
                }

                return CsvError.ShortBuffer;
            } else {
                // Consume the opening quote.
                _ = try self.reader.char();
                var quoted_field = try self.reader.untilClosingQuote(self.config.quote);
                if (quoted_field == null) {
                    // Force a read - maybe the closing quote was not read yet.
                    const hasData = try self.reader.read();
                    if (!hasData) {
                        return CsvError.ShortBuffer;
                    }

                    // This read will fill the buffer.
                    quoted_field = try self.reader.untilClosingQuote(self.config.quote);
                    if (quoted_field == null) {
                        return CsvError.ShortBuffer;
                    }
                }

                self.status = .quoted_field_end;

                const field = quoted_field.?;
                if (!field.contains_quotes) {
                    return CsvToken{ .field = field.value };
                } else {
                    // Collapse doubled quotes in place by shifting bytes left.
                    const value = field.value;
                    var diff: u64 = 0;
                    var idx: usize = 0;
                    while (idx < value.len) : (idx += 1) {
                        const c = value[idx];
                        value[idx - diff] = c;

                        if (c == self.config.quote) {
                            diff += 1;
                            idx += 1;
                        }
                    }

                    return CsvToken{ .field = value[0 .. value.len - diff] };
                }
            }
        }
    };
}