├── test ├── resources │ ├── test-empty.csv │ ├── test-1.csv │ ├── test-2.csv │ ├── test-error-short-buffer.csv │ ├── test-4.csv │ ├── test-read-required-for-field.csv │ ├── test-error-short-buffer-quoted.csv │ └── test-error-short-buffer-quoted-with-double.csv └── csv_tokenizer.zig ├── zig.mod ├── .gitignore ├── .envrc ├── docs └── performance.md ├── .github └── workflows │ └── ci.yml ├── LICENSE ├── flake.nix ├── README.md ├── flake.lock └── src └── main.zig /test/resources/test-empty.csv: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/resources/test-1.csv: -------------------------------------------------------------------------------- 1 | 1,abc 2 | -------------------------------------------------------------------------------- /test/resources/test-2.csv: -------------------------------------------------------------------------------- 1 | 1,abc 2 | 2,def ghc 3 | -------------------------------------------------------------------------------- /test/resources/test-error-short-buffer.csv: -------------------------------------------------------------------------------- 1 | 1234567890 2 | -------------------------------------------------------------------------------- /test/resources/test-4.csv: -------------------------------------------------------------------------------- 1 | 1,"def ghc" 2 | 2,"abc ""def""" 3 | -------------------------------------------------------------------------------- /test/resources/test-read-required-for-field.csv: -------------------------------------------------------------------------------- 1 | 12345,67890 2 | -------------------------------------------------------------------------------- /test/resources/test-error-short-buffer-quoted.csv: -------------------------------------------------------------------------------- 1 | "1234567890" 2 | 
-------------------------------------------------------------------------------- /test/resources/test-error-short-buffer-quoted-with-double.csv: -------------------------------------------------------------------------------- 1 | "1234567890""" 2 | -------------------------------------------------------------------------------- /zig.mod: -------------------------------------------------------------------------------- 1 | id: m2fmjf9o3txtpyqs5i3108poalp4vjy369irlh9upgvm7w6y 2 | name: csv 3 | main: src/main.zig 4 | dependencies: 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | zig-cache/ 2 | zig-out/ 3 | /release/ 4 | /debug/ 5 | /build/ 6 | /build-*/ 7 | /docgen_tmp/ 8 | /.direnv/ 9 | /.zig-cache/ 10 | -------------------------------------------------------------------------------- /.envrc: -------------------------------------------------------------------------------- 1 | # If we are a computer with nix available, then use that to setup 2 | # the build environment with exactly what we need. 3 | if has nix; then 4 | use flake 5 | else 6 | echo "nix not found" 7 | fi 8 | -------------------------------------------------------------------------------- /docs/performance.md: -------------------------------------------------------------------------------- 1 | # Performance 2 | 3 | Following command can be used to generate some regular testing data: 4 | 5 | ```bash 6 | ruby -e '(1..100000).each {|i| puts sprintf("%d,abcdefghijkl,%010d,mnopqrs,%020d,tuvwx,\"...\"\"z\"\"...\"", i, i, i)}' 7 | ``` -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | # This is a basic workflow to help you get started with Actions 2 | 3 | name: CI 4 | 5 | # Controls when the action will run. 
6 | on: 7 | # Triggers the workflow on push or pull request events but only for the main branch 8 | push: 9 | branches: [ main ] 10 | pull_request: 11 | branches: [ main ] 12 | 13 | # Allows you to run this workflow manually from the Actions tab 14 | workflow_dispatch: 15 | 16 | env: 17 | zig_version: 0.13.0 18 | 19 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 20 | jobs: 21 | test: 22 | runs-on: ubuntu-latest 23 | steps: 24 | - uses: actions/checkout@v3 25 | - uses: goto-bus-stop/setup-zig@v2 26 | with: 27 | version: ${{ env.zig_version }} 28 | - run: zig build test 29 | lint: 30 | runs-on: ubuntu-latest 31 | steps: 32 | - uses: actions/checkout@v3 33 | - uses: goto-bus-stop/setup-zig@v2 34 | with: 35 | version: ${{ env.zig_version }} 36 | - run: zig fmt --check . 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 @_beho 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /flake.nix: -------------------------------------------------------------------------------- 1 | { 2 | description = "zig-csv"; 3 | # based on https://github.com/mitchellh/zig-overlay/blob/2c86c36e7fe65faac08bdf85d041cf7b798f8ee8/templates/init/flake.nix 4 | 5 | inputs = { 6 | # zls at 0.11.0 7 | nixpkgs.url = "github:nixos/nixpkgs/ff1a94e523ae9fb272e0581f068baee5d1068476"; 8 | flake-utils.url = "github:numtide/flake-utils"; 9 | zig.url = "github:mitchellh/zig-overlay"; 10 | zls.url = "github:zigtools/zls?ref=refs/tags/0.13.0"; 11 | }; 12 | 13 | outputs = { 14 | self, 15 | nixpkgs, 16 | flake-utils, 17 | ... 18 | } @ inputs : 19 | let 20 | overlays = [ 21 | (final: prev: { 22 | zigpkgs = inputs.zig.packages.${prev.system}; 23 | zlspkgs = inputs.zls.packages.${prev.system}; 24 | }) 25 | ]; 26 | # Our supported systems are the same supported systems as the Zig binaries 27 | systems = builtins.attrNames inputs.zig.packages; 28 | in 29 | flake-utils.lib.eachSystem systems ( 30 | system: let 31 | pkgs = import nixpkgs { inherit overlays system; }; 32 | in rec { 33 | devShells.default = pkgs.mkShell { 34 | nativeBuildInputs = [ 35 | pkgs.zigpkgs."0.13.0" 36 | ]; 37 | buildInputs = [ 38 | pkgs.zlspkgs.zls 39 | ]; 40 | }; 41 | 42 | # For compatibility with older versions of the `nix` binary 43 | devShell = self.devShells.${system}.default; 44 | } 45 | ); 46 | } 47 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # zig-csv ![CI](https://github.com/beho/zig-csv/workflows/CI/badge.svg) 2 | 
3 | Low-level CSV parser library for [Zig language](https://github.com/ziglang/zig). Each non-empty line in input is parsed as one or more tokens of type `field`, followed by `row_end`. 4 | 5 | _This library was conceived as Zig learning project and it was not used by me in production software._ 6 | 7 | ## Features 8 | 9 | - Reads UTF-8 files. 10 | - Provides iterator interface to stream of tokens. 11 | - Handles quoted fields in which column/row separator can be used. Quote itself can be used in field by doubling it (e.g. `"This is quote: ""."`) 12 | - Configurable column separator (default `,`), row separator (`\n`) and quote (`"`). 13 | - **Currently only single byte characters.** 14 | - Parser does not allocate – caller provides a buffer that parser operates in. **Buffer must be longer than a longest field in input.** 15 | 16 | ## Example 17 | 18 | Following code reads CSV tokens from a file while very naively printing them as table to standard output. 19 | 20 | ```zig 21 | const std = @import("std"); 22 | const csv = @import("csv"); 23 | 24 | pub fn main() anyerror!void { 25 | var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); 26 | defer arena.deinit(); 27 | 28 | const allocator = &arena.allocator; 29 | var buffer = try allocator.alloc(u8, 4096); 30 | 31 | const args = try std.process.argsAlloc(allocator); 32 | defer std.process.argsFree(allocator, args); 33 | 34 | if (args.len != 2) { 35 | std.log.warn("Single arg is expected", .{}); 36 | std.process.exit(1); 37 | } 38 | 39 | const file = try std.fs.cwd().openFile(args[1], .{}); 40 | defer file.close(); 41 | 42 | var csv_tokenizer = try csv.CsvTokenizer(std.fs.File.Reader).init(file.reader(), buffer, .{}); 43 | const stdout = std.io.getStdOut().writer(); 44 | 45 | while (try csv_tokenizer.next()) |token| { 46 | switch (token) { 47 | .field => |val| { 48 | try stdout.writeAll(val); 49 | try stdout.writeAll("\t"); 50 | }, 51 | .row_end => { 52 | try stdout.writeAll("\n"); 53 | }, 54 | } 55 | } 
56 | } 57 | ``` 58 | -------------------------------------------------------------------------------- /test/csv_tokenizer.zig: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021 beho 2 | // 3 | // This library is free software; you can redistribute it and/or modify it 4 | // under the terms of the MIT license. See LICENSE for details. 5 | 6 | const std = @import("std"); 7 | const testing = std.testing; 8 | const expect = testing.expect; 9 | const csv_mod = @import("csv"); 10 | 11 | var default_buffer = [_]u8{0} ** 1024; 12 | 13 | fn getTokenizer(file: std.fs.File, buffer: []u8, config: csv_mod.CsvConfig) !csv_mod.CsvTokenizer(std.fs.File.Reader) { 14 | const reader = file.reader(); 15 | const csv = try csv_mod.CsvTokenizer(std.fs.File.Reader).init(reader, buffer, config); 16 | return csv; 17 | } 18 | 19 | fn expectToken(comptime expected: csv_mod.CsvToken, maybe_actual: ?csv_mod.CsvToken) !void { 20 | if (maybe_actual) |actual| { 21 | if (@intFromEnum(expected) != @intFromEnum(actual)) { 22 | std.log.warn("Expected {?} but is {?}\n", .{ expected, actual }); 23 | return error.TestFailed; 24 | } 25 | 26 | switch (expected) { 27 | .field => { 28 | try testing.expectEqualStrings(expected.field, actual.field); 29 | }, 30 | else => {}, 31 | } 32 | } else { 33 | std.log.warn("Expected {?} but is {?}\n", .{ expected, maybe_actual }); 34 | return error.TestFailed; 35 | } 36 | } 37 | 38 | test "Create iterator for file reader" { 39 | const file = try std.fs.cwd().openFile("test/resources/test-1.csv", .{}); 40 | defer file.close(); 41 | 42 | _ = try getTokenizer(file, &default_buffer, .{}); 43 | } 44 | 45 | test "Read single simple record from file" { 46 | const file = try std.fs.cwd().openFile("test/resources/test-1.csv", .{}); 47 | defer file.close(); 48 | var csv = try getTokenizer(file, &default_buffer, .{}); 49 | 50 | try expectToken(csv_mod.CsvToken{ .field = "1" }, try csv.next()); 51 | try 
expectToken(csv_mod.CsvToken{ .field = "abc" }, try csv.next()); 52 | try expectToken(csv_mod.CsvToken{ .row_end = {} }, try csv.next()); 53 | 54 | const next = csv.next() catch unreachable; 55 | 56 | try expect(next == null); 57 | } 58 | 59 | test "Read multiple simple records from file" { 60 | const file = try std.fs.cwd().openFile("test/resources/test-2.csv", .{}); 61 | defer file.close(); 62 | var csv = try getTokenizer(file, &default_buffer, .{}); 63 | 64 | try expectToken(csv_mod.CsvToken{ .field = "1" }, try csv.next()); 65 | try expectToken(csv_mod.CsvToken{ .field = "abc" }, try csv.next()); 66 | try expectToken(csv_mod.CsvToken{ .row_end = {} }, try csv.next()); 67 | 68 | try expectToken(csv_mod.CsvToken{ .field = "2" }, try csv.next()); 69 | try expectToken(csv_mod.CsvToken{ .field = "def ghc" }, try csv.next()); 70 | try expectToken(csv_mod.CsvToken{ .row_end = {} }, try csv.next()); 71 | 72 | const next = csv.next() catch unreachable; 73 | 74 | try expect(next == null); 75 | } 76 | 77 | test "Read quoted fields" { 78 | const file = try std.fs.cwd().openFile("test/resources/test-4.csv", .{}); 79 | defer file.close(); 80 | var csv = try getTokenizer(file, &default_buffer, .{}); 81 | 82 | try expectToken(csv_mod.CsvToken{ .field = "1" }, try csv.next()); 83 | try expectToken(csv_mod.CsvToken{ .field = "def ghc" }, try csv.next()); 84 | try expectToken(csv_mod.CsvToken{ .row_end = {} }, try csv.next()); 85 | 86 | try expectToken(csv_mod.CsvToken{ .field = "2" }, try csv.next()); 87 | try expectToken(csv_mod.CsvToken{ .field = "abc \"def\"" }, try csv.next()); 88 | try expectToken(csv_mod.CsvToken{ .row_end = {} }, try csv.next()); 89 | 90 | const next = csv.next() catch unreachable; 91 | 92 | try expect(next == null); 93 | } 94 | 95 | test "Second read is necessary to obtain field" { 96 | const file = try std.fs.cwd().openFile("test/resources/test-read-required-for-field.csv", .{}); 97 | defer file.close(); 98 | var csv = try getTokenizer(file, 
default_buffer[0..6], .{}); 99 | 100 | try expectToken(csv_mod.CsvToken{ .field = "12345" }, try csv.next()); 101 | try expectToken(csv_mod.CsvToken{ .field = "67890" }, try csv.next()); 102 | try expectToken(csv_mod.CsvToken{ .row_end = {} }, try csv.next()); 103 | 104 | const next = csv.next() catch unreachable; 105 | 106 | try expect(next == null); 107 | } 108 | 109 | test "File is empty" { 110 | const file = try std.fs.cwd().openFile("test/resources/test-empty.csv", .{}); 111 | defer file.close(); 112 | var csv = try getTokenizer(file, &default_buffer, .{}); 113 | 114 | const next = csv.next() catch unreachable; 115 | 116 | try expect(next == null); 117 | } 118 | 119 | test "Field is longer than buffer" { 120 | const file = try std.fs.cwd().openFile("test/resources/test-error-short-buffer.csv", .{}); 121 | defer file.close(); 122 | var csv = try getTokenizer(file, default_buffer[0..9], .{}); 123 | 124 | const next = csv.next(); 125 | try std.testing.expectError(csv_mod.CsvError.ShortBuffer, next); 126 | } 127 | 128 | test "Quoted field is longer than buffer" { 129 | const file = try std.fs.cwd().openFile("test/resources/test-error-short-buffer-quoted.csv", .{}); 130 | defer file.close(); 131 | var csv = try getTokenizer(file, default_buffer[0..10], .{}); 132 | 133 | const next = csv.next(); 134 | try std.testing.expectError(csv_mod.CsvError.ShortBuffer, next); 135 | } 136 | 137 | test "Quoted field with double quotes is longer than buffer" { 138 | const file = try std.fs.cwd().openFile("test/resources/test-error-short-buffer-quoted-with-double.csv", .{}); 139 | defer file.close(); 140 | var csv = try getTokenizer(file, default_buffer[0..11], .{}); 141 | 142 | const next = csv.next(); 143 | try std.testing.expectError(csv_mod.CsvError.ShortBuffer, next); 144 | } 145 | 146 | test "Quoted field with double quotes can be read on retry" { 147 | const file = try std.fs.cwd().openFile("test/resources/test-error-short-buffer-quoted-with-double.csv", .{}); 148 | defer 
file.close(); 149 | var csv = try getTokenizer(file, default_buffer[0..14], .{}); 150 | 151 | try expectToken(csv_mod.CsvToken{ .field = "1234567890\"" }, try csv.next()); 152 | try expectToken(csv_mod.CsvToken{ .row_end = {} }, try csv.next()); 153 | 154 | const next = csv.next() catch unreachable; 155 | 156 | try expect(next == null); 157 | } 158 | 159 | // TODO test last line with new line and without 160 | -------------------------------------------------------------------------------- /flake.lock: -------------------------------------------------------------------------------- 1 | { 2 | "nodes": { 3 | "flake-compat": { 4 | "flake": false, 5 | "locked": { 6 | "lastModified": 1696426674, 7 | "narHash": "sha256-kvjfFW7WAETZlt09AgDn1MrtKzP7t90Vf7vypd3OL1U=", 8 | "owner": "edolstra", 9 | "repo": "flake-compat", 10 | "rev": "0f9255e01c2351cc7d116c072cb317785dd33b33", 11 | "type": "github" 12 | }, 13 | "original": { 14 | "owner": "edolstra", 15 | "repo": "flake-compat", 16 | "type": "github" 17 | } 18 | }, 19 | "flake-compat_2": { 20 | "flake": false, 21 | "locked": { 22 | "lastModified": 1696426674, 23 | "narHash": "sha256-kvjfFW7WAETZlt09AgDn1MrtKzP7t90Vf7vypd3OL1U=", 24 | "owner": "edolstra", 25 | "repo": "flake-compat", 26 | "rev": "0f9255e01c2351cc7d116c072cb317785dd33b33", 27 | "type": "github" 28 | }, 29 | "original": { 30 | "owner": "edolstra", 31 | "repo": "flake-compat", 32 | "type": "github" 33 | } 34 | }, 35 | "flake-utils": { 36 | "inputs": { 37 | "systems": "systems" 38 | }, 39 | "locked": { 40 | "lastModified": 1710146030, 41 | "narHash": "sha256-SZ5L6eA7HJ/nmkzGG7/ISclqe6oZdOZTNoesiInkXPQ=", 42 | "owner": "numtide", 43 | "repo": "flake-utils", 44 | "rev": "b1d9ab70662946ef0850d488da1c9019f3a9752a", 45 | "type": "github" 46 | }, 47 | "original": { 48 | "owner": "numtide", 49 | "repo": "flake-utils", 50 | "type": "github" 51 | } 52 | }, 53 | "flake-utils_2": { 54 | "inputs": { 55 | "systems": "systems_2" 56 | }, 57 | "locked": { 58 | "lastModified": 
1705309234, 59 | "narHash": "sha256-uNRRNRKmJyCRC/8y1RqBkqWBLM034y4qN7EprSdmgyA=", 60 | "owner": "numtide", 61 | "repo": "flake-utils", 62 | "rev": "1ef2e671c3b0c19053962c07dbda38332dcebf26", 63 | "type": "github" 64 | }, 65 | "original": { 66 | "owner": "numtide", 67 | "repo": "flake-utils", 68 | "type": "github" 69 | } 70 | }, 71 | "flake-utils_3": { 72 | "inputs": { 73 | "systems": "systems_3" 74 | }, 75 | "locked": { 76 | "lastModified": 1710146030, 77 | "narHash": "sha256-SZ5L6eA7HJ/nmkzGG7/ISclqe6oZdOZTNoesiInkXPQ=", 78 | "owner": "numtide", 79 | "repo": "flake-utils", 80 | "rev": "b1d9ab70662946ef0850d488da1c9019f3a9752a", 81 | "type": "github" 82 | }, 83 | "original": { 84 | "owner": "numtide", 85 | "repo": "flake-utils", 86 | "type": "github" 87 | } 88 | }, 89 | "flake-utils_4": { 90 | "inputs": { 91 | "systems": "systems_4" 92 | }, 93 | "locked": { 94 | "lastModified": 1705309234, 95 | "narHash": "sha256-uNRRNRKmJyCRC/8y1RqBkqWBLM034y4qN7EprSdmgyA=", 96 | "owner": "numtide", 97 | "repo": "flake-utils", 98 | "rev": "1ef2e671c3b0c19053962c07dbda38332dcebf26", 99 | "type": "github" 100 | }, 101 | "original": { 102 | "owner": "numtide", 103 | "repo": "flake-utils", 104 | "type": "github" 105 | } 106 | }, 107 | "gitignore": { 108 | "inputs": { 109 | "nixpkgs": [ 110 | "zls", 111 | "nixpkgs" 112 | ] 113 | }, 114 | "locked": { 115 | "lastModified": 1709087332, 116 | "narHash": "sha256-HG2cCnktfHsKV0s4XW83gU3F57gaTljL9KNSuG6bnQs=", 117 | "owner": "hercules-ci", 118 | "repo": "gitignore.nix", 119 | "rev": "637db329424fd7e46cf4185293b9cc8c88c95394", 120 | "type": "github" 121 | }, 122 | "original": { 123 | "owner": "hercules-ci", 124 | "repo": "gitignore.nix", 125 | "type": "github" 126 | } 127 | }, 128 | "langref": { 129 | "flake": false, 130 | "locked": { 131 | "narHash": "sha256-O6p2tiKD8ZMhSX+DeA/o5hhAvcPkU2J9lFys/r11peY=", 132 | "type": "file", 133 | "url": 
"https://raw.githubusercontent.com/ziglang/zig/0fb2015fd3422fc1df364995f9782dfe7255eccd/doc/langref.html.in" 134 | }, 135 | "original": { 136 | "type": "file", 137 | "url": "https://raw.githubusercontent.com/ziglang/zig/0fb2015fd3422fc1df364995f9782dfe7255eccd/doc/langref.html.in" 138 | } 139 | }, 140 | "nixpkgs": { 141 | "locked": { 142 | "lastModified": 1710814491, 143 | "narHash": "sha256-LyI5tOn0F7NM3WyZj3ULRp8i44IqarbY6WTxalD0n4A=", 144 | "owner": "nixos", 145 | "repo": "nixpkgs", 146 | "rev": "ff1a94e523ae9fb272e0581f068baee5d1068476", 147 | "type": "github" 148 | }, 149 | "original": { 150 | "owner": "nixos", 151 | "repo": "nixpkgs", 152 | "rev": "ff1a94e523ae9fb272e0581f068baee5d1068476", 153 | "type": "github" 154 | } 155 | }, 156 | "nixpkgs_2": { 157 | "locked": { 158 | "lastModified": 1708161998, 159 | "narHash": "sha256-6KnemmUorCvlcAvGziFosAVkrlWZGIc6UNT9GUYr0jQ=", 160 | "owner": "NixOS", 161 | "repo": "nixpkgs", 162 | "rev": "84d981bae8b5e783b3b548de505b22880559515f", 163 | "type": "github" 164 | }, 165 | "original": { 166 | "owner": "NixOS", 167 | "ref": "nixos-23.11", 168 | "repo": "nixpkgs", 169 | "type": "github" 170 | } 171 | }, 172 | "nixpkgs_3": { 173 | "locked": { 174 | "lastModified": 1717696253, 175 | "narHash": "sha256-1+ua0ggXlYYPLTmMl3YeYYsBXDSCqT+Gw3u6l4gvMhA=", 176 | "owner": "NixOS", 177 | "repo": "nixpkgs", 178 | "rev": "9b5328b7f761a7bbdc0e332ac4cf076a3eedb89b", 179 | "type": "github" 180 | }, 181 | "original": { 182 | "owner": "NixOS", 183 | "ref": "nixos-24.05", 184 | "repo": "nixpkgs", 185 | "type": "github" 186 | } 187 | }, 188 | "root": { 189 | "inputs": { 190 | "flake-utils": "flake-utils", 191 | "nixpkgs": "nixpkgs", 192 | "zig": "zig", 193 | "zls": "zls" 194 | } 195 | }, 196 | "systems": { 197 | "locked": { 198 | "lastModified": 1681028828, 199 | "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", 200 | "owner": "nix-systems", 201 | "repo": "default", 202 | "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", 203 
| "type": "github" 204 | }, 205 | "original": { 206 | "owner": "nix-systems", 207 | "repo": "default", 208 | "type": "github" 209 | } 210 | }, 211 | "systems_2": { 212 | "locked": { 213 | "lastModified": 1681028828, 214 | "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", 215 | "owner": "nix-systems", 216 | "repo": "default", 217 | "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", 218 | "type": "github" 219 | }, 220 | "original": { 221 | "owner": "nix-systems", 222 | "repo": "default", 223 | "type": "github" 224 | } 225 | }, 226 | "systems_3": { 227 | "locked": { 228 | "lastModified": 1681028828, 229 | "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", 230 | "owner": "nix-systems", 231 | "repo": "default", 232 | "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", 233 | "type": "github" 234 | }, 235 | "original": { 236 | "owner": "nix-systems", 237 | "repo": "default", 238 | "type": "github" 239 | } 240 | }, 241 | "systems_4": { 242 | "locked": { 243 | "lastModified": 1681028828, 244 | "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", 245 | "owner": "nix-systems", 246 | "repo": "default", 247 | "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", 248 | "type": "github" 249 | }, 250 | "original": { 251 | "owner": "nix-systems", 252 | "repo": "default", 253 | "type": "github" 254 | } 255 | }, 256 | "zig": { 257 | "inputs": { 258 | "flake-compat": "flake-compat", 259 | "flake-utils": "flake-utils_2", 260 | "nixpkgs": "nixpkgs_2" 261 | }, 262 | "locked": { 263 | "lastModified": 1717848532, 264 | "narHash": "sha256-d+xIUvSTreHl8pAmU1fnmkfDTGQYCn2Rb/zOwByxS2M=", 265 | "owner": "mitchellh", 266 | "repo": "zig-overlay", 267 | "rev": "02fc5cc555fc14fda40c42d7c3250efa43812b43", 268 | "type": "github" 269 | }, 270 | "original": { 271 | "owner": "mitchellh", 272 | "repo": "zig-overlay", 273 | "type": "github" 274 | } 275 | }, 276 | "zig-overlay": { 277 | "inputs": { 278 | "flake-compat": "flake-compat_2", 279 | "flake-utils": 
"flake-utils_4", 280 | "nixpkgs": [ 281 | "zls", 282 | "nixpkgs" 283 | ] 284 | }, 285 | "locked": { 286 | "lastModified": 1717848532, 287 | "narHash": "sha256-d+xIUvSTreHl8pAmU1fnmkfDTGQYCn2Rb/zOwByxS2M=", 288 | "owner": "mitchellh", 289 | "repo": "zig-overlay", 290 | "rev": "02fc5cc555fc14fda40c42d7c3250efa43812b43", 291 | "type": "github" 292 | }, 293 | "original": { 294 | "owner": "mitchellh", 295 | "repo": "zig-overlay", 296 | "type": "github" 297 | } 298 | }, 299 | "zls": { 300 | "inputs": { 301 | "flake-utils": "flake-utils_3", 302 | "gitignore": "gitignore", 303 | "langref": "langref", 304 | "nixpkgs": "nixpkgs_3", 305 | "zig-overlay": "zig-overlay" 306 | }, 307 | "locked": { 308 | "lastModified": 1717891507, 309 | "narHash": "sha256-l/Zo1OwdB3js3wXOpgFLozKwq+bdsPySZtKbEbYBb7U=", 310 | "owner": "zigtools", 311 | "repo": "zls", 312 | "rev": "a26718049a8657d4da04c331aeced1697bc7652b", 313 | "type": "github" 314 | }, 315 | "original": { 316 | "owner": "zigtools", 317 | "ref": "refs/tags/0.13.0", 318 | "repo": "zls", 319 | "type": "github" 320 | } 321 | } 322 | }, 323 | "root": "root", 324 | "version": 7 325 | } 326 | -------------------------------------------------------------------------------- /src/main.zig: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021 beho 2 | // 3 | // This library is free software; you can redistribute it and/or modify it 4 | // under the terms of the MIT license. See LICENSE for details. 
const std = @import("std");
const mem = std.mem;
const print = std.debug.print;
const assert = std.debug.assert;

/// Discriminator for `CsvToken`.
pub const CsvTokenType = enum {
    field,
    row_end,
};

/// A single token produced by `CsvTokenizer`: either one field value or an
/// end-of-row marker. `field` slices point into the caller-provided buffer
/// and are only valid until the next call to `next()`.
pub const CsvToken = union(CsvTokenType) {
    field: []const u8,
    row_end: void,
};

pub const CsvError = error{
    /// The caller-provided buffer is too small to hold a complete field.
    ShortBuffer,
    /// A quote character appeared inside an unquoted field.
    MisplacedQuote,
    /// A quoted field was not followed by a column or row separator.
    NoSeparatorAfterField,
};

/// Parser configuration. Separators and quote are single bytes.
pub const CsvConfig = struct {
    col_sep: u8 = ',',
    row_sep: u8 = '\n',
    quote: u8 = '"',
};

const QuoteFieldReadResult = struct {
    value: []u8,
    /// True when `value` still contains doubled quote characters that must
    /// be collapsed before the field is handed to the caller.
    contains_quotes: bool,
};

/// Buffered scanning layer on top of `Reader`. Maintains a window
/// (`current`) into the caller-provided `buffer` and refills on demand.
fn CsvReader(comptime Reader: type) type {
    return struct {
        buffer: []u8,
        /// Unconsumed portion of `buffer`.
        current: []u8,

        reader: Reader,
        /// Set once the underlying reader reports end of stream.
        all_read: bool = false,

        const Self = @This();

        pub fn init(reader: Reader, buffer: []u8) Self {
            return .{
                .buffer = buffer,
                .current = buffer[0..0],
                .reader = reader,
            };
        }

        inline fn empty(self: *Self) bool {
            return self.current.len == 0;
        }

        /// Consumes and returns the next byte, or null at end of input.
        pub fn char(self: *Self) !?u8 {
            if (!try self.ensureData()) {
                return null;
            }

            const c = self.current[0];
            self.current = self.current[1..];

            return c;
        }

        /// Returns the next byte without consuming it, or null at end of input.
        pub inline fn peek(self: *Self) !?u8 {
            if (!try self.ensureData()) {
                return null;
            }

            return self.current[0];
        }

        /// Returns the slice up to (excluding) the first terminator byte,
        /// consuming it but leaving the terminator itself unconsumed.
        /// Returns null when no terminator is present in the buffered data;
        /// the caller may refill via `read()` and retry.
        pub fn until(self: *Self, terminators: []const u8) !?[]u8 {
            if (!try self.ensureData()) {
                return null;
            }

            for (self.current, 0..) |c, pos| {
                for (terminators) |ct| {
                    if (c == ct) {
                        const s = self.current[0..pos];
                        self.current = self.current[pos..];
                        return s;
                    }
                }
            }

            return null;
        }

        /// Scans a quoted field body up to (excluding) its closing quote,
        /// treating a doubled quote as an escaped quote that stays in the
        /// output. Returns null when the closing quote is not yet in the
        /// buffered data (caller may refill and retry).
        pub fn untilClosingQuote(self: *Self, quote: u8) !?QuoteFieldReadResult {
            if (!try self.ensureData()) {
                return null;
            }

            var idx: usize = 0;
            var contains_quotes: bool = false;
            while (idx < self.current.len) : (idx += 1) {
                const c = self.current[idx];
                if (c == quote) {
                    // Doubled quote is an escape - skip over the second one.
                    // Was hardcoded '"' previously; honor the configured quote.
                    if (idx < self.current.len - 1 and self.current[idx + 1] == quote) {
                        contains_quotes = true;
                        idx += 1;
                    } else {
                        // A quote at the very end of buffered data may be the
                        // first half of a doubled quote - request a refill to
                        // find out (unless input is fully read).
                        if (!self.all_read and idx == self.current.len - 1) {
                            return null;
                        }

                        const s = self.current[0..idx];
                        self.current = self.current[idx..];

                        return QuoteFieldReadResult{ .value = s, .contains_quotes = contains_quotes };
                    }
                }
            }

            return null;
        }

        /// Tries to read more data from the underlying reader if the buffer
        /// is not already full. Unconsumed bytes are shifted to the start of
        /// the buffer first. Returns true if anything was read.
        pub fn read(self: *Self) !bool {
            const current_len = self.current.len;

            if (current_len == self.buffer.len) {
                return false;
            }

            if (current_len > 0) {
                // Overlapping copy towards the start of the buffer.
                mem.copyForwards(u8, self.buffer, self.current);
            }

            const read_len = try self.reader.read(self.buffer[current_len..]);

            self.current = self.buffer[0 .. current_len + read_len];
            self.all_read = read_len == 0;

            return read_len > 0;
        }

        /// Ensures there is at least one unconsumed byte in the buffer.
        /// Returns false when the input is exhausted.
        pub inline fn ensureData(self: *Self) !bool {
            if (!self.empty()) {
                return true;
            }

            if (self.all_read) {
                return false;
            }

            return self.read();
        }
    };
}

/// Tokenizes input from `reader` into a stream of `CsvToken`s.
/// The caller-provided buffer must be longer than the longest field.
pub fn CsvTokenizer(comptime Reader: type) type {
    const Status = enum {
        initial,
        row_start,
        field,
        quoted_field_end,
        row_end,
        eof,
    };

    return struct {
        const Self = @This();

        config: CsvConfig,
        /// Bytes that terminate an unquoted field: col_sep, row_sep, quote.
        terminal_chars: [3]u8 = undefined,

        reader: CsvReader(Reader),

        status: Status = .initial,

        pub fn init(reader: Reader, buffer: []u8, config: CsvConfig) !Self {
            return Self{
                .config = config,
                // Was hardcoded '"' previously; honor a custom `config.quote`.
                .terminal_chars = [_]u8{ config.col_sep, config.row_sep, config.quote },
                .reader = CsvReader(Reader).init(reader, buffer),
            };
        }

        /// Returns the next token, or null once the input is exhausted.
        pub fn next(self: *Self) !?CsvToken {
            var next_status: ?Status = self.status;

            // Cannot use anonymous enum literals for Status
            // https://github.com/ziglang/zig/issues/4255
            while (next_status) |status| {
                next_status = switch (status) {
                    .initial => if (try self.reader.read()) Status.row_start else Status.eof,
                    .row_start => if (!try self.reader.ensureData()) Status.eof else Status.field,
                    .field => blk: {
                        if (!try self.reader.ensureData()) {
                            break :blk .row_end;
                        }

                        return try self.parseField();
                    },
                    .quoted_field_end => blk: {
                        // Consume the closing quote.
                        const quote = try self.reader.char();
                        assert(quote == self.config.quote);

                        if (!try self.reader.ensureData()) {
                            break :blk Status.row_end;
                        }

                        const c = (try self.reader.peek());

                        if (c) |value| {
                            if (value == self.config.col_sep) {
                                const col_sep = (try self.reader.char()).?;
                                assert(col_sep == self.config.col_sep);

                                break :blk Status.field;
                            }

                            if (value == self.config.row_sep) {
                                break :blk Status.row_end;
                            }

                            // A quote here means the field did not fit into the
                            // buffer, so it could not be analyzed as a doubled
                            // quote.
                            if (value == self.config.quote) {
                                return CsvError.ShortBuffer;
                            }
                        } else {
                            break :blk Status.eof;
                        }

                        return CsvError.NoSeparatorAfterField;
                    },
                    .row_end => {
                        if (!try self.reader.ensureData()) {
                            self.status = Status.eof;
                            return CsvToken{ .row_end = {} };
                        }

                        const rowSep = try self.reader.char();
                        assert(rowSep == self.config.row_sep);

                        self.status = Status.row_start;

                        return CsvToken{ .row_end = {} };
                    },
                    .eof => {
                        return null;
                    },
                };

                // Make the transition and ensure next_status is set here.
                self.status = next_status.?;
            }

            unreachable;
        }

        /// Parses one field (quoted or unquoted) starting at the current
        /// buffer position. Sets `status` for the follow-up state.
        fn parseField(self: *Self) !CsvToken {
            const first = (try self.reader.peek()).?;

            // Was hardcoded '"' previously; honor a custom `config.quote`.
            if (first != self.config.quote) {
                var field = try self.reader.until(&self.terminal_chars);
                if (field == null) {
                    // Force a read - maybe the separator was not read yet.
                    const hasData = try self.reader.read();
                    if (!hasData) {
                        return CsvError.ShortBuffer;
                    }

                    field = try self.reader.until(&self.terminal_chars);
                    if (field == null) {
                        return CsvError.ShortBuffer;
                    }
                }

                const terminator = (try self.reader.peek()).?;

                if (terminator == self.config.col_sep) {
                    _ = try self.reader.char();
                    return CsvToken{ .field = field.? };
                }

                if (terminator == self.config.row_sep) {
                    self.status = .row_end;
                    return CsvToken{ .field = field.? };
                }

                if (terminator == self.config.quote) {
                    return CsvError.MisplacedQuote;
                }

                return CsvError.ShortBuffer;
            } else {
                // Consume the opening quote.
                _ = try self.reader.char();
                var quoted_field = try self.reader.untilClosingQuote(self.config.quote);
                if (quoted_field == null) {
                    // Force a read - maybe the closing quote was not read yet.
                    const hasData = try self.reader.read();
                    if (!hasData) {
                        return CsvError.ShortBuffer;
                    }

                    // This read will fill the buffer.
                    quoted_field = try self.reader.untilClosingQuote(self.config.quote);
                    if (quoted_field == null) {
                        return CsvError.ShortBuffer;
                    }
                }

                self.status = .quoted_field_end;

                const field = quoted_field.?;
                if (!field.contains_quotes) {
                    return CsvToken{ .field = field.value };
                } else {
                    // Collapse doubled quotes in place by shifting bytes left.
                    const value = field.value;
                    var diff: u64 = 0;
                    var idx: usize = 0;
                    while (idx < value.len) : (idx += 1) {
                        const c = value[idx];
                        value[idx - diff] = c;

                        if (c == self.config.quote) {
                            diff += 1;
                            idx += 1;
                        }
                    }

                    return CsvToken{ .field = value[0 .. value.len - diff] };
                }
            }
        }
    };
}