├── .gitignore
├── .travis.yml
├── CHANGELOG.md
├── LICENSE
├── README.md
├── composer.json
├── demo.sh
├── doc
└── screenshots
│ └── awk-csv-parser.png
├── resources
└── iso_3166-1.csv
├── src
├── awk-csv-parser.sh
├── csv-parser.awk
└── inc
│ └── functions.sh
└── tests
├── all-tests.sh
└── resources
├── invalid-expected.txt
├── invalid.csv
├── ok-expected.txt
└── ok.csv
/.gitignore:
--------------------------------------------------------------------------------
1 | /.settings
2 | /.buildpath
3 | /.project
4 | /build/
5 | *~
6 | /vendor/
7 | composer.phar
8 | /cache.properties
9 | /.twgit*
10 | /.idea
11 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | # See: http://about.travis-ci.org/docs/user/build-configuration/
2 |
3 | before_install:
4 | - sudo apt-get install -qq gawk
5 |
6 | script: tests/all-tests.sh
7 |
8 | notifications:
9 | on_success: always
10 | on_failure: always
11 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | Change log
2 | ==========
3 |
4 | ## Version 1.0.2 (2017-11-14)
5 |
6 | FIX:
7 |
8 | - [#3](https://github.com/geoffroy-aubry/awk-csv-parser/pull/3): Fix error when path to script contains spaces. Thanks to [agraboso](https://github.com/agraboso)!
9 |
10 | ## Version 1.0.1 (2017-01-29)
11 |
12 | Doc:
13 |
14 | - [#1](https://github.com/geoffroy-aubry/awk-csv-parser/pull/1): Added instructions to install tool on OS X. Thanks to [emaV](https://github.com/emaV)!
15 |
16 | ## Version 1.0.0 (2013-10-22)
17 |
18 | First release on Github.
19 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 |
9 | This version of the GNU Lesser General Public License incorporates
10 | the terms and conditions of version 3 of the GNU General Public
11 | License, supplemented by the additional permissions listed below.
12 |
13 | 0. Additional Definitions.
14 |
15 | As used herein, "this License" refers to version 3 of the GNU Lesser
16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
17 | General Public License.
18 |
19 | "The Library" refers to a covered work governed by this License,
20 | other than an Application or a Combined Work as defined below.
21 |
22 | An "Application" is any work that makes use of an interface provided
23 | by the Library, but which is not otherwise based on the Library.
24 | Defining a subclass of a class defined by the Library is deemed a mode
25 | of using an interface provided by the Library.
26 |
27 | A "Combined Work" is a work produced by combining or linking an
28 | Application with the Library. The particular version of the Library
29 | with which the Combined Work was made is also called the "Linked
30 | Version".
31 |
32 | The "Minimal Corresponding Source" for a Combined Work means the
33 | Corresponding Source for the Combined Work, excluding any source code
34 | for portions of the Combined Work that, considered in isolation, are
35 | based on the Application, and not on the Linked Version.
36 |
37 | The "Corresponding Application Code" for a Combined Work means the
38 | object code and/or source code for the Application, including any data
39 | and utility programs needed for reproducing the Combined Work from the
40 | Application, but excluding the System Libraries of the Combined Work.
41 |
42 | 1. Exception to Section 3 of the GNU GPL.
43 |
44 | You may convey a covered work under sections 3 and 4 of this License
45 | without being bound by section 3 of the GNU GPL.
46 |
47 | 2. Conveying Modified Versions.
48 |
49 | If you modify a copy of the Library, and, in your modifications, a
50 | facility refers to a function or data to be supplied by an Application
51 | that uses the facility (other than as an argument passed when the
52 | facility is invoked), then you may convey a copy of the modified
53 | version:
54 |
55 | a) under this License, provided that you make a good faith effort to
56 | ensure that, in the event an Application does not supply the
57 | function or data, the facility still operates, and performs
58 | whatever part of its purpose remains meaningful, or
59 |
60 | b) under the GNU GPL, with none of the additional permissions of
61 | this License applicable to that copy.
62 |
63 | 3. Object Code Incorporating Material from Library Header Files.
64 |
65 | The object code form of an Application may incorporate material from
66 | a header file that is part of the Library. You may convey such object
67 | code under terms of your choice, provided that, if the incorporated
68 | material is not limited to numerical parameters, data structure
69 | layouts and accessors, or small macros, inline functions and templates
70 | (ten or fewer lines in length), you do both of the following:
71 |
72 | a) Give prominent notice with each copy of the object code that the
73 | Library is used in it and that the Library and its use are
74 | covered by this License.
75 |
76 | b) Accompany the object code with a copy of the GNU GPL and this license
77 | document.
78 |
79 | 4. Combined Works.
80 |
81 | You may convey a Combined Work under terms of your choice that,
82 | taken together, effectively do not restrict modification of the
83 | portions of the Library contained in the Combined Work and reverse
84 | engineering for debugging such modifications, if you also do each of
85 | the following:
86 |
87 | a) Give prominent notice with each copy of the Combined Work that
88 | the Library is used in it and that the Library and its use are
89 | covered by this License.
90 |
91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license
92 | document.
93 |
94 | c) For a Combined Work that displays copyright notices during
95 | execution, include the copyright notice for the Library among
96 | these notices, as well as a reference directing the user to the
97 | copies of the GNU GPL and this license document.
98 |
99 | d) Do one of the following:
100 |
101 | 0) Convey the Minimal Corresponding Source under the terms of this
102 | License, and the Corresponding Application Code in a form
103 | suitable for, and under terms that permit, the user to
104 | recombine or relink the Application with a modified version of
105 | the Linked Version to produce a modified Combined Work, in the
106 | manner specified by section 6 of the GNU GPL for conveying
107 | Corresponding Source.
108 |
109 | 1) Use a suitable shared library mechanism for linking with the
110 | Library. A suitable mechanism is one that (a) uses at run time
111 | a copy of the Library already present on the user's computer
112 | system, and (b) will operate properly with a modified version
113 | of the Library that is interface-compatible with the Linked
114 | Version.
115 |
116 | e) Provide Installation Information, but only if you would otherwise
117 | be required to provide such information under section 6 of the
118 | GNU GPL, and only to the extent that such information is
119 | necessary to install and execute a modified version of the
120 | Combined Work produced by recombining or relinking the
121 | Application with a modified version of the Linked Version. (If
122 | you use option 4d0, the Installation Information must accompany
123 | the Minimal Corresponding Source and Corresponding Application
124 | Code. If you use option 4d1, you must provide the Installation
125 | Information in the manner specified by section 6 of the GNU GPL
126 | for conveying Corresponding Source.)
127 |
128 | 5. Combined Libraries.
129 |
130 | You may place library facilities that are a work based on the
131 | Library side by side in a single library together with other library
132 | facilities that are not Applications and are not covered by this
133 | License, and convey such a combined library under terms of your
134 | choice, if you do both of the following:
135 |
136 | a) Accompany the combined library with a copy of the same work based
137 | on the Library, uncombined with any other library facilities,
138 | conveyed under the terms of this License.
139 |
140 | b) Give prominent notice with the combined library that part of it
141 | is a work based on the Library, and explaining where to find the
142 | accompanying uncombined form of the same work.
143 |
144 | 6. Revised Versions of the GNU Lesser General Public License.
145 |
146 | The Free Software Foundation may publish revised and/or new versions
147 | of the GNU Lesser General Public License from time to time. Such new
148 | versions will be similar in spirit to the present version, but may
149 | differ in detail to address new problems or concerns.
150 |
151 | Each version is given a distinguishing version number. If the
152 | Library as you received it specifies that a certain numbered version
153 | of the GNU Lesser General Public License "or any later version"
154 | applies to it, you have the option of following the terms and
155 | conditions either of that published version or of any later version
156 | published by the Free Software Foundation. If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 |
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Awk CSV parser
2 |
3 | [Latest stable version on Packagist](https://packagist.org/packages/geoffroy-aubry/awk-csv-parser)
4 | [Build status on Travis CI](http://travis-ci.org/geoffroy-aubry/awk-csv-parser)
5 |
6 | AWK and Bash code to easily parse CSV files, with possibly embedded commas and quotes.
7 |
8 | ## Table of Contents
9 |
10 | * [Features](#features)
11 | * [Known limitations](#known-limitations)
12 | * [Links](#links)
13 | * [Requirements](#requirements)
14 | * [Usage](#usage)
15 | * [Examples](#examples)
16 | * [Installation](#installation)
17 | * [Copyrights & licensing](#copyrights--licensing)
18 | * [Change log](#change-log)
19 | * [Continuous integration](#continuous-integration)
20 | * [Git branching model](#git-branching-model)
21 |
22 | ## Features
23 |
24 | * Parse CSV files with only Bash and Awk.
25 | * Allows CSV data to be processed with standard UNIX shell commands.
26 | * Properly handle CSV data that contain field separators (commas by default)
27 | and field enclosures (double quotes by default) inside enclosed data fields.
28 | * Process CSVs from stdin pipe as well as from multiple command line file arguments.
29 | * Handle any character both for field separator and field enclosure.
30 | * Can rewrite CSV records with a multi-character output field separator, CSV enclosure characters removed
31 | and escaped enclosures unescaped.
32 | * Lines do not need to contain the same number of fields throughout the file.
33 |
34 | ### Known limitations
35 |
36 | * Does not **yet** handle embedded newlines inside data fields.
37 |
38 | ### Links
39 |
40 | * [Wikipedia: Comma-separated values](http://en.wikipedia.org/wiki/Comma-separated_values)
41 | * [RFC 4180: Common Format and MIME Type for Comma-Separated Values (CSV) Files](http://tools.ietf.org/html/rfc4180)
42 |
43 | Other Awk implementations:
44 |
45 | * [dbro/csvquote](https://github.com/dbro/csvquote)
46 | * [AWK CSV Parser](http://lorance.freeshell.org/csv/)
47 |
48 | ## Requirements
49 |
50 | - Bash v4 _(2009)_ and above
51 | - GNU [Awk](http://www.gnu.org/software/gawk/) 3.1+
52 |
53 | Tested on Debian/Ubuntu Linux.
54 |
55 | ## Usage
56 |
57 | Displayed by:
58 |
59 | ```bash
60 | $ awk-csv-parser.sh --help
61 | ```
62 |
63 | ![Output of awk-csv-parser.sh --help](doc/screenshots/awk-csv-parser.png)
64 |
65 | ##### Text version
66 |
67 | Description
68 | AWK and Bash code to easily parse CSV files, with possibly embedded commas and quotes.
69 |
70 | Usage
71 | awk-csv-parser.sh [OPTION]… [<CSV-file>]…
72 |
73 | Options
74 | -e <char>, --enclosure=<char>
75 | Set the CSV field enclosure. One character only, '"' (double quote) by default.
76 |
77 | -o <string>, --output-separator=<string>
78 | Set the output field separator. Multiple characters allowed, '|' (pipe) by default.
79 |
80 | -s <char>, --separator=<char>
81 | Set the CSV field separator. One character only, ',' (comma) by default.
82 |
83 | -h, --help
84 | Display this help.
85 |
86 | <CSV-file>
87 | CSV file to parse.
88 |
89 | Discussion
90 | – The last record in the file may or may not have an ending line break.
91 | – Each line may not contain the same number of fields throughout the file.
92 | – The last field in the record must not be followed by a field separator.
93 | – Fields containing field enclosures or field separators must be enclosed in field
94 | enclosure.
95 | – A field enclosure appearing inside a field must be escaped by preceding it with
96 | another field enclosure. Example: "aaa","b""bb","ccc"
97 |
98 | Examples
99 | Parse a CSV and display records without field enclosure, fields pipe-separated:
100 | awk-csv-parser.sh --output-separator='|' resources/iso_3166-1.csv
101 |
102 | Remove CSV's header before parsing:
103 | tail -n+2 resources/iso_3166-1.csv | awk-csv-parser.sh
104 |
105 | Keep only first column of multiple files:
106 | awk-csv-parser.sh a.csv b.csv c.csv | cut -d'|' -f1
107 |
108 | Keep only first column, using multiple UTF-8 characters output separator:
109 | awk-csv-parser.sh -o '⇒⇒' resources/iso_3166-1.csv | awk -F '⇒⇒' '{print $1}'
110 |
111 | You can directly call the Awk script:
112 | awk -f csv-parser.awk -v separator=',' -v enclosure='"' --source '{
113 | csv_parse_record($0, separator, enclosure, csv)
114 | print csv[2] " ⇒ " csv[0]
115 | }' resources/iso_3166-1.csv
116 |
117 | ## Examples
118 |
119 | Excerpt from `resources/iso_3166-1.csv` ([full version](resources/iso_3166-1.csv)):
120 |
121 | ```csv
122 | Country or Area Name,ISO ALPHA-2 Code,ISO ALPHA-3 Code,ISO Numeric Code
123 | Brazil,BR,BRA,076
124 | British Virgin Islands,VG,VGB,092
125 | British Indian Ocean Territory,IO,IOT,086
126 | Brunei Darussalam,BN,BRN,096
127 | Burkina Faso,BF,BFA,854
128 | "Hong Kong, Special Administrative Region of China",HK,HKG,344
129 | "Macao, Special Administrative Region of China",MO,MAC,446
130 | Christmas Island,CX,CXR,162
131 | Cocos (Keeling) Islands,CC,CCK,166
132 | ```
133 |
134 | ##### 1. Parse a CSV and display records without field enclosure, output fields pipe-separated
135 |
136 | ```bash
137 | $ awk-csv-parser.sh --output-separator='|' resources/iso_3166-1.csv | head -n10
138 | # or:
139 | $ cat resources/iso_3166-1.csv | awk-csv-parser.sh --output-separator='|' | head -n10
140 | ```
141 |
142 | Result:
143 |
144 | ```csv
145 | Country or Area Name|ISO ALPHA-2 Code|ISO ALPHA-3 Code|ISO Numeric Code|
146 | Brazil|BR|BRA|076|
147 | British Virgin Islands|VG|VGB|092|
148 | British Indian Ocean Territory|IO|IOT|086|
149 | Brunei Darussalam|BN|BRN|096|
150 | Burkina Faso|BF|BFA|854|
151 | Hong Kong, Special Administrative Region of China|HK|HKG|344|
152 | Macao, Special Administrative Region of China|MO|MAC|446|
153 | Christmas Island|CX|CXR|162|
154 | Cocos (Keeling) Islands|CC|CCK|166|
155 | ```
156 |
157 | ##### 2. Remove CSV header, keep only first column and grep fields containing separator
158 |
159 | ```bash
160 | $ tail -n+2 resources/iso_3166-1.csv | awk-csv-parser.sh | cut -d'|' -f1 | grep ,
161 | ```
162 |
163 | Result:
164 |
165 | ```
166 | Hong Kong, Special Administrative Region of China
167 | Macao, Special Administrative Region of China
168 | Congo, Democratic Republic of the
169 | Iran, Islamic Republic of
170 | Korea, Democratic People's Republic of
171 | Korea, Republic of
172 | Micronesia, Federated States of
173 | Taiwan, Republic of China
174 | Tanzania, United Republic of
175 | ```
176 |
177 | ##### 3. You can directly call the Awk script
178 |
179 | ```bash
180 | $ awk -f csv-parser.awk -v separator=',' -v enclosure='"' --source '{
181 | csv_parse_record($0, separator, enclosure, csv)
182 | print csv[2] " ⇒ " csv[0]
183 | }' resources/iso_3166-1.csv | head -n10
184 | ```
185 |
186 | Result:
187 |
188 | ```
189 | ISO ALPHA-3 Code ⇒ Country or Area Name
190 | BRA ⇒ Brazil
191 | VGB ⇒ British Virgin Islands
192 | IOT ⇒ British Indian Ocean Territory
193 | BRN ⇒ Brunei Darussalam
194 | BFA ⇒ Burkina Faso
195 | HKG ⇒ Hong Kong, Special Administrative Region of China
196 | MAC ⇒ Macao, Special Administrative Region of China
197 | CXR ⇒ Christmas Island
198 | CCK ⇒ Cocos (Keeling) Islands
199 | ```
200 |
201 | ##### 4. Technical example
202 |
203 | Content of `tests/resources/ok.csv`:
204 |
205 | ```csv
206 | ,,
207 | a, b,c , d ,e e
208 | "","a","a,",",a",",,"
209 | "a""b","""","c"""""
210 | ```
211 |
212 | Test:
213 |
214 | ```bash
215 | $ awk-csv-parser.sh tests/resources/ok.csv
216 | ```
217 |
218 | Result:
219 |
220 | ```
221 | || |
222 | a| b|c | d |e e|
223 | |a|a,|,a|,,|
224 | a"b|"|c""|
225 | ```
226 |
227 | ##### 5. Errors
228 |
229 | Content of `tests/resources/invalid.csv`:
230 |
231 | ```csv
232 | "
233 | "a,
234 | a"
235 | "a"b
236 | ```
237 |
238 | Test:
239 |
240 | ```bash
241 | $ awk-csv-parser.sh tests/resources/invalid.csv
242 | ```
243 |
244 | Result:
245 |
246 | ```
247 | [CSV ERROR: 3] Missing closing quote after '' in following record: '"'
248 | [CSV ERROR: 3] Missing closing quote after 'a,' in following record: '"a,'
249 | [CSV ERROR: 1] Missing opening quote before 'a' in following record: 'a"'
250 | [CSV ERROR: 2] Missing separator after 'a' in following record: '"a"b'
251 | ```
252 |
253 | ## Installation
254 |
255 | ### Debian/Ubuntu
256 |
257 | 1. Move to the directory where you wish to store the source.
258 |
259 | 2. Clone the repository:
260 |
261 | ```bash
262 | $ git clone https://github.com/geoffroy-aubry/awk-csv-parser.git
263 | ```
264 |
265 | 3. You should be on `stable` branch. If not, switch your clone to that branch:
266 |
267 | ```bash
268 | $ cd awk-csv-parser && git checkout stable
269 | ```
270 |
271 | 4. You can create a symlink to `awk-csv-parser.sh`:
272 |
273 | ```bash
274 | $ sudo ln -s /path/to/src/awk-csv-parser.sh /usr/local/bin/awk-csv-parser
275 | ```
276 |
277 | 5. It's ready for use:
278 |
279 | ```bash
280 | $ awk-csv-parser
281 | ```
282 |
283 | ### OS X
284 |
285 | Because the OS X versions of `readlink` and `sed` are BSD-based and differ slightly from their GNU counterparts, you need to install the GNU utilities:
286 |
287 | ```bash
288 | $ brew install coreutils gnu-sed [--with-default-names]
289 | ```
290 |
291 | With the `--with-default-names` option, the GNU utilities replace those of OS X.
292 | Otherwise the GNU utilities are prefixed with a `g`, and you have to edit the scripts `src/awk-csv-parser.sh` and `tests/all-tests.sh`
293 | to replace `readlink` and `sed` with `greadlink` and `gsed` respectively.
294 |
295 | Then follow Debian/Ubuntu installation process.
296 |
297 | ## Copyrights & licensing
298 |
299 | Licensed under the GNU Lesser General Public License v3 (LGPL version 3).
300 | See [LICENSE](LICENSE) file for details.
301 |
302 | ## Change log
303 |
304 | See [CHANGELOG](CHANGELOG.md) file for details.
305 |
306 | ## Continuous integration
307 |
308 | [Build status on Travis CI](http://travis-ci.org/geoffroy-aubry/awk-csv-parser)
309 |
310 | Launch unit tests:
311 |
312 | ```bash
313 | $ tests/all-tests.sh
314 | ```
315 |
316 | ## Git branching model
317 |
318 | The git branching model used for development is the one described and assisted
319 | by `twgit` tool: [https://github.com/Twenga/twgit](https://github.com/Twenga/twgit).
320 |
--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "geoffroy-aubry/awk-csv-parser",
3 | "description": "AWK and Bash code to easily parse CSV files, with possibly embedded commas and quotes.",
4 | "type": "library",
5 | "authors": [{
6 | "name": "Geoffroy Aubry",
7 | "email": "geoffroy.aubry@free.fr"
8 | }],
9 | "keywords": ["csv parser", "awk", "bash"],
10 | "license": ["LGPL-3.0+"],
11 | "bin": ["src/awk-csv-parser.sh", "src/csv-parser.awk"]
12 | }
13 |
--------------------------------------------------------------------------------
/demo.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

# Demo: strip the header line of the bundled ISO 3166-1 sample, parse it with
# awk-csv-parser.sh, keep only the first output column and highlight the
# entries that contain the CSV separator (a comma).

# Directory of this script. It is quoted wherever it is used so that a
# checkout path containing spaces is not word-split.
ROOT_DIR=$(dirname "$0")

tail -n+2 "$ROOT_DIR/resources/iso_3166-1.csv" \
  | "$ROOT_DIR/src/awk-csv-parser.sh" \
  | cut -d'|' -f1 | grep --color=always ,
8 |
--------------------------------------------------------------------------------
/doc/screenshots/awk-csv-parser.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoffroy-aubry/awk-csv-parser/8e4dfa16db90583d07a37ac33ef79ab8b2c2824d/doc/screenshots/awk-csv-parser.png
--------------------------------------------------------------------------------
/resources/iso_3166-1.csv:
--------------------------------------------------------------------------------
1 | Country or Area Name,ISO ALPHA-2 Code,ISO ALPHA-3 Code,ISO Numeric Code
2 | Brazil,BR,BRA,076
3 | British Virgin Islands,VG,VGB,092
4 | British Indian Ocean Territory,IO,IOT,086
5 | Brunei Darussalam,BN,BRN,096
6 | Burkina Faso,BF,BFA,854
7 | "Hong Kong, Special Administrative Region of China",HK,HKG,344
8 | "Macao, Special Administrative Region of China",MO,MAC,446
9 | Christmas Island,CX,CXR,162
10 | Cocos (Keeling) Islands,CC,CCK,166
11 | Comoros,KM,COM,174
12 | Congo (Brazzaville),CG,COG,178
13 | "Congo, Democratic Republic of the",CD,COD,180
14 | Costa Rica,CR,CRI,188
15 | Côte d'Ivoire,CI,CIV,384
16 | Ethiopia,ET,ETH,231
17 | Falkland Islands (Malvinas),FK,FLK,238
18 | Finland,FI,FIN,246
19 | French Southern Territories,TF,ATF,260
20 | Gabon,GA,GAB,266
21 | Heard Island and Mcdonald Islands,HM,HMD,334
22 | Holy See (Vatican City State),VA,VAT,336
23 | "Iran, Islamic Republic of",IR,IRN,364
24 | Kiribati,KI,KIR,296
25 | "Korea, Democratic People's Republic of",KP,PRK,408
26 | "Korea, Republic of",KR,KOR,410
27 | Mayotte,YT,MYT,175
28 | Mexico,MX,MEX,484
29 | "Micronesia, Federated States of",FM,FSM,583
30 | Norfolk Island,NF,NFK,574
31 | Northern Mariana Islands,MP,MNP,580
32 | Norway,NO,NOR,578
33 | Saint-Barthélemy,BL,BLM,652
34 | Saint-Martin (French part),MF,MAF,663
35 | Switzerland,CH,CHE,756
36 | "Taiwan, Republic of China",TW,TWN,158
37 | "Tanzania, United Republic of",TZ,TZA,834
38 | Thailand,TH,THA,764
39 |
--------------------------------------------------------------------------------
/src/awk-csv-parser.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | ##
4 | # Copyright © 2013 Geoffroy Aubry
5 | #
6 | # This file is part of awk-csv-parser.
7 | #
8 | # awk-csv-parser is free software: you can redistribute it and/or modify
9 | # it under the terms of the GNU Lesser General Public License as published by
10 | # the Free Software Foundation, either version 3 of the License, or
11 | # (at your option) any later version.
12 | #
13 | # awk-csv-parser is distributed in the hope that it will be useful,
14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 | # GNU Lesser General Public License for more details.
17 | #
18 | # You should have received a copy of the GNU Lesser General Public License
19 | # along with awk-csv-parser. If not, see <http://www.gnu.org/licenses/>
20 | #
21 |
22 |
23 |
# Treat unset variables and parameters other than the special parameters ‘@’ or ‘*’ as an error
# when performing parameter expansion. An error message will be written to the standard error,
# and a non-interactive shell will exit.
set -o nounset

# The return value of a pipeline is the value of the last (rightmost) command to exit with a non-zero status,
# or zero if all commands in the pipeline exit successfully:
set -o pipefail

# Globals:
# ROOT_DIR is resolved through readlink so the script still finds its files when run via a symlink.
ROOT_DIR=$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")/.." && pwd)
ENCLOSURE='"'
SEPARATOR=','
OUTPUT_SEPARATOR='|'
IN='-'

# Includes:
. "$ROOT_DIR/src/inc/functions.sh"

# Main:
getOpts "$@"

# $IN is deliberately left unquoted: getOpts stores the CSV file names as one
# space-separated string (or '-' for stdin), so word splitting is what turns it
# back into separate arguments for cat.
#
# The -v values, on the other hand, must be quoted: unquoted, a separator made of
# (or containing) whitespace or glob characters would be split or expanded by the
# shell before awk ever sees it.
cat $IN | awk \
    -f "$ROOT_DIR/src/csv-parser.awk" \
    -v separator="$SEPARATOR" \
    -v enclosure="$ENCLOSURE" \
    -v output_separator="$OUTPUT_SEPARATOR" \
    --source '{csv_parse_and_display($0, separator, enclosure, output_separator)}'
51 |
--------------------------------------------------------------------------------
/src/csv-parser.awk:
--------------------------------------------------------------------------------
1 | ##
2 | # Copyright © 2013 Geoffroy Aubry
3 | #
4 | # This file is part of awk-csv-parser.
5 | #
6 | # awk-csv-parser is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU Lesser General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # awk-csv-parser is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU Lesser General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU Lesser General Public License
17 | # along with awk-csv-parser. If not, see <http://www.gnu.org/licenses/>
18 | #
19 |
20 |
21 |
##
# Extract the next field of the specified CSV record.
#
# A field starting with the enclosure character is parsed as a quoted field: it may
# contain separators, and an enclosure character inside it must be doubled.
#
# @param string record     CSV record to parse.
# @param int    pos        1-based position at which the search starts.
# @param char   separator  Field separator.
# @param char   quote      Field enclosure.
# @param array  csv        Array of found fields in which to store the next field (passed by reference).
# @param int    num_fields Number of fields already found; the new field is stored at this index.
# @return int Position at which the next field starts (just past the consumed separator,
#             or length(record) + 1 when the end of the record was reached),
#             or negative error code (error message in global variable csv_error):
#             -1 missing opening quote, -2 missing separator, -3 missing closing quote.
#
# The names listed after the extra spaces in the signature are local variables, not
# arguments: declaring them there keeps them out of the global namespace.
#
function csv_parse_field (record, pos, separator, quote, csv, num_fields,    quoted, prev_char_is_quote, field, c, record_length) {
    # The record is never modified, so its length is computed once.
    record_length = length(record)

    if (substr(record, pos, 1) == quote) {
        quoted = 1
        pos++
    } else {
        quoted = 0
    }
    prev_char_is_quote = 0
    field = ""

    while (pos <= record_length) {
        c = substr(record, pos, 1)
        if (c == separator && (! quoted || prev_char_is_quote)) {
            # Separator outside of a quoted field, or right after its closing quote: end of field.
            csv[num_fields] = field
            return ++pos
        } else if (c == quote) {
            if (! quoted) {
                csv_error = "Missing opening quote before '" field "' in following record: '" record "'"
                return -1
            } else if (prev_char_is_quote) {
                # Doubled enclosure: one literal enclosure character.
                prev_char_is_quote = 0
                field = field quote
            } else if (pos == record_length) {
                # Enclosure on the very last character: it closes the field.
                quoted = 0
            } else {
                # Either a closing quote or the first half of a doubled one; the next character decides.
                prev_char_is_quote = 1
            }
        } else if (prev_char_is_quote) {
            # A closing quote must be followed by a separator or the end of the record.
            csv_error = "Missing separator after '" field "' in following record: '" record "'"
            return -2
        } else {
            field = field c
        }
        pos++
    }

    if (quoted) {
        csv_error = "Missing closing quote after '" field "' in following record: '" record "'"
        return -3
    }

    csv[num_fields] = field
    return pos
}
80 |
##
# Parse a CSV record.
#
# Fields are extracted one at a time with csv_parse_field() and stored in csv[0], csv[1], …
# On a parse error the error message is printed (colorized) on standard output.
#
# @param string record    CSV record to parse.
# @param char   separator Field separator.
# @param char   quote     Field enclosure.
# @param array  csv       Empty array in which to store all fields (passed by reference).
#                         The array is not cleared by this function.
# @return int Number of fields parsed in CSV record (0 for an empty record),
#             or negative error code (error message in global variable csv_error).
#
# The names listed after the extra spaces in the signature are local variables, not
# arguments: declaring them there prevents this function from overwriting any
# same-named variables of the calling program.
#
function csv_parse_record (record, separator, quote, csv,    pos, num_fields, record_length) {
    record_length = length(record)
    if (record_length == 0) {
        # Explicit 0 instead of a bare return, in line with the documented int return value.
        return 0
    }

    pos = 1
    num_fields = 0
    while (pos <= record_length) {
        pos = csv_parse_field(record, pos, separator, quote, csv, num_fields)
        if (pos < 0) {
            print "\033[0;31m[CSV ERROR: " (-pos) "] \033[1;31m" csv_error "\033[0m"
            return pos
        }
        num_fields++
    }

    # A record ending with a separator has one more, empty, field after it.
    if (substr(record, record_length, 1) == separator) {
        csv[num_fields++] = ""
    }

    return num_fields
}
113 |
114 | ##
115 | # Parse CSV record, then display it without quote and replacing specified separator by output_fs.
116 | #
117 | # @param string record CSV record to parse.
118 | # @param char separator Field separator.
119 | # @param char quote Field enclosure.
120 | # @param string output_fs Output field separator.
121 | # @return int Return 0 if no error, else return positive error code.
122 | #
123 | function csv_parse_and_display (record, separator, quote, output_fs) {
124 | num_fields=csv_parse_record($0, separator, quote, csv)
125 | if (num_fields >= 0) {
126 | line=""
127 | for (i=0; i
5 | #
6 | # This file is part of awk-csv-parser.
7 | #
8 | # awk-csv-parser is free software: you can redistribute it and/or modify
9 | # it under the terms of the GNU Lesser General Public License as published by
10 | # the Free Software Foundation, either version 3 of the License, or
11 | # (at your option) any later version.
12 | #
13 | # awk-csv-parser is distributed in the hope that it will be useful,
14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 | # GNU Lesser General Public License for more details.
17 | #
18 | # You should have received a copy of the GNU Lesser General Public License
19 | # along with awk-csv-parser. If not, see <http://www.gnu.org/licenses/>
20 | #
21 |
22 |
23 |
##
# Parse command options.
#
# Globals written: ENCLOSURE, OUTPUT_SEPARATOR, SEPARATOR (option values) and
# IN (CSV files to parse, space-separated; left untouched when no file is given).
# Because IN is a space-separated string, file names containing whitespace
# are not supported.
#
# -h, --help and any unrecognized option display the help and exit with status 0.
# A short option given as last argument without its value is reported on
# standard error and the script exits with status 1.
#
# @param string… $@ Command line arguments.
#
function getOpts () {
    local arg
    local long_option=''

    for arg in "$@"; do
        # Converting short option into long option: the previous argument was a short
        # option expecting a value, so this argument is that value.
        if [[ -n "$long_option" ]]; then
            arg="$long_option=$arg"
            long_option=''
        fi

        case "$arg" in
            # Short options:
            -e) long_option="--enclosure" ;;
            -o) long_option="--output-separator" ;;
            -s) long_option="--separator" ;;
            -[^-]*) displayHelp; exit 0 ;;

            # Long options:
            --enclosure=*) ENCLOSURE=${arg#*=} ;;
            --output-separator=*) OUTPUT_SEPARATOR=${arg#*=} ;;
            --separator=*) SEPARATOR=${arg#*=} ;;
            --*) displayHelp; exit 0 ;;

            # CSVs to parse:
            *)
                if [[ $IN == '-' ]]; then
                    IN="$arg"
                else
                    IN="$IN $arg"
                fi
                ;;
        esac
    done

    # A pending short option here means its value was never supplied.
    if [[ -n "$long_option" ]]; then
        echo "Missing value for option '$long_option'." >&2
        exit 1
    fi
}
56 |
##
# Help.
#
# Print the colorized usage message (description, usage, options, discussion and
# examples) on standard output. The command name shown is the base name of $0.
#
function displayHelp () {
    local normal='\033[0;37m'
    local title='\033[1;37m'
    local tab='\033[0;30m┆\033[0m '$normal
    local opt='\033[1;33m'
    local param='\033[1;36m'
    local cmd='\033[0;36m'

    # Computed once, with "$0" quoted so that a script path containing spaces
    # is not word-split.
    local script_name
    script_name=$(basename -- "$0")

    # echo -e turns the \033 sequences above into terminal color codes.
    echo -e "
${title}Description
${tab}AWK and Bash code to easily parse CSV files, with possibly embedded commas and quotes.

${title}Usage
$tab${cmd}${script_name} $normal[${opt}OPTION$normal]… $normal[$param<CSV-file>$normal]…

${title}Options
$tab$opt-e $param<char>$normal, $opt--enclosure$normal=$param<char>
$tab${tab}Set the CSV field enclosure. One character only, '\"' (double quote) by default.
$tab
$tab$opt-o $param<string>$normal, $opt--output-separator$normal=$param<string>
$tab${tab}Set the output field separator. Multiple characters allowed, '|' (pipe) by default.
$tab
$tab$opt-s $param<char>$normal, $opt--separator$normal=$param<char>
$tab${tab}Set the CSV field separator. One character only, ',' (comma) by default.
$tab
$tab$opt-h$normal, $opt--help
$tab${tab}Display this help.
$tab
$tab$param<CSV-file>
$tab${tab}CSV file to parse.

${title}Discussion
${tab}– The last record in the file may or may not have an ending line break.
${tab}– Each line may not contain the same number of fields throughout the file.
${tab}– The last field in the record must not be followed by a field separator.
${tab}– Fields containing field enclosures or field separators must be enclosed in field
${tab} enclosure.
${tab}– A field enclosure appearing inside a field must be escaped by preceding it with
${tab} another field enclosure. Example: \"aaa\",\"b\"\"bb\",\"ccc\"

${title}Examples
${tab}Parse a CSV and display records without field enclosure, fields pipe-separated:
$tab$tab${cmd}${script_name} $opt--output-separator$normal=$param'|' resources/iso_3166-1.csv
$tab
${tab}Remove CSV's header before parsing:
$tab$tab${cmd}tail -n+2 resources/iso_3166-1.csv | ${script_name}
$tab
${tab}Keep only first column of multiple files:
$tab$tab${cmd}${script_name} ${param}a.csv b.csv c.csv$cmd | cut -d'|' -f1
$tab
${tab}Keep only first column, using multiple UTF-8 characters output separator:
$tab$tab${cmd}${script_name} $opt-o $param'⇒⇒' resources/iso_3166-1.csv$cmd | awk -F '⇒⇒' '{print \$1}'
$tab
${tab}You can directly call the Awk script:
$tab$tab${cmd}awk -f ${param}csv-parser.awk$cmd -v separator=',' -v enclosure='\"' --source '{
$tab$tab${cmd} csv_parse_record(\$0, separator, enclosure, csv)
$tab$tab${cmd} print csv[2] \" ⇒ \" csv[0]
$tab$tab${cmd}}' resources/iso_3166-1.csv
"
}
120 |
--------------------------------------------------------------------------------
/tests/all-tests.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | ##
4 | # Copyright © 2013 Geoffroy Aubry
5 | #
6 | # This file is part of awk-csv-parser.
7 | #
8 | # awk-csv-parser is free software: you can redistribute it and/or modify
9 | # it under the terms of the GNU Lesser General Public License as published by
10 | # the Free Software Foundation, either version 3 of the License, or
11 | # (at your option) any later version.
12 | #
13 | # awk-csv-parser is distributed in the hope that it will be useful,
14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 | # GNU Lesser General Public License for more details.
17 | #
18 | # You should have received a copy of the GNU Lesser General Public License
19 | # along with awk-csv-parser. If not, see <http://www.gnu.org/licenses/>.
20 | #
21 |
22 |
23 |
# Treat unset variables and parameters other than the special parameters ‘@’ or ‘*’ as an error
# when performing parameter expansion. An error message will be written to the standard error,
# and a non-interactive shell will exit.
set -o nounset

# The return value of a pipeline is the value of the last (rightmost) command to exit with a non-zero status,
# or zero if all commands in the pipeline exit successfully:
set -o pipefail

# Repository root, resolved from this script's real location (symlinks followed).
ROOT_DIR=$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")/.." && pwd) || exit 1
SRC_DIR=$ROOT_DIR/src
TESTS_DIR=$ROOT_DIR/tests
RESOURCES_DIR=$TESTS_DIR/resources

# Single scratch file holding the parser output of the current test set.
# It is overwritten on each iteration and removed when the script exits.
tmp_path=$(mktemp /tmp/awk-csv-parser-XXXXXXXXXX) || exit 1
trap 'rm -f -- "$tmp_path"' EXIT

# For each test set: parse <set>.csv, strip ANSI colour sequences from the
# output, and compare it with <set>-expected.txt.
status=0
for tests_set in ok invalid; do
    echo -e "\033[1;37mTesting $RESOURCES_DIR/$tests_set.csv:\033[0m"
    "$SRC_DIR/awk-csv-parser.sh" -s ',' -e '"' -o '|' "$RESOURCES_DIR/$tests_set.csv" \
        | sed -r 's:(\033|\x1B)\[[0-9;]*[mK]::ig' \
        > "$tmp_path"
    diff --report-identical-files "$tmp_path" "$RESOURCES_DIR/${tests_set}-expected.txt"
    # Accumulate diff's exit status: any non-zero comparison marks the run as failed.
    status=$(( $? | status ))
done

echo
if [ "$status" -ne 0 ]; then
    echo -e "\033[1;37;41m\033[0K Tests FAILED!\033[0m\n"
else
    echo -e "\033[1;37;42m\033[0K Tests OK.\033[0m\n"
fi
exit "$status"
56 |
--------------------------------------------------------------------------------
/tests/resources/invalid-expected.txt:
--------------------------------------------------------------------------------
1 | [CSV ERROR: 3] Missing closing quote after '' in following record: '"'
2 | [CSV ERROR: 3] Missing closing quote after 'a,' in following record: '"a,'
3 | [CSV ERROR: 1] Missing opening quote before 'a' in following record: 'a"'
4 | [CSV ERROR: 2] Missing separator after 'a' in following record: '"a"b'
5 |
--------------------------------------------------------------------------------
/tests/resources/invalid.csv:
--------------------------------------------------------------------------------
1 | "
2 | "a,
3 | a"
4 | "a"b
--------------------------------------------------------------------------------
/tests/resources/ok-expected.txt:
--------------------------------------------------------------------------------
1 | || |
2 | a| b|c | d |e e|
3 | |a|a,|,a|,,|
4 | a"b|"|c""|
5 |
--------------------------------------------------------------------------------
/tests/resources/ok.csv:
--------------------------------------------------------------------------------
1 | ,,
2 | a, b,c , d ,e e
3 | "","a","a,",",a",",,"
4 | "a""b","""","c"""""
--------------------------------------------------------------------------------