├── .prev-version ├── tests ├── .gitignore ├── test.sh ├── no-perl ├── datamash-show-env.sh ├── datamash-io-errors-cheap.sh ├── CuSkip.pm ├── datamash-rand.pl ├── decorate-valgrind.sh ├── CuTmpdir.pm ├── datamash-rand.sh ├── datamash-strbin.sh ├── datamash-md5.pl ├── datamash-sort-header-deprecated.pl ├── datamash-sort-errors.sh ├── datamash-sha.pl ├── datamash-io-errors.sh ├── datamash-i18n-de.pl ├── datamash-sort-header.pl ├── datamash-check-tabular.pl ├── datamash-output-format.pl ├── datamash-tests-deprecated.pl ├── datamash-tests-2-deprecated.pl ├── datamash-crosstab.pl ├── datamash-check.pl ├── decorate-sort-tests.pl ├── datamash-transpose.pl └── datamash-pair-tests.pl ├── .gitmodules ├── po ├── quot.sed ├── boldquot.sed ├── POTFILES.in ├── remove-potcdate.sin ├── insert-header.sin ├── ChangeLog ├── en@quot.header └── en@boldquot.header ├── hooks ├── pre-commit.sh ├── setup-hooks.sh └── README.md ├── AUTHORS ├── lib └── local.mk ├── THANKS ├── src ├── double-format.h ├── decorate-functions.h ├── randutils.h ├── die.h ├── randutils.c ├── crosstab.h ├── op-scanner.h ├── column-headers.h ├── text-lines.h ├── double-format.c ├── text-options.h ├── column-headers.c ├── op-parser.h ├── text-options.c ├── op-defs.h ├── key-compare.h ├── field-ops.h ├── decorate-functions.c └── crosstab.c ├── m4 ├── ax_c_long_long.m4 └── .gitignore ├── doc ├── local.mk └── datamash-texinfo.css ├── examples ├── make_genes_example.sh ├── make_score_example.r ├── scores.txt └── scores_h.txt ├── man └── decorate.x ├── init.cfg ├── .github ├── ISSUE_TEMPLATE.txt └── PULL_REQUEST_TEMPLATE.txt ├── .gitignore ├── contrib └── bash-completion │ └── datamash ├── bootstrap.conf └── README /.prev-version: -------------------------------------------------------------------------------- 1 | 1.9 2 | -------------------------------------------------------------------------------- /tests/.gitignore: -------------------------------------------------------------------------------- 1 | /*.trs 2 | /*.log 
3 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "gnulib"] 2 | path = gnulib 3 | url = git://git.sv.gnu.org/gnulib.git 4 | -------------------------------------------------------------------------------- /po/quot.sed: -------------------------------------------------------------------------------- 1 | s/"\([^"]*\)"/“\1”/g 2 | s/`\([^`']*\)'/‘\1’/g 3 | s/ '\([^`']*\)' / ‘\1’ /g 4 | s/ '\([^`']*\)'$/ ‘\1’/g 5 | s/^'\([^`']*\)' /‘\1’ /g 6 | s/“”/""/g 7 | -------------------------------------------------------------------------------- /po/boldquot.sed: -------------------------------------------------------------------------------- 1 | s/"\([^"]*\)"/“\1”/g 2 | s/`\([^`']*\)'/‘\1’/g 3 | s/ '\([^`']*\)' / ‘\1’ /g 4 | s/ '\([^`']*\)'$/ ‘\1’/g 5 | s/^'\([^`']*\)' /‘\1’ /g 6 | s/“”/""/g 7 | s/“/“/g 8 | s/”/”/g 9 | s/‘/‘/g 10 | s/’/’/g 11 | -------------------------------------------------------------------------------- /hooks/pre-commit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Prefer gmake over make to get GNU make on non-GNU userland systems if present 4 | if command -v gmake 2>/dev/null; then 5 | make_cmd=gmake 6 | else 7 | make_cmd=make 8 | fi 9 | 10 | $make_cmd syntax-check 11 | -------------------------------------------------------------------------------- /hooks/setup-hooks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Find the path to datamash/.git 4 | gitdir=$(git rev-parse --absolute-git-dir) 5 | [ $? 
-ne 0 ] && exit 1 6 | 7 | # Find the path to datamash/hooks 8 | hookdir="$(dirname "$gitdir")/hooks" 9 | 10 | # Install hooks 11 | ln -fs "$hookdir/pre-commit.sh" "$gitdir/hooks/pre-commit" || \ 12 | { echo "Unable to install pre-commit hook" >&2; exit 1; } 13 | -------------------------------------------------------------------------------- /po/POTFILES.in: -------------------------------------------------------------------------------- 1 | # List of source files which contain translatable strings. 2 | lib/closeout.c 3 | lib/error.c 4 | lib/getopt.c 5 | lib/quotearg.c 6 | lib/version-etc.c 7 | lib/xalloc-die.c 8 | lib/xstrtol-error.c 9 | src/datamash.c 10 | src/decorate-functions.c 11 | src/decorate.c 12 | src/double-format.c 13 | src/field-ops.c 14 | src/key-compare.c 15 | src/op-parser.c 16 | src/op-scanner.c 17 | src/system.h 18 | src/text-lines.c 19 | src/text-options.c 20 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | GNU Datamash was originally authored by Assaf Gordon . 2 | It is currently maintained by Assaf Gordon and Tim Rice 3 | , with assistance from Shawn Wagner and Erik Auerswald. 4 | 5 | In addition, the following have provided patches and/or Git commits to Datamash: 6 | 7 | Barry Nisly 8 | Benno Schulenberg 9 | Dima Kogan 10 | Georg Sauthoff 11 | Jeroen Roovers 12 | Yu Fu 13 | 14 | See also the THANKS file. 15 | -------------------------------------------------------------------------------- /po/remove-potcdate.sin: -------------------------------------------------------------------------------- 1 | # Sed script that remove the POT-Creation-Date line in the header entry 2 | # from a POT file. 3 | # 4 | # The distinction between the first and the following occurrences of the 5 | # pattern is achieved by looking at the hold space. 6 | /^"POT-Creation-Date: .*"$/{ 7 | x 8 | # Test if the hold space is empty. 
9 | s/P/P/ 10 | ta 11 | # Yes it was empty. First occurrence. Remove the line. 12 | g 13 | d 14 | bb 15 | :a 16 | # The hold space was nonempty. Following occurrences. Do nothing. 17 | x 18 | :b 19 | } 20 | -------------------------------------------------------------------------------- /hooks/README.md: -------------------------------------------------------------------------------- 1 | Git Hooks 2 | ========= 3 | 4 | git client-side hooks are not considered part of the repository and 5 | aren't included in a `git clone`. Running `hooks/setup-hooks.sh` will 6 | install some useful ones for you. 7 | 8 | This is only needed if you're working on the datamash source; if 9 | you're just compiling it, there's no reason to do this. 10 | 11 | Installed Hooks 12 | =============== 13 | 14 | pre-commit 15 | ---------- 16 | 17 | Makes sure the code tree passes `make syntax-check` before allowing a 18 | commit. 19 | -------------------------------------------------------------------------------- /lib/local.mk: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2014-2021 Assaf Gordon 2 | # 3 | # This file is free software; as a special exception the author gives 4 | # unlimited permission to copy and/or distribute it, with or without 5 | # modifications, as long as this notice is preserved. 6 | # 7 | # This program is distributed in the hope that it will be useful, but 8 | # WITHOUT ANY WARRANTY, to the extent permitted by law; without even the 9 | # implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 10 | 11 | include lib/gnulib.mk 12 | 13 | # Allow "make distdir" to succeed before "make all" has run. 
14 | dist-hook: $(noinst_LIBRARIES) 15 | .PHONY: dist-hook 16 | -------------------------------------------------------------------------------- /po/insert-header.sin: -------------------------------------------------------------------------------- 1 | # Sed script that inserts the file called HEADER before the header entry. 2 | # 3 | # At each occurrence of a line starting with "msgid ", we execute the following 4 | # commands. At the first occurrence, insert the file. At the following 5 | # occurrences, do nothing. The distinction between the first and the following 6 | # occurrences is achieved by looking at the hold space. 7 | /^msgid /{ 8 | x 9 | # Test if the hold space is empty. 10 | s/m/m/ 11 | ta 12 | # Yes it was empty. First occurrence. Read the file. 13 | r HEADER 14 | # Output the file's contents by reading the next line. But don't lose the 15 | # current line while doing this. 16 | g 17 | N 18 | bb 19 | :a 20 | # The hold space was nonempty. Following occurrences. Do nothing. 21 | x 22 | :b 23 | } 24 | -------------------------------------------------------------------------------- /THANKS: -------------------------------------------------------------------------------- 1 | Thanks to: 2 | 3 | Aaron Quinlan ( http://quinlanlab.org/ ) whose 'groupBy' program 4 | (https://github.com/arq5x/bedtools) was the inspiration for GNU Datamash. 5 | 6 | The following people provided bug reports, feature requests and other 7 | suggestions which resulted in notable improvements to GNU Datamash: 8 | 9 | Alejandro Garrido Mota 10 | Benno Schulenberg 11 | Bruno Haible 12 | Dagobert Michelseni 13 | Dave Myron 14 | Dima Kogan 15 | Frank Busse 16 | Jérémie Roquet 17 | Jeroen Hoek 18 | Khavish Bhundoo 19 | Kingsley G. Morse Jr. 20 | Mark van Rossum 21 | Renan Valieris 22 | Renato Alves 23 | Sanjeev Kumar Sharma 24 | Steve Ward 25 | Torsten Seemann 26 | wheat MAX 27 | 28 | See also the AUTHORS and ChangeLog files. 
29 | -------------------------------------------------------------------------------- /po/ChangeLog: -------------------------------------------------------------------------------- 1 | 2015-05-28 gettextize 2 | 3 | * Makefile.in.in: Upgrade to gettext-0.19.4. 4 | 5 | 2015-05-28 gettextize 6 | 7 | * Makefile.in.in: Upgrade to gettext-0.19.4. 8 | * Rules-quot: Upgrade to gettext-0.19.4. 9 | 10 | 2013-04-11 gettextize 11 | 12 | * Makefile.in.in: New file, from gettext-0.18.1. 13 | * Rules-quot: New file, from gettext-0.18.1. 14 | * boldquot.sed: New file, from gettext-0.18.1. 15 | * en@boldquot.header: New file, from gettext-0.18.1. 16 | * en@quot.header: New file, from gettext-0.18.1. 17 | * insert-header.sin: New file, from gettext-0.18.1. 18 | * quot.sed: New file, from gettext-0.18.1. 19 | * remove-potcdate.sin: New file, from gettext-0.18.1. 20 | * POTFILES.in: New file. 21 | 22 | -------------------------------------------------------------------------------- /src/double-format.h: -------------------------------------------------------------------------------- 1 | /* GNU Datamash - perform simple calculation on input data 2 | 3 | Copyright (C) 2018-2021 Assaf Gordon 4 | 5 | This file is part of GNU Datamash. 6 | 7 | GNU Datamash is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | GNU Datamash is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with GNU Datamash. If not, see . 
19 | */ 20 | #ifndef __DOUBLE_FORMAT_H__ 21 | #define __DOUBLE_FORMAT_H__ 22 | 23 | char* 24 | validate_double_format (char const *fmt); 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /tests/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Unit Tests for GNU Datamash - perform simple calculation on input data 3 | 4 | # Copyright (C) 2014-2021 Assaf Gordon 5 | # 6 | # This file is part of GNU Datamash. 7 | # 8 | # GNU Datamash is free software: you can redistribute it and/or modify 9 | # it under the terms of the GNU General Public License as published by 10 | # the Free Software Foundation, either version 3 of the License, or 11 | # (at your option) any later version. 12 | # 13 | # GNU Datamash is distributed in the hope that it will be useful, 14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | # GNU General Public License for more details. 17 | # 18 | # You should have received a copy of the GNU General Public License 19 | # along with GNU Datamash. If not, see . 20 | # 21 | # Written by Assaf Gordon 22 | 23 | echo "Hello (Shell Unit-Testing) World" 24 | 25 | ## test passed: 26 | exit 0 27 | -------------------------------------------------------------------------------- /tests/no-perl: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Unit Tests for GNU Datamash - stub for systems without Perl 4 | 5 | # Copyright (C) 2014-2021 Assaf Gordon 6 | # 7 | # This file is part of GNU Datamash. 8 | # 9 | # GNU Datamash is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 
13 | # 14 | # GNU Datamash is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with GNU Datamash. If not, see . 21 | # 22 | # Written by Assaf Gordon. 23 | 24 | . "${test_dir=.}/init.sh"; path_prepend_ ./src 25 | skip_ "this test requires a working perl" 26 | -------------------------------------------------------------------------------- /src/decorate-functions.h: -------------------------------------------------------------------------------- 1 | /* Decorate functions 2 | 3 | Copyright (C) 2020-2021 Assaf Gordon 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see . 
*/ 17 | 18 | #ifndef DECORATE_FUNCTIONS_H 19 | #define DECORATE_FUNCTIONS_H 20 | 21 | struct conversions_t 22 | { 23 | const char* name; 24 | const char* description; 25 | bool (*decorate_fn)(const char* in); 26 | }; 27 | 28 | 29 | extern struct conversions_t builtin_conversions[]; 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /src/randutils.h: -------------------------------------------------------------------------------- 1 | /* GNU Datamash - perform simple calculation on input data 2 | 3 | Copyright (C) 2022-2025 Timothy Rice 4 | 5 | This file is part of GNU Datamash. 6 | 7 | GNU Datamash is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | GNU Datamash is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with GNU Datamash. If not, see . 19 | */ 20 | 21 | /* Written by Tim Rice */ 22 | #ifndef __RANDUTILS_H__ 23 | #define __RANDUTILS_H__ 24 | 25 | # include 26 | 27 | /* Initialize random number source */ 28 | void 29 | init_random (bool force_seed, unsigned long seed); 30 | 31 | #endif // __RANDUTILS_H__ 32 | -------------------------------------------------------------------------------- /tests/datamash-show-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Unit Tests for GNU Datamash - perform simple calculation on input data 4 | 5 | # Copyright (C) 2014-2021 Assaf Gordon 6 | # 7 | # This file is part of GNU Datamash. 
8 | # 9 | # GNU Datamash is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # GNU Datamash is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with GNU Datamash. If not, see . 21 | # 22 | # Written by Assaf Gordon 23 | 24 | ### DEBUG Helper to show the ENV 25 | echo "Debug Helper" 26 | echo "-----ENV------" 27 | env 28 | echo 29 | echo 30 | echo "-----PWD------" 31 | pwd 32 | echo 33 | echo 34 | exit 0 35 | -------------------------------------------------------------------------------- /src/die.h: -------------------------------------------------------------------------------- 1 | /* Report an error and exit. 2 | Copyright 2016-2018 Free Software Foundation, Inc. 3 | 4 | This program is free software; you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation; either version 3, or (at your option) 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program; if not, see https://www.gnu.org/licenses/. 16 | */ 17 | 18 | #ifndef DIE_H 19 | # define DIE_H 20 | 21 | # include 22 | # include 23 | # include 24 | 25 | /* Like 'error (STATUS, ...)', except STATUS must be a nonzero constant. 
26 | This may pacify the compiler or help it generate better code. */ 27 | # define die(status, ...) \ 28 | verify_expr (status, (error (status, __VA_ARGS__), assume (false))) 29 | 30 | #endif /* DIE_H */ 31 | -------------------------------------------------------------------------------- /po/en@quot.header: -------------------------------------------------------------------------------- 1 | # All this catalog "translates" are quotation characters. 2 | # The msgids must be ASCII and therefore cannot contain real quotation 3 | # characters, only substitutes like grave accent (0x60), apostrophe (0x27) 4 | # and double quote (0x22). These substitutes look strange; see 5 | # http://www.cl.cam.ac.uk/~mgk25/ucs/quotes.html 6 | # 7 | # This catalog translates grave accent (0x60) and apostrophe (0x27) to 8 | # left single quotation mark (U+2018) and right single quotation mark (U+2019). 9 | # It also translates pairs of apostrophe (0x27) to 10 | # left single quotation mark (U+2018) and right single quotation mark (U+2019) 11 | # and pairs of quotation mark (0x22) to 12 | # left double quotation mark (U+201C) and right double quotation mark (U+201D). 13 | # 14 | # When output to an UTF-8 terminal, the quotation characters appear perfectly. 15 | # When output to an ISO-8859-1 terminal, the single quotation marks are 16 | # transliterated to apostrophes (by iconv in glibc 2.2 or newer) or to 17 | # grave/acute accent (by libiconv), and the double quotation marks are 18 | # transliterated to 0x22. 19 | # When output to an ASCII terminal, the single quotation marks are 20 | # transliterated to apostrophes, and the double quotation marks are 21 | # transliterated to 0x22. 
22 | # 23 | -------------------------------------------------------------------------------- /m4/ax_c_long_long.m4: -------------------------------------------------------------------------------- 1 | # =========================================================================== 2 | # https://www.gnu.org/software/autoconf-archive/ax_c_long_long.html 3 | # =========================================================================== 4 | # 5 | # SYNOPSIS 6 | # 7 | # AX_C_LONG_LONG 8 | # 9 | # DESCRIPTION 10 | # 11 | # Provides a test for the existence of the long long int type and defines 12 | # HAVE_LONG_LONG if it is found. 13 | # 14 | # LICENSE 15 | # 16 | # Copyright (c) 2008 Caolan McNamara 17 | # 18 | # Copying and distribution of this file, with or without modification, are 19 | # permitted in any medium without royalty provided the copyright notice 20 | # and this notice are preserved. This file is offered as-is, without any 21 | # warranty. 22 | 23 | #serial 7 24 | 25 | AU_ALIAS([AC_C_LONG_LONG], [AX_C_LONG_LONG]) 26 | AC_DEFUN([AX_C_LONG_LONG], 27 | [AC_CACHE_CHECK(for long long int, ac_cv_c_long_long, 28 | [if test "$GCC" = yes; then 29 | ac_cv_c_long_long=yes 30 | else 31 | AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [[long long int i;]])], 32 | [ac_cv_c_long_long=yes], 33 | [ac_cv_c_long_long=no]) 34 | fi]) 35 | if test $ac_cv_c_long_long = yes; then 36 | AC_DEFINE([HAVE_LONG_LONG], 1, [compiler understands long long]) 37 | fi 38 | ]) 39 | -------------------------------------------------------------------------------- /po/en@boldquot.header: -------------------------------------------------------------------------------- 1 | # All this catalog "translates" are quotation characters. 2 | # The msgids must be ASCII and therefore cannot contain real quotation 3 | # characters, only substitutes like grave accent (0x60), apostrophe (0x27) 4 | # and double quote (0x22). 
These substitutes look strange; see 5 | # http://www.cl.cam.ac.uk/~mgk25/ucs/quotes.html 6 | # 7 | # This catalog translates grave accent (0x60) and apostrophe (0x27) to 8 | # left single quotation mark (U+2018) and right single quotation mark (U+2019). 9 | # It also translates pairs of apostrophe (0x27) to 10 | # left single quotation mark (U+2018) and right single quotation mark (U+2019) 11 | # and pairs of quotation mark (0x22) to 12 | # left double quotation mark (U+201C) and right double quotation mark (U+201D). 13 | # 14 | # When output to an UTF-8 terminal, the quotation characters appear perfectly. 15 | # When output to an ISO-8859-1 terminal, the single quotation marks are 16 | # transliterated to apostrophes (by iconv in glibc 2.2 or newer) or to 17 | # grave/acute accent (by libiconv), and the double quotation marks are 18 | # transliterated to 0x22. 19 | # When output to an ASCII terminal, the single quotation marks are 20 | # transliterated to apostrophes, and the double quotation marks are 21 | # transliterated to 0x22. 22 | # 23 | # This catalog furthermore displays the text between the quotation marks in 24 | # bold face, assuming the VT100/XTerm escape sequences. 25 | # 26 | -------------------------------------------------------------------------------- /tests/datamash-io-errors-cheap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Unit Tests for GNU Datamash - simple I/O error simulation 4 | 5 | # Copyright (C) 2022 Erik Auerswald 6 | # 7 | # This file is part of GNU Datamash. 8 | # 9 | # GNU Datamash is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 
13 | # 14 | # GNU Datamash is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with GNU Datamash. If not, see . 21 | # 22 | # Written by Erik Auerswald, 23 | # based on datamash-io-errors.sh written by Assaf Gordon 24 | 25 | ## 26 | ## This script tests GNU Datamash's handling of basic I/O errors. 27 | ## 28 | 29 | . "${test_dir=.}/init.sh"; path_prepend_ ./src 30 | 31 | fail=0 32 | 33 | ## 34 | ## This test requires the special file /dev/full 35 | ## 36 | test -w /dev/full || skip_ 'requires writable /dev/full' 37 | 38 | ## Test 1: output error 39 | echo 0 | datamash -g 1 count 1 > /dev/full && 40 | { warn_ "datamash failed to detect no-space error" ; fail=1 ; } 41 | 42 | Exit $fail 43 | -------------------------------------------------------------------------------- /tests/CuSkip.pm: -------------------------------------------------------------------------------- 1 | package CuSkip; 2 | # This file (CuSkip.pm) was copied from GNU Coreutils. 3 | # It has the following copyright notice and license: 4 | 5 | # Skip a test: emit diag to log and to stderr, and exit 77 6 | 7 | # Copyright (C) 2011-2013 Free Software Foundation, Inc. 8 | 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 
18 | 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | 22 | use strict; 23 | use warnings; 24 | 25 | our $ME = $0 || ""; 26 | 27 | # Emit a diagnostic both to stderr and to $stderr_fileno_, then exit 77. 28 | # A failed dup or close of that descriptor only produces a warning. 29 | # FIXME: don't hard-code that value (9), since it's already defined in init.cfg. 30 | sub skip ($) 31 | { 32 | my ($msg) = @_; 33 | my $stderr_fileno_ = 9; 34 | warn $msg; 35 | # Dup the descriptor onto a lexical handle with three-argument open, and 36 | # only print to / close the handle when the dup succeeded. 37 | if (open my $fh, '>&', $stderr_fileno_) 38 | { 39 | print {$fh} $msg; 40 | close $fh 41 | or warn "$ME: failed to close FD $stderr_fileno_\n"; 42 | } 43 | else 44 | { 45 | warn "$ME: failed to dup stderr\n"; 46 | } 47 | exit 77; 48 | } 49 | 50 | 1; 51 | -------------------------------------------------------------------------------- /src/randutils.c: -------------------------------------------------------------------------------- 1 | /* GNU Datamash - perform simple calculation on input data 2 | 3 | Copyright (C) 2013-2021 Assaf Gordon 4 | Copyright (C) 2022-2025 Timothy Rice 5 | 6 | This file is part of GNU Datamash. 7 | 8 | GNU Datamash is free software: you can redistribute it and/or modify 9 | it under the terms of the GNU General Public License as published by 10 | the Free Software Foundation, either version 3 of the License, or 11 | (at your option) any later version. 12 | 13 | GNU Datamash is distributed in the hope that it will be useful, 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | GNU General Public License for more details. 17 | 18 | You should have received a copy of the GNU General Public License 19 | along with GNU Datamash. If not, see . 
20 | */ 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | #include "randutils.h" 32 | 33 | void 34 | init_random (bool force_seed, unsigned long seed) 35 | { 36 | if (!force_seed) 37 | { 38 | errno = 0; 39 | ssize_t nbytes = getrandom (&seed, sizeof (seed), 0); 40 | if (nbytes == -1 || errno != 0) 41 | { 42 | fprintf (stderr, "Error %d: %s\n", errno, strerror (errno)); 43 | } 44 | } 45 | srandom (seed); 46 | } 47 | -------------------------------------------------------------------------------- /doc/local.mk: -------------------------------------------------------------------------------- 1 | # Make GNU Datamash documentation. -*-Makefile-*- 2 | # This is included by the top-level Makefile.am. 3 | 4 | # Based on GNU Hello: 5 | # Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 6 | # 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014 Free 7 | # Software Foundation, Inc. 8 | 9 | # Modifications for GNU Datamash are 10 | # Copyright (C) 2014-2021 Assaf Gordon 11 | 12 | # This program is free software: you can redistribute it and/or modify 13 | # it under the terms of the GNU General Public License as published by 14 | # the Free Software Foundation, either version 3 of the License, or 15 | # (at your option) any later version. 16 | 17 | # This program is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | # GNU General Public License for more details. 21 | 22 | # You should have received a copy of the GNU General Public License 23 | # along with this program. If not, see . 24 | 25 | info_TEXINFOS = doc/datamash.texi 26 | EXTRA_DIST += doc/datamash-texinfo.css 27 | 28 | # For the 'make html' target - generate a single HTML file 29 | # and embed the CSS statements in it. 
30 | AM_MAKEINFOHTMLFLAGS = --no-split \ 31 | --css-include=$(top_srcdir)/doc/datamash-texinfo.css 32 | 33 | # Changes to the CSS should trigger a new HTML regeneration 34 | $(top_builddir)/doc/datamash.html: $(top_srcdir)/doc/datamash-texinfo.css 35 | 36 | doc_datamash_TEXINFOS = \ 37 | doc/fdl.texi 38 | -------------------------------------------------------------------------------- /src/crosstab.h: -------------------------------------------------------------------------------- 1 | /* GNU Datamash - perform simple calculation on input data 2 | 3 | Copyright (C) 2013-2021 Assaf Gordon 4 | 5 | This file is part of GNU Datamash. 6 | 7 | GNU Datamash is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | GNU Datamash is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with GNU Datamash. If not, see . 
19 | */ 20 | 21 | /* Written by Assaf Gordon */ 22 | #ifndef __CROSSTAB_H__ 23 | #define __CROSSTAB_H__ 24 | 25 | struct crosstab 26 | { 27 | Hash_table *rows; 28 | Hash_table *columns; 29 | Hash_table *data; 30 | }; 31 | 32 | struct crosstab_datacell 33 | { 34 | const char* row_name; 35 | const char* col_name; 36 | const char* data; 37 | }; 38 | /* NOTE(review): the tag below is crosstab_data_cell, but the struct defined above is crosstab_datacell -- confirm which spelling crosstab.c uses. */ 39 | struct crosstab_data_cell* 40 | crosstab_new_datacell (const char* row, const char* col, const char* data); 41 | 42 | struct crosstab* 43 | crosstab_init (); 44 | 45 | void 46 | crosstab_add_result (struct crosstab* ct, 47 | const char* row, const char* col, const char* data); 48 | 49 | void 50 | crosstab_print (const struct crosstab* ct); 51 | 52 | void 53 | crosstab_free (struct crosstab* ct); 54 | 55 | #endif /* __CROSSTAB_H__ */ 56 | -------------------------------------------------------------------------------- /examples/make_genes_example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | ## Copyright (C) 2014-2021 Assaf Gordon 4 | ## 5 | ## This file is part of GNU Datamash. 6 | ## 7 | ## GNU Datamash is free software: you can redistribute it and/or modify 8 | ## it under the terms of the GNU General Public License as published by 9 | ## the Free Software Foundation, either version 3 of the License, or 10 | ## (at your option) any later version. 11 | ## 12 | ## GNU Datamash is distributed in the hope that it will be useful, 13 | ## but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | ## GNU General Public License for more details. 16 | ## 17 | ## You should have received a copy of the GNU General Public License 18 | ## along with GNU Datamash. If not, see . 19 | 20 | 21 | ## 22 | ## A short script to generate a sample of genes based on the HG19/RefSeq file. 23 | ## 24 | 25 | if [ ! 
-e "refGene.txt" ] ; then 26 | wget http://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/refGene.txt.gz || exit 1 27 | gunzip refGene.txt.gz || exit 1 28 | fi 29 | 30 | (cat refGene.txt | 31 | sort -k13,13 | 32 | ../datamash -g 13 countunique 3 countunique 4 | 33 | awk '$2>1 || $3>1' | sort -R | head -n 100 | cut -f1 -d " " ; 34 | cut -f13 refGene.txt | sort -R -u | head -n 1000 ) | 35 | sort -u > genelist.txt 36 | 37 | grep -F -f genelist.txt refGene.txt | grep -E -v "chrUn|hap" > genes.txt 38 | 39 | ( echo "bin 40 | name 41 | chrom 42 | strand 43 | txStart 44 | txEnd 45 | cdsStart 46 | cdsEnd 47 | exonCount 48 | exonStarts 49 | exonEnds 50 | score 51 | name2 52 | cdsStartStat 53 | cdsEndStat 54 | exonFrames" | paste -s -d ' ' ; cat genes.txt ) > genes_h.txt 55 | 56 | rm -f genelist.txt 57 | -------------------------------------------------------------------------------- /src/op-scanner.h: -------------------------------------------------------------------------------- 1 | /* GNU Datamash - perform simple calculation on input data 2 | 3 | Copyright (C) 2013-2021 Assaf Gordon 4 | 5 | This file is part of GNU Datamash. 6 | 7 | GNU Datamash is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | GNU Datamash is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with GNU Datamash. If not, see . 
19 | */ 20 | 21 | /* Written by Assaf Gordon */ 22 | #ifndef __OP_SCANNER_H__ 23 | #define __OP_SCANNER_H__ 24 | 25 | #define MAX_IDENTIFIER_LENGTH 512 26 | 27 | enum TOKEN 28 | { 29 | TOK_END=0, 30 | TOK_IDENTIFIER, 31 | TOK_INTEGER, 32 | TOK_FLOAT, 33 | TOK_COMMA, 34 | TOK_DASH, 35 | TOK_COLONS, 36 | TOK_WHITESPACE 37 | }; 38 | 39 | extern uintmax_t scan_val_int; 40 | extern long double scan_val_float; 41 | extern char* scanner_identifier; 42 | extern bool scanner_keep_whitespace; 43 | 44 | /* Initialize the scanner from argc/argv pair. 45 | note: argv should contain only the actual input: remove 46 | any other program parameters (including progname/argv[0]) */ 47 | void 48 | scanner_set_input_from_argv (int argc, const char* argv[]); 49 | 50 | /* Free any data/memory associated with the scanner */ 51 | void 52 | scanner_free (); 53 | 54 | enum TOKEN 55 | scanner_get_token (); 56 | 57 | enum TOKEN 58 | scanner_peek_token (); 59 | 60 | #endif 61 | -------------------------------------------------------------------------------- /man/decorate.x: -------------------------------------------------------------------------------- 1 | ." GNU decorate - manual page 2 | ." Copyright (C) 2014-2021 Assaf Gordon 3 | [NAME] 4 | decorate - convert fields of various formats 5 | 6 | 7 | [>DESCRIPTION] 8 | The \fBdecorate\fR program allows sorting input according to various 9 | ordering, e.g. IP addresses, roman numerals, etc. 10 | It works in tandem with sort(1) to perform the actual sorting. 11 | 12 | The idea was suggested by 13 | .UR https://lists.gnu.org/r/bug-coreutils/2015-06/msg00076.html 14 | Pádraig Brady in https://lists.gnu.org/r/bug-coreutils/2015-06/msg00076.html: 15 | 16 | 1. Decorate: convert the input to a sortable-format as additional fields 17 | .br 18 | 2. Sort according to the inserted fields 19 | .br 20 | 3. 
Undecorate: remove the inserted fields 21 | 22 | [=EXAMPLES] 23 | Example of preparing to sort by roman numerals: 24 | .PP 25 | .nf 26 | .RS 27 | $ printf "%s\\n" C V III IX XI | \fBdecorate\fR \-k1,1:roman \-\-decorate 28 | 0000100 C 29 | 0000005 V 30 | 0000003 III 31 | 0000009 IX 32 | 0000011 XI 33 | .RE 34 | .fi 35 | .PP 36 | 37 | The output can now be sent to sort(1), followed by removing (=undecorate) 38 | the first field. 39 | 40 | .PP 41 | .nf 42 | .RS 43 | $ printf "%s\\n" C V III IX XI \\ 44 | | \fBdecorate\fR \-k1,1:roman \-\-decorate \\ 45 | | sort \-k1,1 \\ 46 | | \fBdecorate\fR \-\-undecorate 1 47 | III 48 | V 49 | IX 50 | XI 51 | C 52 | .RE 53 | .fi 54 | .PP 55 | 56 | \fBdecorate(1)\fR can automatically combine the decorate-sort-undecorate steps 57 | (when run without \-\-decorate or \-\-undecorate): 58 | 59 | .PP 60 | .nf 61 | .RS 62 | $ printf "%s\\n" C V III IX XI | \fBdecorate\fR \-k1,1:roman 63 | III 64 | V 65 | IX 66 | XI 67 | C 68 | .RE 69 | .fi 70 | .PP 71 | 72 | 73 | 74 | 75 | 76 | [ADDITIONAL INFORMATION] 77 | See 78 | .UR https://www.gnu.org/software/datamash 79 | GNU Datamash Website (https://www.gnu.org/software/datamash) 80 | -------------------------------------------------------------------------------- /examples/make_score_example.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ## Copyright (C) 2014-2021 Assaf Gordon 4 | ## 5 | ## This file is part of Compute. 6 | ## 7 | ## Compute is free software: you can redistribute it and/or modify 8 | ## it under the terms of the GNU General Public License as published by 9 | ## the Free Software Foundation, either version 3 of the License, or 10 | ## (at your option) any later version. 11 | ## 12 | ## Compute is distributed in the hope that it will be useful, 13 | ## but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the 15 | ## GNU General Public License for more details. 16 | ## 17 | ## You should have received a copy of the GNU General Public License 18 | ## along with Compute. If not, see . 19 | 20 | ## 21 | ## A short R script to generate random data for the 'scores' example. 22 | ## 23 | library(randomNames) 24 | 25 | gen_data = function(count,group_name,mean,sd) 26 | { 27 | return( 28 | data.frame(Name=gsub(" ","-",randomNames(count,gender="M",which.names="first")), 29 | Major=rep(group_name,count), 30 | Score=pmin(round(rnorm(count, mean=mean,sd=sd)),100))) 31 | } 32 | 33 | exp = rbind( 34 | gen_data( runif(1,min=10,max=20), "Arts", runif(1,min=50,max=90), runif(1,min=5,max=20) ), 35 | gen_data( runif(1,min=10,max=20), "Business", runif(1,min=50,max=90), runif(1,min=5,max=20) ), 36 | gen_data( runif(1,min=10,max=20), "Health-Medicine", runif(1,min=50,max=90), runif(1,min=5,max=20) ), 37 | gen_data( runif(1,min=10,max=20), "Social-Sciences", runif(1,min=50,max=90), runif(1,min=5,max=20) ), 38 | gen_data( runif(1,min=10,max=20), "Life-Sciences", runif(1,min=50,max=90), runif(1,min=5,max=20) ), 39 | gen_data( runif(1,min=10,max=20), "Engineering", runif(1,min=50,max=90), runif(1,min=5,max=20) ) 40 | ) 41 | 42 | write.table(exp,file="scores.txt",sep="\t",row.names=FALSE,col.names=FALSE,quote=FALSE); 43 | write.table(exp,file="scores_h.txt",sep="\t",row.names=FALSE,col.names=T,quote=FALSE); 44 | -------------------------------------------------------------------------------- /tests/datamash-rand.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | =pod 3 | Unit Tests for GNU Datamash - tests rand operations 4 | 5 | Copyright (C) 2022-2025 Timothy Rice 6 | 7 | This file is part of GNU Datamash. 
8 | 9 | GNU Datamash is free software: you can redistribute it and/or modify 10 | it under the terms of the GNU General Public License as published by 11 | the Free Software Foundation, either version 3 of the License, or 12 | (at your option) any later version. 13 | 14 | GNU Datamash is distributed in the hope that it will be useful, 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | GNU General Public License for more details. 18 | 19 | You should have received a copy of the GNU General Public License 20 | along with GNU Datamash. If not, see . 21 | 22 | Written by Tim Rice. 23 | =cut 24 | use strict; 25 | use warnings; 26 | 27 | # Until a better way comes along to auto-use Coreutils Perl modules 28 | # as in the coreutils' autotools system. 29 | use Coreutils; 30 | use CuSkip; 31 | use CuTmpdir qw(datamash); 32 | 33 | (my $program_name = $0) =~ s|.*/||; 34 | my $prog_bin = 'datamash'; 35 | 36 | ## Cross-Compiling portability hack: 37 | ## under qemu/binfmt, argv[0] (which is used to report errors) will contain 38 | ## the full path of the binary, if the binary is on the $PATH. 39 | ## So we try to detect what is the actual returned value of the program 40 | ## in case of an error. 41 | my $prog = `$prog_bin ---print-progname`; 42 | $prog = $prog_bin unless $prog; 43 | 44 | # Turn off localization of executable's output. 
45 | @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3; 46 | 47 | my $in=<<'EOF'; 48 | A A 49 | A B 50 | B A 51 | B B 52 | EOF 53 | 54 | my $out=<<'EOF'; 55 | A B 56 | B A 57 | EOF 58 | 59 | my @Tests = 60 | ( 61 | ['r1', '-W -S0 groupby 1 rand 2', {IN_PIPE=>$in}, {OUT=>$out}], 62 | ); 63 | 64 | my $save_temps = $ENV{SAVE_TEMPS}; 65 | my $verbose = $ENV{VERBOSE}; 66 | 67 | my $fail = run_tests ($program_name, $prog, \@Tests, $save_temps, $verbose); 68 | exit $fail; 69 | -------------------------------------------------------------------------------- /examples/scores.txt: -------------------------------------------------------------------------------- 1 | Shawn Arts 65 2 | Marques Arts 58 3 | Fernando Arts 78 4 | Paul Arts 63 5 | Walter Arts 75 6 | Derek Arts 60 7 | Nathaniel Arts 88 8 | Tyreque Arts 74 9 | Trevon Arts 74 10 | Nathan Arts 71 11 | Zachary Arts 80 12 | Donovan Arts 75 13 | Levi Arts 76 14 | Sage Arts 55 15 | Roberto Arts 65 16 | William Arts 46 17 | Nico Arts 59 18 | Bryan Arts 68 19 | Isaiah Arts 80 20 | David Business 92 21 | Leonard Business 87 22 | Tysza Business 92 23 | Darren Business 94 24 | Christian Business 88 25 | Aaron Business 83 26 | Kerris Business 82 27 | Dakota Business 83 28 | Teriuse Business 94 29 | Caleb Business 87 30 | Juan Business 79 31 | Andre Health-Medicine 72 32 | Diego Health-Medicine 82 33 | Jonathan Health-Medicine 100 34 | Kevin Health-Medicine 100 35 | Patrick Health-Medicine 92 36 | D'Angelo Health-Medicine 90 37 | Daniel Health-Medicine 91 38 | Dilan Health-Medicine 84 39 | Angel Health-Medicine 100 40 | Peter Health-Medicine 86 41 | Dalton Health-Medicine 100 42 | Israel Health-Medicine 81 43 | Gabriel Health-Medicine 100 44 | Chase Social-Sciences 27 45 | Leroy Social-Sciences 74 46 | Jesse Social-Sciences 32 47 | Drake Social-Sciences 76 48 | Ja'Won Social-Sciences 37 49 | Joel Social-Sciences 72 50 | Darius Social-Sciences 51 51 | David Social-Sciences 69 52 | Williams Social-Sciences 62 53 | Manuel Social-Sciences 61 
54 | Lance Social-Sciences 65 55 | Drake Social-Sciences 59 56 | Joseph Social-Sciences 61 57 | Randy Social-Sciences 68 58 | Justin Social-Sciences 90 59 | Yeng Life-Sciences 39 60 | Allen Life-Sciences 50 61 | Brandon Life-Sciences 72 62 | Christian Life-Sciences 67 63 | Aaron Life-Sciences 58 64 | Gurnam Life-Sciences 66 65 | Anthony Life-Sciences 32 66 | Joshua Life-Sciences 14 67 | Nathen Life-Sciences 46 68 | Christopher Life-Sciences 59 69 | John Life-Sciences 70 70 | Austin Life-Sciences 91 71 | Antonio Engineering 88 72 | Faison Engineering 47 73 | Devin Engineering 92 74 | Ignatius Engineering 83 75 | Sonny Engineering 50 76 | Antonio Engineering 56 77 | Zackery Engineering 54 78 | Joe'Quann Engineering 75 79 | Thanh Engineering 53 80 | Michael Engineering 39 81 | Leonardo Engineering 78 82 | Omar Engineering 99 83 | Avery Engineering 51 84 | -------------------------------------------------------------------------------- /examples/scores_h.txt: -------------------------------------------------------------------------------- 1 | Name Major Score 2 | Shawn Arts 65 3 | Marques Arts 58 4 | Fernando Arts 78 5 | Paul Arts 63 6 | Walter Arts 75 7 | Derek Arts 60 8 | Nathaniel Arts 88 9 | Tyreque Arts 74 10 | Trevon Arts 74 11 | Nathan Arts 71 12 | Zachary Arts 80 13 | Donovan Arts 75 14 | Levi Arts 76 15 | Sage Arts 55 16 | Roberto Arts 65 17 | William Arts 46 18 | Nico Arts 59 19 | Bryan Arts 68 20 | Isaiah Arts 80 21 | David Business 92 22 | Leonard Business 87 23 | Tysza Business 92 24 | Darren Business 94 25 | Christian Business 88 26 | Aaron Business 83 27 | Kerris Business 82 28 | Dakota Business 83 29 | Teriuse Business 94 30 | Caleb Business 87 31 | Juan Business 79 32 | Andre Health-Medicine 72 33 | Diego Health-Medicine 82 34 | Jonathan Health-Medicine 100 35 | Kevin Health-Medicine 100 36 | Patrick Health-Medicine 92 37 | D'Angelo Health-Medicine 90 38 | Daniel Health-Medicine 91 39 | Dilan Health-Medicine 84 40 | Angel Health-Medicine 100 41 | Peter 
Health-Medicine 86 42 | Dalton Health-Medicine 100 43 | Israel Health-Medicine 81 44 | Gabriel Health-Medicine 100 45 | Chase Social-Sciences 27 46 | Leroy Social-Sciences 74 47 | Jesse Social-Sciences 32 48 | Drake Social-Sciences 76 49 | Ja'Won Social-Sciences 37 50 | Joel Social-Sciences 72 51 | Darius Social-Sciences 51 52 | David Social-Sciences 69 53 | Williams Social-Sciences 62 54 | Manuel Social-Sciences 61 55 | Lance Social-Sciences 65 56 | Drake Social-Sciences 59 57 | Joseph Social-Sciences 61 58 | Randy Social-Sciences 68 59 | Justin Social-Sciences 90 60 | Yeng Life-Sciences 39 61 | Allen Life-Sciences 50 62 | Brandon Life-Sciences 72 63 | Christian Life-Sciences 67 64 | Aaron Life-Sciences 58 65 | Gurnam Life-Sciences 66 66 | Anthony Life-Sciences 32 67 | Joshua Life-Sciences 14 68 | Nathen Life-Sciences 46 69 | Christopher Life-Sciences 59 70 | John Life-Sciences 70 71 | Austin Life-Sciences 91 72 | Antonio Engineering 88 73 | Faison Engineering 47 74 | Devin Engineering 92 75 | Ignatius Engineering 83 76 | Sonny Engineering 50 77 | Antonio Engineering 56 78 | Zackery Engineering 54 79 | Joe'Quann Engineering 75 80 | Thanh Engineering 53 81 | Michael Engineering 39 82 | Leonardo Engineering 78 83 | Omar Engineering 99 84 | Avery Engineering 51 85 | -------------------------------------------------------------------------------- /tests/decorate-valgrind.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Unit Tests for GNU Decorate - auxiliary program for sort preprocessing 4 | 5 | # Copyright (C) 2014-2021 Assaf Gordon 6 | # Copyright (C) 2025 Erik Auerswald 7 | # 8 | # This file is part of GNU Datamash. 9 | # 10 | # GNU Datamash is free software: you can redistribute it and/or modify 11 | # it under the terms of the GNU General Public License as published by 12 | # the Free Software Foundation, either version 3 of the License, or 13 | # (at your option) any later version. 
14 | # 15 | # GNU Datamash is distributed in the hope that it will be useful, 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | # GNU General Public License for more details. 19 | # 20 | # You should have received a copy of the GNU General Public License 21 | # along with GNU Datamash. If not, see . 22 | # 23 | # Written by Assaf Gordon. 24 | 25 | . "${test_dir=.}/init.sh"; path_prepend_ ./src 26 | 27 | require_valgrind_ 28 | 29 | ## Don't use valgrind on statically-compiled binary 30 | ## (it gives some false-positives and the test fails). 31 | if which ldd >/dev/null ; then 32 | ## Tricky implicit assumption: 33 | ## If the system has "ldd" - we can test if this is a static binary. 34 | ## If the system doesn't have "ldd", we can't test it, and we'll assume 35 | ## we can valgrind without false-positives. 36 | ## This is relevant for Mac OS X, where static binaries are discouraged and 37 | ## difficult to create 38 | ## (https://developer.apple.com/library/mac/qa/qa1118/_index.html) 39 | ldd $(which decorate) >/dev/null 2>/dev/null || 40 | skip_ "skipping valgrind test for a non-dynamic-binary decorate" 41 | fi 42 | 43 | 44 | fail=0 45 | 46 | # check fix for buffer under-read (CWE-127) reported by Frank Busse in 47 | # 48 | echo | valgrind --error-exitcode=1 decorate --undecorate 6 > /dev/null || 49 | { warn_ "--undecorate 6 buffer under-read - failed" ; fail=1 ; } 50 | 51 | Exit $fail 52 | -------------------------------------------------------------------------------- /src/column-headers.h: -------------------------------------------------------------------------------- 1 | /* GNU Datamash - perform simple calculation on input data 2 | 3 | Copyright (C) 2014-2021 Assaf Gordon 4 | 5 | This file is part of GNU Datamash. 
6 | 7 | GNU Datamash is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | GNU Datamash is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with GNU Datamash. If not, see . 19 | */ 20 | 21 | /* Written by Assaf Gordon */ 22 | #ifndef __COLUMN_HEADERS_H__ 23 | #define __COLUMN_HEADERS_H__ 24 | 25 | /* 26 | Column Headers Module 27 | */ 28 | 29 | /* 30 | Given a parsed line (representing the header line), 31 | sets the column names. 32 | 33 | if 'store_names' is true, 34 | stores the name of each field as the column header. 35 | if 'store_names' is false, 36 | simply counts the number of fields in the input line. 37 | */ 38 | void 39 | build_input_line_headers (const struct line_record_t *lr, bool store_names); 40 | 41 | /* 42 | returns the number of fields as extracted by 'build_input_line_headers ()' 43 | */ 44 | size_t 45 | get_num_column_headers (); 46 | 47 | /* 48 | returns the name of column 'field_num' (1 == first field). 49 | 50 | If 'store_names' (above) was true, returns the name of the column as 51 | appeared in the first input line. 52 | If 'store_names' (above) was false, returns 'field-X'. 53 | 54 | The returned string must not be modified (or free'd). 55 | */ 56 | const char* 57 | get_input_field_name (size_t field_num); 58 | 59 | 60 | /* returns field number (1== first field) 61 | which matches the given field name. 62 | 63 | returns ZERO if no such field found. 
*/ 64 | size_t 65 | get_input_field_number (const char* field_name); 66 | 67 | void 68 | free_column_headers (); 69 | 70 | #endif 71 | -------------------------------------------------------------------------------- /init.cfg: -------------------------------------------------------------------------------- 1 | # This file is sourced by init.sh, *before* its initialization. 2 | 3 | ## 4 | ## The file was copied from GNU coreutils, with the following license: 5 | ## 6 | 7 | # Copyright (C) 2010-2014 Free Software Foundation, Inc. 8 | 9 | # Modifications for GNU Datamash are 10 | # Copyright (C) 2014-2021 Assaf Gordon 11 | 12 | # This program is free software: you can redistribute it and/or modify 13 | # it under the terms of the GNU General Public License as published by 14 | # the Free Software Foundation, either version 3 of the License, or 15 | # (at your option) any later version. 16 | 17 | # This program is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | # GNU General Public License for more details. 21 | 22 | # You should have received a copy of the GNU General Public License 23 | # along with this program. If not, see . 24 | 25 | # This goes hand in hand with the "exec 9>&2;" in tests/Makefile.am's 26 | # TESTS_ENVIRONMENT definition. 27 | stderr_fileno_=9 28 | 29 | # Skip the current test if valgrind doesn't work, 30 | # which could happen if not installed, 31 | # or hasn't support for the built architecture, 32 | # or hasn't appropriate error suppressions installed etc. 33 | require_valgrind_() 34 | { 35 | valgrind --error-exitcode=1 true 2>/dev/null || 36 | skip_ "requires a working valgrind" 37 | } 38 | 39 | # Skip the current test if 'paste' doesn't work. 40 | # Alpine linux does not have 'paste' in the default minimal installation. 
41 | require_paste_() 42 | { 43 | paste - /dev/null || 44 | skip_ "requires a working paste(1)" 45 | } 46 | 47 | 48 | openbsd_seq_replacement_() 49 | { 50 | ## Wrap jot on OpenBSD since it doesn't have seq 51 | test "$(uname -s)" = OpenBSD || return 52 | 53 | seq() 54 | { 55 | if [ $# -eq 1 ]; then 56 | jot "$1" 57 | elif [ $# -eq 2 ]; then 58 | jot - "$1" "$2" 59 | fi 60 | } 61 | } 62 | 63 | 64 | expensive_() 65 | { 66 | if test "$RUN_EXPENSIVE_TESTS" != yes; then 67 | skip_ 'expensive: disabled by default 68 | This test is relatively expensive, so it is disabled by default. 69 | To run it anyway, rerun make check with the RUN_EXPENSIVE_TESTS 70 | environment variable set to yes. E.g., 71 | 72 | env RUN_EXPENSIVE_TESTS=yes make check 73 | 74 | or use the shortcut target of the toplevel Makefile, 75 | 76 | make check-expensive 77 | ' 78 | fi 79 | } 80 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.txt: -------------------------------------------------------------------------------- 1 | Please **do not** send pull-requests or open new issues on Github. 2 | 3 | Github is a downstream mirror and is not frequently monitored, 4 | all development is coordinated upstream on GNU resources. 5 | 6 | Send general questions, suggestions or bug reports to: 7 | bug-datamash@gnu.org 8 | 9 | Before reporting a new bug, please review previous discussions and bug reports 10 | on the Datamash Mailing list: 11 | https://lists.gnu.org/r/bug-datamash/ 12 | 13 | ## Effective bug reports 14 | 15 | * Include a descriptive subject line (e.g. what the problem is). 16 | * Include the version (i.e. the output of `datamash --version`). 17 | * Include the operating system and the type of hardware you are using 18 | (e.g. the output of `uname -a`). 19 | * Include the exact command and parameters you have used. 20 | * Clearly explain what is the output you expected to get, and what is 21 | the actual result you encountered. 
22 | * Include as much information as possible to reproduce the problem. 23 | If the problem happens on a very large input file, try to provide 24 | a minimal example (a subset of the input file) that still causes the problem. 25 | *Do not* include attachments over 40kB. 26 | * List policy is reply-to-all, and non-subscribers may post. 27 | * There may be a moderation delay for a first-time post, whether or not 28 | you subscribe. 29 | 30 | 31 | ## Mailing List Etiquette 32 | 33 | When sending messages to bug-datamash@gnu.org : 34 | 35 | * Send messages as plain text. 36 | * Do not send messages encoded as HTML nor encoded as base64 MIME nor 37 | included as multiple formats. 38 | * Avoid sending large messages, such as log files, system call trace 39 | output, and other content resulting in messages over about 40 kB. 40 | * Avoid sending screenshots (e.g. PNG files). When reporting errors 41 | you encounter on the terminal, copy and paste the text to your message. 42 | 43 | 44 | 45 | 46 | 47 | Copyright (C) 2017-2021 Assaf Gordon 48 | 49 | This program is free software: you can redistribute it and/or modify 50 | it under the terms of the GNU General Public License as published by 51 | the Free Software Foundation, either version 3 of the License, or 52 | (at your option) any later version. 53 | 54 | This program is distributed in the hope that it will be useful, 55 | but WITHOUT ANY WARRANTY; without even the implied warranty of 56 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 57 | GNU General Public License for more details. 58 | 59 | You should have received a copy of the GNU General Public License 60 | along with this program. If not, see . 61 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.txt: -------------------------------------------------------------------------------- 1 | Please **do not** send pull-requests or open new issues on Github. 
2 | 3 | Github is a downstream mirror and is not frequently monitored, 4 | all development is coordinated upstream on GNU resources. 5 | 6 | Send general questions, suggestions or bug reports to: 7 | bug-datamash@gnu.org 8 | 9 | Before reporting a new bug, please review previous discussions and bug reports 10 | on the Datamash Mailing list: 11 | https://lists.gnu.org/r/bug-datamash/ 12 | 13 | ## Effective bug reports 14 | 15 | * Include a descriptive subject line (e.g. what the problem is). 16 | * Include the version (i.e. the output of `datamash --version`). 17 | * Include the operating system and the type of hardware you are using 18 | (e.g. the output of `uname -a`). 19 | * Include the exact command and parameters you have used. 20 | * Clearly explain what is the output you expected to get, and what is 21 | the actual result you encountered. 22 | * Include as much information as possible to reproduce the problem. 23 | If the problem happens on a very large input file, try to provide 24 | a minimal example (a subset of the input file) that still causes the problem. 25 | *Do not* include attachments over 40kB. 26 | * List policy is reply-to-all, and non-subscribers may post. 27 | * There may be a moderation delay for a first-time post, whether or not 28 | you subscribe. 29 | 30 | 31 | ## Mailing List Etiquette 32 | 33 | When sending messages to bug-datamash@gnu.org : 34 | 35 | * Send messages as plain text. 36 | * Do not send messages encoded as HTML nor encoded as base64 MIME nor 37 | included as multiple formats. 38 | * Avoid sending large messages, such as log files, system call trace 39 | output, and other content resulting in messages over about 40 kB. 40 | * Avoid sending screenshots (e.g. PNG files). When reporting errors 41 | you encounter on the terminal, copy and paste the text to your message. 
42 | 43 | 44 | 45 | 46 | 47 | Copyright (C) 2017-2021 Assaf Gordon 48 | 49 | This program is free software: you can redistribute it and/or modify 50 | it under the terms of the GNU General Public License as published by 51 | the Free Software Foundation, either version 3 of the License, or 52 | (at your option) any later version. 53 | 54 | This program is distributed in the hope that it will be useful, 55 | but WITHOUT ANY WARRANTY; without even the implied warranty of 56 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 57 | GNU General Public License for more details. 58 | 59 | You should have received a copy of the GNU General Public License 60 | along with this program. If not, see . 61 | -------------------------------------------------------------------------------- /src/text-lines.h: -------------------------------------------------------------------------------- 1 | /* GNU Datamash - perform simple calculation on input data 2 | 3 | Copyright (C) 2013-2021 Assaf Gordon 4 | 5 | This file is part of GNU Datamash. 6 | 7 | GNU Datamash is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | GNU Datamash is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with GNU Datamash. If not, see . 
19 | */ 20 | 21 | /* Written by Assaf Gordon */ 22 | #ifndef __TEXT_LINES_H__ 23 | #define __TEXT_LINES_H__ 24 | /* A single field: 'len' bytes starting at 'buf'. */ 25 | struct field_record_t 26 | { 27 | size_t len; 28 | const char* buf; 29 | }; 30 | 31 | struct line_record_t 32 | { 33 | /* buffer of the entire line, as created with gnulib's 34 | readlinebuffer_delim */ 35 | struct linebuffer lbuf; 36 | 37 | /* array of fields. Each valid field is a pointer to 'lbuf' */ 38 | struct field_record_t *fields; 39 | size_t num_fields; /* number of fields in this line */ 40 | size_t alloc_fields; /* number of fields allocated */ 41 | }; 42 | 43 | static inline size_t 44 | line_record_length (const struct line_record_t *lr) 45 | { 46 | return lr->lbuf.length; 47 | } 48 | 49 | static inline const char* 50 | line_record_buffer (const struct line_record_t *lr) 51 | { 52 | return lr->lbuf.buffer; 53 | } 54 | 55 | static inline size_t 56 | line_record_num_fields (const struct line_record_t *lr) 57 | { 58 | return lr->num_fields; 59 | } 60 | /* Returns a pointer to field N (1 == first field). N is not range-checked. */ 61 | static inline const struct field_record_t* 62 | line_record_field_unsafe (const struct line_record_t *lr, const size_t n) 63 | { 64 | return &lr->fields[n-1]; 65 | } 66 | /* Stores the buffer and length of field N (1 == first field) in *pptr and *plen and returns true. Returns false, leaving both outputs untouched, if the line has fewer than N fields. N must not be zero. */ 67 | static inline bool 68 | line_record_get_field (const struct line_record_t *lr, const size_t n, 69 | const char ** /* out */ pptr, size_t* /*out*/ plen) 70 | { 71 | assert (n!=0); /* LCOV_EXCL_LINE */ 72 | if (line_record_num_fields (lr) < n) 73 | return false; 74 | 75 | *pptr = lr->fields[n-1].buf; 76 | *plen = lr->fields[n-1].len; 77 | return true; 78 | } 79 | 80 | void 81 | line_record_init (struct line_record_t* lr); 82 | 83 | bool 84 | line_record_fread (struct /* in/out */ line_record_t* lr, 85 | FILE *stream, char delimiter, bool skip_comments, 86 | bool vnlog_prologue); 87 | 88 | void 89 | line_record_free (struct line_record_t* lr); 90 | 91 | #endif 92 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | /ABOUT-NLS 2 | /ABOUT-NLS~ 3 | *~ 4 | .*.swp 5 | 6 | .version 7 | 8 | aclocal.m4 9 | autom4te.cache/* 10 | # sym-linked macro files, autogenerated by libtoolize: 11 | m4/libtool.m4 12 | m4/ltoptions.m4 13 | m4/ltsugar.m4 14 | m4/ltversion.m4 15 | m4/gnulib-cache.m4 16 | m4/lt~obsolete.m4 17 | ABOUT-NLS 18 | build-aux/ 19 | 20 | 21 | # Autogenerate man page 22 | datamash.1 23 | decorate.1 24 | 25 | config.h 26 | config.h.in 27 | config.in 28 | config.log 29 | config.status 30 | config.cache 31 | configure 32 | libtool 33 | 34 | */.deps/* 35 | 36 | src/.dirstamp 37 | 38 | config/compile 39 | config/config.guess 40 | config/config.sub 41 | config/depcomp 42 | config/install-sh 43 | config/ltmain.sh 44 | config/missing 45 | 46 | Makefile 47 | */Makefile 48 | Makefile.in 49 | */Makefile.in 50 | 51 | stamp-h1 52 | *.o 53 | 54 | ChangeLog 55 | INSTALL 56 | 57 | # Jekyll's _site 58 | _site/ 59 | bin/ 60 | 61 | # gnulib modules, auto-generated in "./reconf" 62 | lib/ 63 | 64 | # getext files 65 | po/*.pot 66 | po/Makefile.in 67 | po/Makevars 68 | po/.gitignore 69 | po/stamp-po 70 | po/POTFILES 71 | po/Rules-quot 72 | po/remove-potcdate.sed 73 | po/*.po 74 | po/*.gmo 75 | po/LINGUAS 76 | po/.reference 77 | 78 | test-suite.log 79 | 80 | # compiled binary 81 | /datamash 82 | /decorate 83 | # compiled binary (with mingw cross-compiling) 84 | /datamash.exe 85 | /decorate.exe 86 | 87 | # Documentation files 88 | doc/.dirstamp 89 | doc/.gitignore 90 | doc/datamash.info 91 | doc/datamash.pdf 92 | doc/datamash.t2p 93 | doc/stamp-vti 94 | doc/version.texi 95 | # Auto-generated coverage info with 'make coverage' 96 | doc/coverage 97 | # Auto-generated HTML manual with 'make web-manual' 98 | doc/manual 99 | # Auto-generated PDF-related files 100 | doc/datamash.aux 101 | doc/datamash.cp 102 | doc/datamash.cps 103 | doc/datamash.fn 104 | doc/datamash.ky 105 | doc/datamash.log 106 | doc/datamash.op 107 | 
doc/datamash.pg 108 | doc/datamash.toc 109 | doc/datamash.tp 110 | doc/datamash.vr 111 | 112 | # Coverage files 113 | src/*.gcda 114 | src/*.gcno 115 | lib/*.gcda 116 | lib/*.gcno 117 | datamash.lcov 118 | datamash-cov/ 119 | 120 | # clang static analysis files 121 | /clang_output_* 122 | 123 | # GNU Global tags 124 | /GPATH 125 | /GRTAGS 126 | /GTAGS 127 | 128 | /nohup.out 129 | 130 | datamash-*.tar.gz 131 | datamash-*.tar.gz.sig 132 | /GNUmakefile 133 | /README-release 134 | /maint.mk 135 | 136 | # Generated by pmccabe2html 137 | cyclo-datamash.html 138 | 139 | # Side-effects of expensive file system checks 140 | /bad_disk.img 141 | /bottles.txt 142 | /log.txt 143 | /numbers.txt 144 | /tiny_disk.img 145 | 146 | # Side-effects of syntax checks 147 | /.sc-start-sc_* 148 | 149 | # Side-effects of make release 150 | /vc-diffs 151 | -------------------------------------------------------------------------------- /src/double-format.c: -------------------------------------------------------------------------------- 1 | /* GNU Datamash - perform simple calculation on input data 2 | 3 | Copyright (C) 2022-2025 Timothy Rice 4 | Copyright (C) 2018-2021 Assaf Gordon 5 | Copyright (C) 1994-2018 Free Software Foundation, Inc. 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with this program. If not, see . 19 | */ 20 | 21 | /* 22 | Portions of this function were copied from GNU coreutils' seq.c, 23 | hence FSF copyright. 
24 | */ 25 | 26 | 27 | #include 28 | 29 | #include "system.h" 30 | #include "die.h" 31 | #include "quote.h" 32 | #include "xalloc.h" 33 | 34 | #include "text-options.h" 35 | #include "double-format.h" 36 | 37 | char* 38 | validate_double_format (char const *fmt) 39 | { 40 | size_t i; 41 | size_t len; 42 | char *out; 43 | 44 | len = strlen (fmt); 45 | 46 | if (len > MAX_NUMERIC_FORMAT_LEN - 1) { 47 | die (EXIT_FAILURE, 0, _("numeric format too large")); 48 | } 49 | 50 | /* extra space for NUL and 'L' printf-modifier */ 51 | out = xmalloc (len+2); 52 | 53 | for (i = 0; ! (fmt[i] == '%' && fmt[i + 1] != '%'); i += (fmt[i] == '%') + 1) 54 | if (!fmt[i]) 55 | die (EXIT_FAILURE, 0, 56 | _("format %s has no %% directive"), quote (fmt)); 57 | 58 | i++; 59 | i += strspn (fmt + i, "-+#0 '"); 60 | i += strspn (fmt + i, "0123456789"); 61 | if (fmt[i] == '.') 62 | { 63 | i++; 64 | i += strspn (fmt + i, "0123456789"); 65 | } 66 | 67 | if (!fmt[i]) 68 | die (EXIT_FAILURE, 0, 69 | _("format %s missing valid type after '%%'"), quote (fmt)); 70 | 71 | if (! strchr ("efgaEFGA", fmt[i])) 72 | die (EXIT_FAILURE, 0, 73 | _("format %s has unknown/invalid type %%%c directive"), 74 | quote (fmt), fmt[i]); 75 | 76 | /* Copy characters until the type character, add 'L', then the type, 77 | then the rest of the format string. */ 78 | memcpy (out, fmt, i); 79 | out[i] = 'L'; 80 | out[i+1] = fmt[i]; 81 | memcpy (out+i+2, fmt+i+1, len-i); 82 | out[len+1] = '\0'; 83 | 84 | for (i++; fmt[i] ; i += (fmt[i] == '%') + 1) 85 | if (fmt[i] == '%' && fmt[i + 1] != '%') 86 | die (EXIT_FAILURE, 0, _("format %s has too many %% directives"), 87 | quote (fmt)); 88 | 89 | return out; 90 | } 91 | -------------------------------------------------------------------------------- /tests/CuTmpdir.pm: -------------------------------------------------------------------------------- 1 | package CuTmpdir; 2 | # This file (CuTmpDir.pm) was copied from GNU Coretils. 
package CuTmpdir;
# It has the following copyright notice and license:
#
# create, then chdir into a temporary sub-directory

# Copyright (C) 2007-2013 Free Software Foundation, Inc.

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

use strict;
use warnings;

use File::Temp;
use File::Find;

# Name used as the prefix of every diagnostic printed by this module.
our $ME = $0 || "";

# Absolute name of the temporary directory created by import();
# stays undefined until (and unless) tempdir succeeds.
my $dir;

# Report that the working directory name cannot be untainted safely,
# then exit with status 77.
sub skip_test($)
{
  warn "$ME: skipping test: unsafe working directory name: '$_[0]'\n";
  exit 77;
}

# File::Find "wanted" callback: make the current entry (in $_) accessible
# to its owner when it is a real directory.
sub chmod_1
{
  my $name = $_;

  # Skip symlinks and non-directories.
  -l $name || !-d _
    and return;

  chmod 0700, $name;
}

# Give the owner full access to every directory below $dir, so that the
# tree can be removed afterwards.
sub chmod_tree
{
  # When tempdir fails, it croaks, which leaves $dir undefined.
  defined $dir
    or return;

  # Perform the equivalent of find "$dir" -type d -print0|xargs -0 chmod -R 700.
  my $options = {untaint => 1, wanted => \&chmod_1};
  find ($options, $dir);
}

# Called by "use CuTmpdir qw(PREFIX)": create the directory
# "PREFIX.tmp-XXXX" (relative prefixes are anchored at the current
# directory), install INT/TERM/HUP handlers that clean it up, and chdir
# into it.
sub import {
  my $prefix = $_[1];

  $ME eq '-' && defined $prefix
    and $ME = $prefix;

  if ($prefix !~ /^\//)
    {
      # Load Cwd on demand; fall back to '.' when it cannot be loaded.
      my $cwd = eval { require Cwd; 1 } ? Cwd::getcwd() : '.';
      $prefix = "$cwd/$prefix";
    }

  # Untaint for the upcoming mkdir.
  $prefix =~ m!^([-+\@\w./]+)$!
    or skip_test($prefix);
  $prefix = $1;

  my $original_pid = $$;

  my $on_sig_remove_tmpdir = sub {
    my ($sig) = @_;
    # Only the process that created the directory may remove it.
    if ($$ == $original_pid and defined $dir)
      {
        chmod_tree();
        # Older versions of File::Temp lack this method.
        exists &File::Temp::cleanup
          and File::Temp::cleanup();
      }
    # Re-raise the signal with the default disposition restored.
    $SIG{$sig} = 'DEFAULT';
    kill $sig, $$;
  };

  foreach my $sig (qw (INT TERM HUP))
    {
      $SIG{$sig} = $on_sig_remove_tmpdir;
    }

  $dir = File::Temp::tempdir("$prefix.tmp-XXXX", CLEANUP => 1 );
  chdir $dir
    or warn "$ME: failed to chdir to $dir: $!\n";
}

END {
  # chdir/chmod must not disturb the exit status of the test script.
  my $saved_status = $?;

  # Without a temporary directory there is nothing to leave or to chmod
  # (and $dir must not be interpolated into a diagnostic).
  if (defined $dir)
    {
      # Move cwd out of the directory we're about to remove.
      # This is required on some systems, and by some versions of File::Temp.
      chdir '..'
        or warn "$ME: failed to chdir to .. from $dir: $!\n";

      chmod_tree();
    }

  $? = $saved_status;
}

1;
/* Written by Assaf Gordon */
#ifndef __TEXT_OPTIONS_H__
#define __TEXT_OPTIONS_H__

/*
 Text Processing options, used by several modules.

 NOTE(review): this header uses CHAR_MAX, UCHAR_MAX, bool and putchar
 without including anything itself - presumably every includer pulls in
 the needed headers first; confirm before including it from a new file.
 */

/* The character marking end of line. Default to \n. */
extern char eolchar;

/* If TAB has this value, blanks separate fields.  */
enum { TAB_WHITESPACE = CHAR_MAX + 1 };

/* Tab character separating fields.  If TAB_WHITESPACE, then fields are
   separated by the empty string between a non-blank character and a blank
   character. */
extern int in_tab ;
/* The output field separator character, defaults to a TAB (ASCII 9) */
extern int out_tab ;

/* Global case-sensitivity option. Defaults to 'true' . */
extern bool case_sensitive ;

/* Largest possible format string */
#define MAX_NUMERIC_FORMAT_LEN 100
/* Numeric output format (default: "%.14Lg") */
extern char numeric_output_format[MAX_NUMERIC_FORMAT_LEN + 1];
/* number of bytes to allocate for output buffer */
extern int numeric_output_bufsize;

/* The character used to separate collapsed/uniqued strings */
extern char collapse_separator;

/* Should NA/NaN/empty values be silently ignored? */
extern bool remove_na_values;

/* if true, 'transpose' and 'reverse' require every line to have
   the exact same number of fields. Otherwise, the program
   will fail with non-zero exit code. */
extern bool strict;

/* if 'strict' is false, lines with fewer-than-expected fields
   will be filled with this value */
extern const char* missing_field_filler;

/* if true, skip comments line (lines starting with optional whitespace
   followed by '#' or ';'. See line_record_is_comment. */
extern bool skip_comments;

/* NOTE(review): undocumented in this header; presumably toggles the
   vnlog input/output mode (cf. the --vnlog option) - confirm. */
extern bool vnlog;

#define UCHAR_LIM (UCHAR_MAX + 1)
/* Lookup table with one entry per unsigned char value. */
extern bool blanks[UCHAR_LIM];

/* Initializes the 'blanks' table.  */
void
init_blank_table (void);

/* Write the output field separator (out_tab) to stdout. */
static inline void
print_field_separator (void)
{
  putchar (out_tab);
}

/* Write the end-of-line character (eolchar) to stdout. */
static inline void
print_line_separator (void)
{
  putchar (eolchar);
}


void
set_numeric_output_precision (const char* digits);

void
set_numeric_printf_format (const char* format);

#endif
## Draw one random item out of the digits 0..9, 1000 times over.
## Each digit should be returned at least once
## (unless we're extremely unlucky...)

INPUT=$(seq 0 9) || framework_failure_ "generating INPUT failed"

## The loop runs in an explicit subshell and leaves it on the first
## datamash failure: the exit status of a plain 'for' loop is the status
## of its LAST iteration only, which would hide a failure in any of the
## other 999 draws.
(
  for i in $(seq 1000) ;
  do
    echo "$INPUT" | datamash rand 1 || exit 1
  done
) > out_rand1 || framework_failure_ "test1 failed: datamash error"

## First Check: each number should be there once
RESULT=$(sort -n < out_rand1 | uniq | paste -d , -s -) ||
  framework_failure_ "test1 failed: error preparing first check"

[ "$RESULT" = "0,1,2,3,4,5,6,7,8,9" ] ||
  { warn_ "test1 failed. RESULT='$RESULT'." ; fail=1 ; }


## Second check - we expect (hope?) the distribution is uniform,
## and each number appears more-or-less equally.
## This is a poor-man's way of quasi-validation...
## Using 'datamash', count how many times each number appears,
## then, find the smallest count - in a uniform distribution,
## we expect each number to appear close to 100 times (1000 draws of 10 items).
##
## NOTE:
## We use 'datamash' to validate itself... but only after assuming the
## basic operations (sort, group, count, min) have been already tested.
RESULT=$(datamash --sort --group 1 count 1 < out_rand1 |
         datamash min 2) ||
  framework_failure_ "test1 failed: error preparing second check"

## We set the cut-off at 60 - if any number appeared less than 60 times,
## we *might* have a problem in the uniform randomness in 'datamash'.
if [ "$RESULT" -lt "60" ] ; then
  warn_ "Possible unifority problem in 'rand' operation."
  echo "--- distribution of numbers ---"
  datamash --sort --group 1 count 1 < out_rand1
  echo "--- end ---"
  echo "--- 1000 random draws start here---"
  cat out_rand1
  echo "---- end ----"
  fail=1
fi


Exit $fail
#!/bin/sh
# Unit Tests for GNU Datamash - perform simple calculation on input data

# Copyright (C) 2015-2021 Assaf Gordon
#
# This file is part of GNU Datamash.
#
# GNU Datamash is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# GNU Datamash is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with GNU Datamash.  If not, see <https://www.gnu.org/licenses/>.
#
#    Written by Assaf Gordon

##
## This script tests the strbin (string binning/hashing) operator
##

. "${test_dir=.}/init.sh"; path_prepend_ ./src

fail=0

## Ensure seq is useable
openbsd_seq_replacement_
seq 10 >/dev/null 2>/dev/null \
  || skip_ "requires a working seq"


# Generate input
seq 1000 | sed 's/^/id-/' > in \
  || framework_failure_ "generating INPUT failed"

# bin into 10 groups
datamash strbin 1 < in > out1 \
  || { warn_ "'datamash strbin 1' failed" ; fail=1 ; }

# Check output values
sort -n -u < out1 > out2 || framework_failure_ "failed to sort out1"


# Default binning is to 10 bins: accept only single digits.
# 'grep -v' selects every line that is NOT exactly one digit, so values
# such as "10", "-1" or an empty line are flagged as well (matching
# '^[^0-9]$' would only catch a single non-digit character).
grep -v '^[0-9]$' < out2 > /dev/null \
  && { warn_ "'datamash strbin 1' generated invalid output (out2):" ;
       cat out2 >&2 ;
       fail=1 ; }

# Test binning into varying number of bins
for i in 5 10 100 300 ;
do
  datamash strbin:$i 1 < in > out-$i \
    || { warn_ "'datamash strbin:$i 1' failed" ; fail=1 ; break ; }

  # Check output values: the largest bin must lie in 1..(i-1)
  max=$(sort -n -u -r < out-$i | head -n1)

  test -n "$max" \
    || { warn_ "'datamash strbin:$i 1' failed - max output is empty" ;
         fail=1 ;
         break ; }

  test "$max" -gt 0 \
    || { warn_ "'datamash strbin:$i 1' failed - max value too small ($max)";
         fail=1 ;
         break ; }

  test "$max" -lt "$i" \
    || { warn_ "'datamash strbin:$i 1' failed - max value too large ($max)";
         fail=1 ;
         break ; }
done


# Same string must result in the same bin,
# in the same run and in different runs.
# (the returned value, however, is machine-dependent)

text="hello-42-world"
for i in 5 10 100 300 ;
do
  bin1=$(printf "%s\n%s\n%s\n" "$text" "$text" "$text" \
           | datamash strbin:$i 1 | uniq)
  bin2=$(printf "%s\n" "$text" \
           | datamash strbin:$i 1 | uniq)

  test -n "$bin1" \
    || { warn_ "'datamash strbin:$i 1' failed on text '$text' - empty";
         fail=1 ;
         break ; }

  test "x$bin1" = "x$bin2" \
    || { warn_ "'datamash strbin:$i 1' failed on text '$text' - " \
               "bin1 ($bin1) doesn't match bin2 ($bin2)" ;
         fail=1 ;
         break ; }
done


Exit $fail
19 | */ 20 | 21 | /* Written by Assaf Gordon */ 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | #include "system.h" 30 | #include "xalloc.h" 31 | #include "linebuffer.h" 32 | #include "ignore-value.h" 33 | #include "intprops.h" 34 | 35 | #include "text-options.h" 36 | #include "text-lines.h" 37 | #include "column-headers.h" 38 | 39 | static size_t num_input_column_headers = 0 ; 40 | static char** input_column_headers; 41 | 42 | void free_column_headers () 43 | { 44 | for (size_t i = 0; i < num_input_column_headers; ++i) 45 | { 46 | free (input_column_headers[i]); 47 | input_column_headers[i] = NULL; 48 | } 49 | free (input_column_headers); 50 | input_column_headers = NULL; 51 | } 52 | 53 | size_t _GL_ATTRIBUTE_PURE 54 | get_num_column_headers () 55 | { 56 | return num_input_column_headers; 57 | } 58 | 59 | const char* _GL_ATTRIBUTE_PURE 60 | get_input_field_name (size_t field_num) 61 | { 62 | assert (field_num > 0 /* LCOV_EXCL_LINE */ 63 | && field_num <= num_input_column_headers); /* LCOV_EXCL_LINE */ 64 | return input_column_headers[field_num-1]; 65 | } 66 | 67 | size_t _GL_ATTRIBUTE_PURE 68 | get_input_field_number (const char* field_name) 69 | { 70 | assert (field_name != NULL); /* LCOV_EXCL_LINE */ 71 | assert (*field_name != 0); /* LCOV_EXCL_LINE */ 72 | for (size_t i = 0 ; i < num_input_column_headers ; ++i) 73 | { 74 | if (STREQ (field_name,input_column_headers[i])) 75 | return i+1; 76 | } 77 | return 0; 78 | } 79 | 80 | void 81 | build_input_line_headers (const struct line_record_t *lr, bool store_names) 82 | { 83 | char *str; 84 | size_t len = 0; 85 | const size_t num_fields = line_record_num_fields (lr); 86 | const size_t field_name_buf_size = 7+INT_BUFSIZE_BOUND (size_t)+1; 87 | 88 | num_input_column_headers = num_fields; 89 | input_column_headers = XNMALLOC (num_fields, char*); 90 | 91 | for (size_t i = 1; i <= num_fields; ++i) 92 | { 93 | if (!store_names) 94 | { 95 | str = xmalloc ( field_name_buf_size ); 
96 | ignore_value (snprintf (str, field_name_buf_size, 97 | "field-%"PRIuMAX,(uintmax_t)i)); 98 | } 99 | else 100 | { 101 | const char* tmp = NULL; 102 | line_record_get_field (lr, i, &tmp, &len); 103 | str = xmalloc ( len+1 ); 104 | memcpy (str, tmp, len); 105 | str[len] = 0; 106 | } 107 | 108 | input_column_headers[i-1] = str; 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /tests/datamash-md5.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | =pod 3 | Unit Tests for GNU Datamash - tests md5 operations 4 | 5 | Copyright (C) 2013-2021 Assaf Gordon 6 | Copyright (C) 2022-2025 Timothy Rice 7 | 8 | This file is part of GNU Datamash. 9 | 10 | GNU Datamash is free software: you can redistribute it and/or modify 11 | it under the terms of the GNU General Public License as published by 12 | the Free Software Foundation, either version 3 of the License, or 13 | (at your option) any later version. 14 | 15 | GNU Datamash is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | GNU General Public License for more details. 19 | 20 | You should have received a copy of the GNU General Public License 21 | along with GNU Datamash. If not, see . 22 | 23 | Written by Assaf Gordon. 24 | =cut 25 | ## NOTE: Digest::MD5 is supposed to be a core module, 26 | ## but some OSes don't distributed it (e.g. CentOS 7 requires 27 | ## a separate package 'perl-Digest-MD5'). 28 | ## If not available, skip this tests (instead of failing). 29 | use strict; 30 | use warnings; 31 | 32 | # Until a better way comes along to auto-use Coreutils Perl modules 33 | # as in the coreutils' autotools system. 34 | use Coreutils; 35 | use CuSkip; 36 | use CuTmpdir qw(datamash); 37 | 38 | ## Perl 5.8 and earlier do not have Digest::SHA as core module. 
## Digest::MD5 may be missing from the Perl installation:
## skip the test (instead of failing) if it cannot be loaded.
my $have_md5 = eval {
  require Digest::MD5;
  Digest::MD5->import (qw(md5_hex));
  1;
};

CuSkip::skip ("requires Perl with Digest::MD5 module\nload error:\n$@")
  unless $have_md5;

(my $program_name = $0) =~ s|.*/||;
my $prog_bin = 'datamash';

## Cross-Compiling portability hack:
##  under qemu/binfmt, argv[0] (which is used to report errors) will contain
##  the full path of the binary, if the binary is on the $PATH.
##  So we try to detect what is the actual returned value of the program
##  in case of an error.
my $prog = `$prog_bin ---print-progname`;
$prog = $prog_bin unless $prog;

# Turn off localization of executable's output.
@ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;

my $in_g1=<<'EOF';
A 100
A 10
A 50
A 35
EOF

# Header line, with custom field separator
my $in_hdr2=<<'EOF';
x:y:z
A:3:W
A:5:W
A:7:W
A:11:X
A:13:X
B:17:Y
B:19:Z
C:23:Z
EOF

# transform_column (TEXT, COLUMN, FUNCTION)
#
# Split TEXT into lines and every line into fields on single spaces,
# apply the code reference FUNCTION to field number COLUMN (1 = first
# field) of each line, and return the results joined into one string,
# each followed by a newline.
#
# Example:
#   my $data = "a 1\nb 2\n";
#   my $out = transform_column ($data, 2, \&md5_hex);
#   # out => md5_hex("1") . "\n" . md5_hex("2") . "\n" ;
sub transform_column
{
  my ($input_text, $input_column, $function) = @_;

  return join "",
    map { $function->($_) . "\n" }
    map { (split / /, $_)[ $input_column - 1 ] }
    split /\n/, $input_text;
}

# md5 of the second column of '$in_g1'
my $out_g1_md5 = transform_column ($in_g1, 2, \&md5_hex);

my @Tests =
(
  ['md5-1', '-W md5 2', {IN_PIPE=>$in_g1}, {OUT=>$out_g1_md5}],
);

my $save_temps = $ENV{SAVE_TEMPS};
my $verbose = $ENV{VERBOSE};

my $fail = run_tests ($program_name, $prog, \@Tests, $save_temps, $verbose);
exit $fail;
/* Written by Assaf Gordon */
#ifndef __OP_PARSER_H__
#define __OP_PARSER_H__

/* One column of the group-by specification. */
struct group_column_t
{
  size_t num;         /* 1 = first field */
  bool   by_name;     /* true if the user gave a column name */
  char*  name;        /* column name - to be converted to number after
                         header line is read */
};

/* One column together with the field operation requested on it. */
struct op_column_t
{
  size_t num;         /* 1 = first field */
  bool   by_name;     /* true if the user gave a column name */
  char*  name;        /* column name - to be converted to number after
                         header line is read */
  enum field_operation op;
};

/* Extra parameters of the 'check' processing mode. */
struct mode_check_params_t
{
  uintmax_t n_lines;  /* If not zero, require this number of lines */
  uintmax_t n_fields; /* if not zero, require this number of fields */
};

/* The complete result of parsing the command-line operations. */
struct datamash_ops
{
  enum processing_mode mode; /* the processing mode */
  bool header_required;      /* true if any of the fields (groups/operations)
                                used a named column instead of a number. */

  struct group_column_t *grps; /* group-by columns */
  size_t num_grps;
  size_t alloc_grps;

  struct fieldop *ops;      /* field operations */
  size_t num_ops;
  size_t alloc_ops;

  /* Additional parameters for mode operations
     (i.e. ones relating to the operation mode,
     not to specific field-ops) */
  union
  {
    struct mode_check_params_t check_params;
  } mode_params;
};

/* Parse the operations, return new datamash_ops structure.
   This function assumes new syntax:
   1. The first word is either a mode (e.g. transpose/groupby/reverse)
      or an operation (e.g. sum/min/max) - implying a 'group-by' mode.
   2. The rest of the parameters are operations. */
struct datamash_ops*
datamash_ops_parse ( int argc, const char* argv[] );

/* Parse the operations, return new datamash_ops structure.
   This function assumes old syntax:
   The user already specified "-g X,Y,Z" - the processing mode is known,
   and the grouping text 'X,Y,Z' is known.
   The function will only accept operations (e.g. sum/min/max). */
struct datamash_ops*
datamash_ops_parse_premode ( enum processing_mode pm,
                             const char* grouping_spec,
                             int argc, const char* argv[] );

/* Print the content of P for debugging purposes. */
void
datamash_ops_debug_print ( const struct datamash_ops* p );

/* Release P; presumably P was returned by one of the parse functions
   above - confirm against the implementation. */
void
datamash_ops_free (struct datamash_ops *p);

#endif
use Coreutils;
use CuSkip;
use CuTmpdir qw(datamash);

# Base name of this script, used by run_tests to label its output.
my $program_name = $0;
$program_name =~ s|.*/||;

my $prog_bin = 'datamash';

## Cross-Compiling portability hack:
##  under qemu/binfmt, argv[0] (which is used to report errors) will contain
##  the full path of the binary, if the binary is on the $PATH.
##  So we try to detect what is the actual returned value of the program
##  in case of an error.
my $prog = `$prog_bin --foobar 2>&1 | head -n 1 | cut -f1 -d:`;
if ($prog)
  {
    chomp $prog;
  }
if (!$prog)
  {
    $prog = $prog_bin;
  }

# Turn off localization of executable's output.
$ENV{$_} = 'C' for qw(LANGUAGE LANG LC_ALL);

# An unsorted input with a header line
my $INFILE=<<'EOF';
x y z
A % 1
B ( 2
A & 3
B = 4
EOF

# The same input with its first (header) line dropped.
my (undef, @INFILE_body_lines) = split /\n/, $INFILE, -1;
my $INFILE_NO_HEADER = join "\n", @INFILE_body_lines;

# Expected output: header consumed from the input, none printed.
my $exp_sort_in_header_full=<<'EOF';
A % 1 1,3
B ( 2 2,4
EOF

# Expected output: header-less input, generated header printed.
my $exp_sort_out_header_full=<<'EOF';
field-1 field-2 field-3 unique(field-3)
A % 1 1,3
B ( 2 2,4
EOF

# Expected output: header read from the input and printed.
my $exp_sort_headers_full=<<'EOF';
x y z unique(z)
A % 1 1,3
B ( 2 2,4
EOF

# Warning expected on stderr whenever --full meets a non-linewise operation.
my $deprecation_notice = sprintf
  "%s: Using -f/--full with non-linewise operations is deprecated and will be disabled in a future release.\n",
  $prog;

my @Tests =
(
  [
    'shdep01',
    '-t " " --sort --full --header-out -g 1 unique 3',
    {IN_PIPE => $INFILE_NO_HEADER},
    {OUT => $exp_sort_out_header_full},
    {ERR => $deprecation_notice},
  ],
  [
    'shdep02',
    '-t " " -g 1 --sort --full --header-in unique 3',
    {IN_PIPE => $INFILE},
    {OUT => $exp_sort_in_header_full},
    {ERR => $deprecation_notice},
  ],
  [
    'shdep03',
    '-t " " -g 1 --sort --full --headers unique 3',
    {IN_PIPE => $INFILE},
    {OUT => $exp_sort_headers_full},
    {ERR => $deprecation_notice},
  ],

  # Check sort-piping with empty input - should always produce empty output
  [
    'shdep04',
    '-t " " --sort --full unique 3',
    {IN_PIPE => ""},
    {OUT => ""},
    {ERR => $deprecation_notice},
  ],
);

my $save_temps = $ENV{SAVE_TEMPS};
my $verbose = $ENV{VERBOSE};

my $fail = run_tests ($program_name, $prog_bin, \@Tests, $save_temps, $verbose);
exit $fail;
#
#    Written by Assaf Gordon

##
## This script tests the sort piping code for errors
##

. "${test_dir=.}/init.sh"; path_prepend_ ./src

fail=0

require_paste_

## Ensure seq is useable
openbsd_seq_replacement_
seq 10 >/dev/null 2>/dev/null ||
  skip_ "requires a working seq"

## Cross-Compiling portability hack:
##  under qemu/binfmt, argv[0] (which is used to report errors) will contain
##  the full path of the binary, if the binary is on the $PATH.
##  So we try to detect what is the actual returned value of the program
##  in case of an error.
PROG_ARGV0=$(datamash --foobar 2>&1 | head -n 1 | cut -f1 -d:)
[ -z "$PROG_ARGV0" ] && PROG_ARGV0="datamash"

##
##
## Test preparations
##
##
GROUPPARAM=$(seq 1000 2000 | paste -d "," -s -) ||
  framework_failure_ "failed to construct too-long group parameter"

## The expected error message when 'sort' is not found
printf 'sh: sort: not found\ndatamash: read error (on close)' > exp_err2 ||
  framework_failure_ "failed to create exp_err2"

##
## Create a bad 'sort' executable, to simulate failed pipe/popen
##
BADDIR1=$(mktemp -d bad_sort.XXXXXX) ||
  framework_failure_ "Failed to create temp directory for bad-sort"
printf "#!/foo/bar/bad/interpreter" > "$BADDIR1/sort" ||
  framework_failure_ "Failed to create bad-sort: $BADDIR1/sort"
chmod a+x "$BADDIR1/sort" ||
  framework_failure_ "failed to make bad-sort executable"
ORIGPATH=$PATH

## The directory where the 'datamash' executable is.
## 'command -v' is the POSIX way to locate it, and both expansions are
## quoted so that a build directory containing blanks is not word-split.
DATAMASHPATH=$(command -v datamash) ||
  framework_failure_ "failed to find datamash's directory"
DATAMASHDIR=$(dirname "$DATAMASHPATH")
test -z "$DATAMASHDIR" &&
  framework_failure_ "failed to find datamash's directory"

## Create a 'sort' which will crash:
## it echoes two input lines, then divides by zero.
BADDIR=$(mktemp -d badsort.XXXXXX) ||
  framework_failure_ "failed to create bad-sort-dir"
echo '#!/bin/sh
read A
echo "$A"
read B
echo "$B"
Z=0
C=$((1/$Z))
' > "$BADDIR/sort" || framework_failure_ "failed to create $BADDIR/sort"
chmod a+x "$BADDIR/sort" ||
  framework_failure_ "failed to make $BADDIR/sort executable"


##
## Tests start here
##

##
## Test with non-existing 'sort' executable, by giving an invalid path
##
## NOTE: This run SHOULD return an error, hence the "&&" instead of "||"
##
seq 10 | datamash --sort --sort-cmd=/not/a/sort -g 1 sum 1 &&
  { warn_ "datamash --sort with non existing 'sort' did not fail " \
          "(it should have failed)" ; fail=1 ; }

##
## Test with a 'sort' that crashes
## NOTE: This run SHOULD return an error, hence the "&&" instead of "||"
##
seq 10 | datamash --sort --sort-cmd="${BADDIR}/sort" -g 1 sum 1 &&
  { warn_ "datamash --sort with crashing 'sort' did not fail " \
          "(it should have failed)" ; fail=1 ; }

Exit $fail
# Programmable completion for datamash(1).
#
# Offers option names when the current word starts with '-'; otherwise
# offers the operation modes, group-by operations and per-line operations
# that are still compatible with the words already typed.  Nothing is
# offered directly after an operation/mode name, because the next
# parameter is expected to be a field number.
#
# Results are returned in COMPREPLY, as required by 'complete -F'.
_datamash ()
{
    local cur prev words cword
    _get_comp_words_by_ref cur prev words cword

    # The word lists below are split on blanks/newlines; make sure the
    # default separators are in effect for the duration of this function.
    local IFS=$' \t\n'

    local modes="check crosstab groupby reverse rmdup transpose"

    local groupby_ops="sum min max absmin absmax range
        count first last rand
        unique uniq collapse countunique
        mean geomean harmmean trimmean median q1 q3 iqr perc mode antimode
        pstdev sstdev pvar svar mad madraw
        pskew sskew pkurt skurt dpo jarque
        pcov scov ppearson spearson dotprod"

    local line_ops="base64 debase64 md5 sha1 sha224 sha256 sha384 sha512
        round floor ceil trunc frac bin strbin dirname basename extname barename
        getnum cut echo"

    local datamash_short_options="-c -C -f -g -h -H -i -s -t -R -V -W -z"

    local datamash_long_options=" --skip-comments --full --group --header-in
        --header-out --headers --vnlog --ignore-case --sort --no-strict --filler
        --format --field-separator --narm --output-delimiter --round --whitespace
        --zero-terminated --collapse-delimiter --help --version"

    # Build 'a|b|c' alternations by word-splitting the lists, so the result
    # does not depend on how the lists above are wrapped or indented
    # (a doubled blank can never produce an empty '||' alternative).
    local modes_alt groupby_alt line_alt
    printf -v modes_alt   '%s|' $modes
    printf -v groupby_alt '%s|' $groupby_ops
    printf -v line_alt    '%s|' $line_ops
    modes_alt=${modes_alt%|}
    groupby_alt=${groupby_alt%|}
    line_alt=${line_alt%|}

    # Anchor every pattern: a word must BE an operation name, not merely
    # contain one (e.g. '--zero-terminated' contains 'min').
    local modes_re="^(${modes_alt})\$"
    local groupby_ops_re="^(${groupby_alt})\$"
    local line_ops_re="^(${line_alt})\$"
    local all_ops_re="^(${modes_alt}|${groupby_alt}|${line_alt})\$"

    # If the previous word is an operator, the next parameter should
    # be a numeric value, so don't offer any completion.
    if [[ "$prev" =~ $all_ops_re ]] ; then
        return 0
    fi

    # Based on the words typed so far, check which mode we're in.
    local suggest_modes=1
    local suggest_groupby_ops=1
    local suggest_line_ops=1
    local i=$((cword-1))

    while [ "$i" -gt 0 ] ; do
        local tmp_word=${words[$i]}

        if [[ "$tmp_word" =~ $modes_re ]] ; then
            suggest_modes=0
            case "$tmp_word" in
                crosstab|groupby) suggest_line_ops=0 ;;
            esac
        fi

        if [[ "$tmp_word" =~ $groupby_ops_re ]] ; then
            suggest_modes=0
            suggest_line_ops=0
        fi

        # if the user specified -g, we're in "groupby" mode
        if [[ "$tmp_word" = "-g" ]] ; then
            suggest_modes=0
            suggest_line_ops=0
        fi

        if [[ "$tmp_word" =~ $line_ops_re ]] ; then
            suggest_modes=0
            suggest_groupby_ops=0
        fi

        i=$((i-1))
    done

    # Options trump everything (if the user typed '-')
    if [[ "$cur" = "-"* ]] ; then
        COMPREPLY=( $(compgen -W \
            "$datamash_short_options $datamash_long_options" -- "$cur") )
        return 0
    fi

    # Suggest whatever categories are still possible.
    local suggest=""
    if [ "$suggest_modes" -eq 1 ] ; then
        suggest="$modes"
    fi
    if [ "$suggest_groupby_ops" -eq 1 ] ; then
        suggest="$suggest $groupby_ops"
    fi
    if [ "$suggest_line_ops" -eq 1 ] ; then
        suggest="$suggest $line_ops"
    fi

    COMPREPLY=( $(compgen -W "$suggest" -- "$cur") )
    return 0
}

complete -F _datamash datamash
9 | 10 | GNU Datamash is free software: you can redistribute it and/or modify 11 | it under the terms of the GNU General Public License as published by 12 | the Free Software Foundation, either version 3 of the License, or 13 | (at your option) any later version. 14 | 15 | GNU Datamash is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | GNU General Public License for more details. 19 | 20 | You should have received a copy of the GNU General Public License 21 | along with GNU Datamash. If not, see . 22 | 23 | Written by Assaf Gordon. 24 | =cut 25 | ## NOTE: Perl<5.10 don't have Digest::SHA core module - 26 | ## So skip only those tests if needed. 27 | ## Other line operations (e.g. md5/base64) are tested in the main 28 | ## unit test module 'datamash-tests.pl'. 29 | use strict; 30 | use warnings; 31 | 32 | # Until a better way comes along to auto-use Coreutils Perl modules 33 | # as in the coreutils' autotools system. 34 | use Coreutils; 35 | use CuSkip; 36 | use CuTmpdir qw(datamash); 37 | 38 | ## Perl 5.8 and earlier do not have Digest::SHA as core module. 39 | ## Skip the test if it is not found. 40 | my $have_sha = 41 | eval qq{use Digest::SHA qw(sha1_hex sha224_hex sha256_hex 42 | sha384_hex sha512_hex);1;}; 43 | 44 | CuSkip::skip "requires Perl>5.8 with Digest::SHA module\nload error:\n$@" 45 | unless $have_sha; 46 | 47 | (my $program_name = $0) =~ s|.*/||; 48 | my $prog_bin = 'datamash'; 49 | 50 | ## Cross-Compiling portability hack: 51 | ## under qemu/binfmt, argv[0] (which is used to report errors) will contain 52 | ## the full path of the binary, if the binary is on the $PATH. 53 | ## So we try to detect what is the actual returned value of the program 54 | ## in case of an error. 
# Apply FUNCTION to one column of a space-separated text block.
#
#   $input_text   - lines separated by "\n", fields separated by one space
#   $input_column - 1-based index of the field handed to FUNCTION
#   $function     - code reference, called once per input line
#
# Returns a single string: every value returned by FUNCTION, each
# followed by "\n", in input order.
#
# Example:
#   transform_column("a 1\nb 2\n", 2, \&md5_hex)
#     => md5_hex("1") . "\n" . md5_hex("2") . "\n"
sub transform_column($$$)
{
  my ($input_text, $input_column, $function) = @_;

  my $field_index = $input_column - 1;
  my @output_lines;

  foreach my $record (split("\n", $input_text))
    {
      my @fields = split / /, $record;
      # FUNCTION is invoked in list context; keep everything it returns.
      push @output_lines,
           map { "$_\n" } $function->($fields[$field_index]);
    }

  return join "", @output_lines;
}
-------------------------------------------------------------------------------- /tests/datamash-io-errors.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Unit Tests for GNU Datamash - I/O error simulation 3 | 4 | # Copyright (C) 2014-2021 Assaf Gordon 5 | # 6 | # This file is part of GNU Datamash. 7 | # 8 | # GNU Datamash is free software: you can redistribute it and/or modify 9 | # it under the terms of the GNU General Public License as published by 10 | # the Free Software Foundation, either version 3 of the License, or 11 | # (at your option) any later version. 12 | # 13 | # GNU Datamash is distributed in the hope that it will be useful, 14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | # GNU General Public License for more details. 17 | # 18 | # You should have received a copy of the GNU General Public License 19 | # along with GNU Datamash. If not, see . 20 | # 21 | # Written by Assaf Gordon 22 | 23 | ## 24 | ## This script tests datamash's handling of I/O errors. 25 | ## It requires special setup, and is skipped unless found. 26 | ## 27 | 28 | . "${test_dir=.}/init.sh"; path_prepend_ ./src 29 | 30 | expensive_ 31 | 32 | fail=0 33 | 34 | ## 35 | ## The required mounted file-systems 36 | ## 37 | FULLFS=/tmp/fullfs/ 38 | BADFS=/tmp/badfs/ 39 | 40 | which mountpoint >/dev/null 2>&1 || 41 | skip_ "requires mountpoint program" 42 | stdbuf --version >/dev/null 2>&1 || 43 | skip_ "requires GNU stdbuf program" 44 | stat --version >/dev/null 2>&1 || 45 | skip_ "requires GNU stat program" 46 | mountpoint -q "$FULLFS" || 47 | skip_ "requires special mounted file system '$FULLFS'" 48 | mountpoint -q "$BADFS" || 49 | skip_ "requires special mounted file system '$BADFS'" 50 | 51 | ## 52 | ## Clean files in the (almost) full file-system. 
##
## clean_full_fs: empty the (almost) full file-system mounted at $FULLFS
## and verify that it is left with a small, non-zero amount of free space.
##
## Regular files directly under $FULLFS are deleted, then the free-space
## figure reported by 'stat --file-system -c %a' is polled (sync + 1 second
## sleep between polls, at most 4 polls) until it rises above 5.
##
## Calls framework_failure_ when:
##   - the files cannot be deleted,
##   - the free space cannot be queried,
##   - the free space is still 5 or less after the last poll,
##   - the free space is 64 or more (too much room: writers would not
##     hit "no space" errors).
##
## Sets the shell variables FREE and fullfs_retries as a side effect.
##
clean_full_fs()
{
  find "$FULLFS" -maxdepth 1 -type f -delete ||
    framework_failure_ "failed to clean full-fs"

  # Give the system time to actually delete the files
  fullfs_retries=1
  FREE=0
  while test "$fullfs_retries" -lt 5 && test "$FREE" -le 5 ; do
    sync ; sleep 1
    FREE=$(stat --file-system -c %a "$FULLFS") ||
      framework_failure_ "failed to find free space on $FULLFS"
    fullfs_retries=$((fullfs_retries+1))
  done

  # Ensure the (almost) full file system has a bit of free space...
  test "$FREE" -gt 5 ||
    framework_failure_ "almost-full-file system has no free space"
  # ... but not too much (otherwise the program will not get "no space" errors).
  test "$FREE" -lt 64 ||
    framework_failure_ "almost-full-file system has too much free space"
}
Ensure the (almost) full file system gets full 83 | clean_full_fs 84 | seq 10000 >"$FULLFS/test.txt" 2>/dev/null && 85 | framework_failure_ "almost full file system did not trigger no-space error" 86 | clean_full_fs 87 | 88 | ## Test 1: 89 | ## input error, reading file directly 90 | datamash sum 1 < "$BADFS/numbers.txt" >/dev/null && 91 | { warn_ "datamash failed to detect read error" ; fail=1 ; } 92 | 93 | ## Test 2: 94 | ## input error, using sort (and popen/pipe) 95 | datamash -s -g 1 sum 1 < "$BADFS/numbers.txt" >/dev/null && 96 | { warn_ "datamash+sort failed to detect read error" ; fail=1 ; } 97 | 98 | ## Test 3: 99 | ## output error, default line-buffering 100 | seq 10000 | datamash -g 1 count 1 > "$FULLFS/test.txt" && 101 | { warn_ "datamash failed to detect no-space error" ; fail=1 ; } 102 | clean_full_fs 103 | 104 | ## Test 4: 105 | ## output error, with line-buffering. 106 | ## This means few of the first "write()" calls will succeed, 107 | ## and later ones should fail with "no space" (which is different than 108 | ## writing to "/dev/full"). 109 | seq 10000 | stdbuf -oL datamash -g 1 count 1 > "$FULLFS/test.txt" && 110 | { warn_ "datamash failed to detect no-space error" ; fail=1 ; } 111 | clean_full_fs 112 | 113 | Exit $fail 114 | -------------------------------------------------------------------------------- /tests/datamash-i18n-de.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | =pod 3 | Unit Tests for GNU Datamash - check German locale (de_DE.UTF-8). 4 | 5 | Copyright (C) 2013-2021 Assaf Gordon 6 | Copyright (C) 2022-2025 Timothy Rice 7 | 8 | This file is part of GNU Datamash. 9 | 10 | GNU Datamash is free software: you can redistribute it and/or modify 11 | it under the terms of the GNU General Public License as published by 12 | the Free Software Foundation, either version 3 of the License, or 13 | (at your option) any later version. 
14 | 15 | GNU Datamash is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | GNU General Public License for more details. 19 | 20 | You should have received a copy of the GNU General Public License 21 | along with GNU Datamash. If not, see . 22 | 23 | Written by Assaf Gordon and Timothy Rice. 24 | =cut 25 | use strict; 26 | use warnings; 27 | 28 | # Until a better way comes along to auto-use Coreutils Perl modules 29 | # as in the coreutils' autotools system. 30 | use Coreutils; 31 | use CuSkip; 32 | use CuTmpdir qw(datamash); 33 | use MIME::Base64 ; 34 | 35 | ## Skip this test if Deutsche (German) locale not found. 36 | use POSIX qw(locale_h); 37 | use locale; 38 | my $lc_de = setlocale(LC_ALL, "de_DE.utf8"); 39 | CuSkip::skip "requires de_DE.utf8 locale\n" 40 | unless defined($lc_de); 41 | 42 | (my $program_name = $0) =~ s|.*/||; 43 | my $prog_bin = 'datamash'; 44 | 45 | ## Cross-Compiling portability hack: 46 | ## under qemu/binfmt, argv[0] (which is used to report errors) will contain 47 | ## the full path of the binary, if the binary is on the $PATH. 48 | ## So we try to detect what is the actual returned value of the program 49 | ## in case of an error. 50 | my $prog = `$prog_bin ---print-progname`; 51 | $prog = $prog_bin unless $prog; 52 | 53 | ## Portability hack 54 | ## Check if the system's sort supports stable sorting ('-s'). 
## If it doesn't - skip some tests
# system() yields -1 when the command could not be started and keeps the
# terminating signal in the low 7 bits; either case means the probe itself
# failed, which is a framework error rather than "no stable sort".
my $rc = system("sort -s < /dev/null > /dev/null 2>/dev/null");
die "testing framework failure: failed to execute sort -s"
  if ( ($rc == -1) || ($rc & 127) );
# The exit status of the command is in the high byte.
my $sort_exit_code = ($rc >> 8);
# NOTE(review): $have_stable_sort is not referenced by any test in the
# table below - confirm whether it is still needed.
my $have_stable_sort = ($sort_exit_code==0);


# German-locale tests: run the program under de_DE.utf8
@ENV{qw(LANGUAGE LANG LC_ALL)} = ('de_DE.utf8') x 3;

# Each entry: [ test name, datamash arguments, {IN_PIPE=>input}, {OUT=>expected} ]
my @Prufungen =
(
  # Check that the comma works as the decimal separator
  ['de1', 'sum 1', {IN_PIPE=>"1,1\n"}, {OUT=>"1,1\n"}],
  ['de2', 'sum 1,2', {IN_PIPE=>"1,1\t2,2\n"}, {OUT=>"1,1\t2,2\n"}],
  ['de3', 'count 1,2,3', {IN_PIPE=>"1,1\t2,2\t3,3\n"}, {OUT=>"1\t1\t1\n"}],

  # There is a bug where the bin operation does not respect
  # the locale's choice of decimal separator.
  # TODO: Be able to uncomment the following line.
  #['de4', 'bin:0,1 1', {IN_PIPE=>"1,15\n"}, {OUT=>"1,1\n"}],

  # Comma as field separator is problematic for numeric operations
  ['de5', '-t, cut 2,1', {IN_PIPE=>"1,2\n"}, {OUT=>"2,1\n"}],
  ['de6', '-t, unique 1,2', {IN_PIPE=>"1,2\n"}, {OUT=>"1,2\n"}],
  ['de7', '-t, count 1,2', {IN_PIPE=>"1,2\n"}, {OUT=>"1,1\n"}],
  ['de8', '-t, countunique 1,2', {IN_PIPE=>"1,2\n"}, {OUT=>"1,1\n"}],
  ['de9', '-t, rmdup 1', {IN_PIPE=>"1,2\n"}, {OUT=>"1,2\n"}],
  ['de10', '-t, rmdup 2', {IN_PIPE=>"1,2\n"}, {OUT=>"1,2\n"}],
  ['de11', '-t, sum 1,2', {IN_PIPE=>"1,2\n"}, {OUT=>"1,2\n"}],
  ['de12', '-t, sum 1,2,3', {IN_PIPE=>"1,2,3\n"}, {OUT=>"1,2,3\n"}],
  ['de13', '-st, groupby 1 sum 2,3',
    {IN_PIPE=>"a,14,1\nb,1,14\na,2,1\n"}, {OUT=>"a,16,2\nb,1,14\n"}],

  # TODO: make the getnum operation locale-aware
  #['de14', 'getnum 1', {IN_PIPE=>"bar-1,2\n"}, {OUT=>"1,2\n"}],
  #['de15', 'getnum:p 1', {IN_PIPE=>"bar-1,2\n"}, {OUT=>"1,2\n"}],
  #['de16', 'getnum:d 1', {IN_PIPE=>"bar-1,2\n"}, {OUT=>"-1,2\n"}],

);

my $save_temps = $ENV{SAVE_TEMPS};
my $verbose = $ENV{VERBOSE}; 99 | 100 | my $fail = run_tests ($program_name, $prog, \@Prufungen, $save_temps, $verbose); 101 | 102 | exit $fail; 103 | -------------------------------------------------------------------------------- /tests/datamash-sort-header.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | =pod 3 | Unit Tests for GNU Datamash - perform simple calculation on input data 4 | 5 | Copyright (C) 2022-2025 Timothy Rice 6 | Copyright (C) 2013-2021 Assaf Gordon 7 | 8 | This file is part of GNU Datamash. 9 | 10 | GNU Datamash is free software: you can redistribute it and/or modify 11 | it under the terms of the GNU General Public License as published by 12 | the Free Software Foundation, either version 3 of the License, or 13 | (at your option) any later version. 14 | 15 | GNU Datamash is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | GNU General Public License for more details. 19 | 20 | You should have received a copy of the GNU General Public License 21 | along with GNU Datamash. If not, see . 22 | 23 | Written by Assaf Gordon and Tim Rice. 24 | =cut 25 | use strict; 26 | use warnings; 27 | use List::Util qw/max/; 28 | use Data::Dumper; 29 | 30 | # Until a better way comes along to auto-use Coreutils Perl modules 31 | # as in the coreutils' autotools system. 32 | use Coreutils; 33 | use CuSkip; 34 | use CuTmpdir qw(datamash); 35 | 36 | (my $program_name = $0) =~ s|.*/||; 37 | my $prog_bin = 'datamash'; 38 | 39 | ## Cross-Compiling portability hack: 40 | ## under qemu/binfmt, argv[0] (which is used to report errors) will contain 41 | ## the full path of the binary, if the binary is on the $PATH. 42 | ## So we try to detect what is the actual returned value of the program 43 | ## in case of an error. 
44 | my $prog = `$prog_bin --foobar 2>&1 | head -n 1 | cut -f1 -d:`; 45 | chomp $prog if $prog; 46 | $prog = $prog_bin unless $prog; 47 | 48 | # Turn off localization of executable's output. 49 | @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3; 50 | 51 | # An unsorted input with a header line 52 | my $INFILE=<<'EOF'; 53 | x y z 54 | A % 1 55 | B ( 2 56 | A & 3 57 | B = 4 58 | EOF 59 | 60 | my @INFILE_lines = split /\n/, $INFILE, -1; 61 | my $INFILE_NO_HEADER = join("\n", @INFILE_lines[1..$#INFILE_lines]); 62 | 63 | # The expected output with different option combinations 64 | my $exp_no_sort_no_header=<<'EOF'; 65 | x z 66 | A 1 67 | B 2 68 | A 3 69 | B 4 70 | EOF 71 | 72 | my $exp_no_sort_in_header=<<'EOF'; 73 | A 1 74 | B 2 75 | A 3 76 | B 4 77 | EOF 78 | 79 | my $exp_sort_in_header=<<'EOF'; 80 | A 1,3 81 | B 2,4 82 | EOF 83 | 84 | my $exp_no_sort_headers=<<'EOF'; 85 | GroupBy(x) unique(z) 86 | A 1 87 | B 2 88 | A 3 89 | B 4 90 | EOF 91 | 92 | my $exp_sort_headers=<<'EOF'; 93 | GroupBy(x) unique(z) 94 | A 1,3 95 | B 2,4 96 | EOF 97 | 98 | my $exp_sort_out_header=<<'EOF'; 99 | GroupBy(field-1) unique(field-3) 100 | A 1,3 101 | B 2,4 102 | EOF 103 | 104 | my @Tests = 105 | ( 106 | # Simple transpose and reverse 107 | ['sh01', '-t " " -g 1 unique 3', 108 | {IN_PIPE=>$INFILE}, {OUT=>$exp_no_sort_no_header}], 109 | ['sh02', '-t " " -g 1 --header-in unique 3', 110 | {IN_PIPE=>$INFILE}, {OUT=>$exp_no_sort_in_header}], 111 | ['sh03', '-t " " -g 1 --sort --header-in unique 3', 112 | {IN_PIPE=>$INFILE}, {OUT=>$exp_sort_in_header}], 113 | ['sh04', '-t " " -g 1 --headers unique 3', 114 | {IN_PIPE=>$INFILE}, {OUT=>$exp_no_sort_headers}], 115 | ['sh05', '-t " " -g 1 --sort --headers unique 3', 116 | {IN_PIPE=>$INFILE}, {OUT=>$exp_sort_headers}], 117 | ['sh06', '-t " " -sH -g 1 unique 3', 118 | {IN_PIPE=>$INFILE}, {OUT=>$exp_sort_headers}], 119 | ['sh07', '-t " " --sort --header-out -g 1 unique 3', 120 | {IN_PIPE=>$INFILE_NO_HEADER}, {OUT=>$exp_sort_out_header}], 121 | 122 | # Check 
sort-piping with empty input - should always produce empty output 123 | ['sh08', '-t " " --sort unique 3', 124 | {IN_PIPE=>""}, {OUT=>""}], 125 | ['sh09', '-t " " --sort --header-in unique 3', 126 | {IN_PIPE=>""}, {OUT=>""}], 127 | ['sh10', '-t " " --sort --header-out unique 3', 128 | {IN_PIPE=>""}, {OUT=>""}], 129 | ['sh11', '-t " " --sort --headers unique 3', 130 | {IN_PIPE=>""}, {OUT=>""}], 131 | 132 | ); 133 | 134 | my $save_temps = $ENV{SAVE_TEMPS}; 135 | my $verbose = $ENV{VERBOSE}; 136 | 137 | my $fail = run_tests ($program_name, $prog_bin, \@Tests, $save_temps, $verbose); 138 | exit $fail; 139 | -------------------------------------------------------------------------------- /src/text-options.c: -------------------------------------------------------------------------------- 1 | /* GNU Datamash - perform simple calculation on input data 2 | 3 | Copyright (C) 2013-2021 Assaf Gordon 4 | Copyright (C) 2022-2025 Timothy Rice 5 | 6 | This file is part of GNU Datamash. 7 | 8 | GNU Datamash is free software: you can redistribute it and/or modify 9 | it under the terms of the GNU General Public License as published by 10 | the Free Software Foundation, either version 3 of the License, or 11 | (at your option) any later version. 12 | 13 | GNU Datamash is distributed in the hope that it will be useful, 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | GNU General Public License for more details. 17 | 18 | You should have received a copy of the GNU General Public License 19 | along with GNU Datamash. If not, see . 20 | */ 21 | 22 | /* Written by Assaf Gordon */ 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | #include "system.h" 29 | 30 | #include "die.h" 31 | #include "double-format.h" 32 | #include "text-options.h" 33 | 34 | /* The character marking end of line. Default to \n. */ 35 | char eolchar = '\n'; 36 | 37 | /* Tab character separating fields. 
If TAB_WHITESPACE, then fields are 38 | separated by the empty string between a non-blank character and a blank 39 | character. */ 40 | int in_tab = '\t'; 41 | int out_tab= '\t'; 42 | 43 | /* Global case-sensitivity option. Defaults to 'true' . */ 44 | bool case_sensitive = true; 45 | 46 | /* In the future: allow users to change this */ 47 | char numeric_output_format[MAX_NUMERIC_FORMAT_LEN + 1] = "%.14Lg"; 48 | 49 | /* number of bytes to allocate for output buffer */ 50 | int numeric_output_bufsize = 200; 51 | 52 | /* The character used to separate collapsed/uniqued strings */ 53 | char collapse_separator = ','; 54 | 55 | /* Should NA/NaN/empty values be silengtly ignored? */ 56 | bool remove_na_values = false; 57 | 58 | /* if true, 'transpose' and 'reverse' require every line to have 59 | the exact same number of fields. Otherwise, the program 60 | will fail with non-zero exit code. */ 61 | bool strict = true; 62 | 63 | /* if 'strict' is false, lines with fewer-than-expected fields 64 | will be filled with this value */ 65 | const char* missing_field_filler = "N/A"; 66 | 67 | /* if true, skip comments line (lines starting with optional whitespace 68 | followed by '#' or ';'. See line_record_is_comment. */ 69 | bool skip_comments = false; 70 | 71 | bool vnlog = false; 72 | 73 | #define UCHAR_LIM (UCHAR_MAX + 1) 74 | bool blanks[UCHAR_LIM]; 75 | 76 | void 77 | init_blank_table (void) 78 | { 79 | size_t i; 80 | 81 | for (i = 0; i < UCHAR_LIM; ++i) 82 | { 83 | blanks[i] = !! isblank (i); 84 | } 85 | } 86 | 87 | /* Force generation of these inline'd symbols, needed to avoid 88 | "undefined reference" when compiling with coverage instrumentation. 
89 | See: http://stackoverflow.com/a/16245669 */ 90 | void print_field_separator (); 91 | void print_line_separator (); 92 | 93 | 94 | 95 | /* Calculate the required size of the output buffer */ 96 | static void 97 | finalize_numeric_output_buffer () 98 | { 99 | char c; 100 | long double d = LDBL_MAX; 101 | int n = snprintf (&c, 1, numeric_output_format, d); 102 | numeric_output_bufsize = n + 100 ; 103 | } 104 | 105 | void 106 | set_numeric_output_precision (const char* digits) 107 | { 108 | long int l; 109 | char *p; 110 | 111 | if (digits == NULL || digits[0] == '\0') 112 | die (EXIT_FAILURE, 0, _("missing rounding digits value")); 113 | 114 | errno = 0; 115 | l = strtol (digits, &p, 10); 116 | if (errno != 0 || *p != '\0' || l <=0 || l> 50) 117 | die (EXIT_FAILURE, 0, _("invalid rounding digits value %s"), 118 | quote (digits)); 119 | 120 | snprintf (numeric_output_format, sizeof (numeric_output_format), "%%.%dLf", 121 | (int)l); 122 | 123 | finalize_numeric_output_buffer (); 124 | } 125 | 126 | void 127 | set_numeric_printf_format (const char* format) 128 | { 129 | char *new_format = validate_double_format (format); 130 | snprintf (numeric_output_format, sizeof (numeric_output_format), "%s", 131 | new_format); 132 | free (new_format); 133 | finalize_numeric_output_buffer (); 134 | } 135 | -------------------------------------------------------------------------------- /tests/datamash-check-tabular.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | =pod 3 | Unit Tests for GNU Datamash - perform simple calculation on input data 4 | Tests for 'transpose' and 'reverse' operation modes. 5 | 6 | 7 | Copyright (C) 2013-2021 Assaf Gordon 8 | Copyright (C) 2022-2025 Timothy Rice 9 | 10 | This file is part of GNU Datamash. 
11 | 12 | GNU Datamash is free software: you can redistribute it and/or modify 13 | it under the terms of the GNU General Public License as published by 14 | the Free Software Foundation, either version 3 of the License, or 15 | (at your option) any later version. 16 | 17 | GNU Datamash is distributed in the hope that it will be useful, 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | GNU General Public License for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with GNU Datamash. If not, see . 24 | 25 | Written by Assaf Gordon. 26 | =cut 27 | use strict; 28 | use warnings; 29 | use List::Util qw/max/; 30 | use Data::Dumper; 31 | 32 | # Until a better way comes along to auto-use Coreutils Perl modules 33 | # as in the coreutils' autotools system. 34 | use Coreutils; 35 | use CuSkip; 36 | use CuTmpdir qw(datamash); 37 | 38 | (my $program_name = $0) =~ s|.*/||; 39 | my $prog_bin = 'datamash'; 40 | 41 | my $prog = `$prog_bin ---print-progname`; 42 | 43 | # Turn off localization of executable's output. 44 | @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3; 45 | 46 | my $in1=<<'EOF'; 47 | a x 1 48 | b z 6 49 | c x 7 50 | EOF 51 | 52 | # missing field on second line 53 | my $in2=<<'EOF'; 54 | a x 1 55 | b z 56 | c x 7 57 | EOF 58 | 59 | # Same as in2, with whitespace delimiters 60 | my $in2_ws=<<"EOF"; 61 | a x \t 1 62 | b \t z 63 | c x 7 64 | EOF 65 | 66 | # second line has 2 tab characters, thus 3 fields 67 | # (the last field is empty). 68 | # version 1.1.0 and before rejected such input. 
69 | my $in3=<<"EOF"; 70 | a x 1 71 | b z\t 72 | c x 7 73 | EOF 74 | 75 | # Same as in3, with whitespace delimiters 76 | my $in3_ws=<<"EOF"; 77 | a x \t 1 78 | b \t z \t 79 | c\t\t\tx \t 7 80 | EOF 81 | 82 | 83 | # one line 84 | my $in4=<<'EOF'; 85 | a x 1 86 | EOF 87 | 88 | # one field 89 | my $in5=<<'EOF'; 90 | a 91 | b 92 | c 93 | d 94 | EOF 95 | 96 | # one field, with bad input (fourth line has 0 fields) 97 | my $in6=<<'EOF'; 98 | a 99 | b 100 | c 101 | 102 | e 103 | EOF 104 | 105 | my @Tests = 106 | ( 107 | ['c1', 'check', {IN_PIPE=>$in1}, {OUT=>"3 lines, 3 fields\n"}], 108 | 109 | ['c2', 'check', {IN_PIPE=>$in4}, {OUT=>"1 line, 3 fields\n"}], 110 | ['c3', 'check', {IN_PIPE=>$in5}, {OUT=>"4 lines, 1 field\n"}], 111 | ['c4', 'check', {IN_PIPE=>$in3}, {OUT=>"3 lines, 3 fields\n"}], 112 | ['c5', '-W check', {IN_PIPE=>$in3_ws}, {OUT=>"3 lines, 3 fields\n"}], 113 | 114 | # Check bad input: 115 | # The first four lines will be something like: 116 | # 'line X has N fields:' 117 | # ' [content of line X]' 118 | # 'line Y has M fields:' 119 | # ' [content of line Y]' 120 | # The ERR_SUBSTR will remove these messages, as they are highly variable 121 | # and dependant on the input. Then only the last line of error message 122 | # is checked. 123 | ['e1', 'check', {IN_PIPE=>$in2}, {EXIT=>1}, 124 | {ERR_SUBST => 's/^(li| ).*$//'}, 125 | {ERR => "\n\n\n\n$prog: check failed: line 2 has 2 fields " . 126 | "(previous line had 3)\n"}], 127 | ['e1ws', '-W check', {IN_PIPE=>$in2_ws}, {EXIT=>1}, 128 | {ERR_SUBST => 's/^(li| ).*$//'}, 129 | {ERR => "\n\n\n\n$prog: check failed: line 2 has 2 fields " . 130 | "(previous line had 3)\n"}], 131 | 132 | ['e2', 'check', {IN_PIPE=>$in6}, {EXIT=>1}, 133 | {ERR_SUBST => 's/^(li| ).*$//'}, 134 | {ERR => "\n\n\n\n$prog: check failed: line 4 has 0 fields " . 
135 | "(previous line had 1)\n"}], 136 | ['e2ws', '-W check', {IN_PIPE=>$in6}, {EXIT=>1}, 137 | {ERR_SUBST => 's/^(li| ).*$//'}, 138 | {ERR => "\n\n\n\n$prog: check failed: line 4 has 0 fields " . 139 | "(previous line had 1)\n"}], 140 | ); 141 | 142 | my $save_temps = $ENV{SAVE_TEMPS}; 143 | my $verbose = $ENV{VERBOSE}; 144 | 145 | my $fail = run_tests ($program_name, $prog_bin, \@Tests, $save_temps, $verbose); 146 | exit $fail; 147 | -------------------------------------------------------------------------------- /m4/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | /00gnulib.m4 3 | /absolute-header.m4 4 | /af_alg.m4 5 | /alloca.m4 6 | /arpa_inet_h.m4 7 | /assert.m4 8 | /base64.m4 9 | /byteswap.m4 10 | /calloc.m4 11 | /ceill.m4 12 | /ceil.m4 13 | /check-math-lib.m4 14 | /close.m4 15 | /codeset.m4 16 | /configmake.m4 17 | /ctype_h.m4 18 | /ctype.m4 19 | /double-slash-root.m4 20 | /dup2.m4 21 | /eealloc.m4 22 | /errno_h.m4 23 | /error.m4 24 | /expl.m4 25 | /exp.m4 26 | /exponentd.m4 27 | /exponentf.m4 28 | /exponentl.m4 29 | /extensions.m4 30 | /extern-inline.m4 31 | /extern-inline.m4~ 32 | /fabsl.m4 33 | /fabs.m4 34 | /fcntl_h.m4 35 | /fcntl.m4 36 | /fcntl-o.m4 37 | /fflush.m4 38 | /flexmember.m4 39 | /float_h.m4 40 | /floorl.m4 41 | /floor.m4 42 | /fpending.m4 43 | /fpieee.m4 44 | /fpurge.m4 45 | /freading.m4 46 | /free.m4 47 | /frexpl.m4 48 | /frexp.m4 49 | /fseek.m4 50 | /fseeko.m4 51 | /fstat.m4 52 | /ftell.m4 53 | /ftello.m4 54 | /getdtablesize.m4 55 | /getlocalename_l.m4 56 | /getopt.m4 57 | /getprogname.m4 58 | /getrandom.m4 59 | /gettext.m4 60 | /glibc21.m4 61 | /glibc2.m4 62 | /gl-openssl.m4 63 | /gnulib-cache.m4 64 | /gnulib-common.m4 65 | /gnulib-comp.m4 66 | /gnulib-tool.m4 67 | /host-cpu-c-abi.m4 68 | /iconv_h.m4 69 | /iconv.m4 70 | /iconv_open.m4 71 | /include_next.m4 72 | /inet_pton.m4 73 | /init-package-version.m4 74 | /__inline.m4 75 | /inline.m4 76 | /intdiv0.m4 77 | /intldir.m4 78 | /intl.m4 
79 | /intlmacosx.m4 80 | /intl-thread-locale.m4 81 | /intmax.m4 82 | /intmax_t.m4 83 | /inttostr.m4 84 | /inttypes_h.m4 85 | /inttypes.m4 86 | /inttypes-pri.m4 87 | /isblank.m4 88 | /isfinite.m4 89 | /isinf.m4 90 | /isnand.m4 91 | /isnanf.m4 92 | /isnanl.m4 93 | /iswblank.m4 94 | /iswdigit.m4 95 | /iswxdigit.m4 96 | /largefile.m4 97 | /lcmessage.m4 98 | /ldexpl.m4 99 | /ldexp.m4 100 | /lib-ld.m4 101 | /lib-link.m4 102 | /lib-prefix.m4 103 | /libunistring-base.m4 104 | /limits-h.m4 105 | /localcharset.m4 106 | /localeconv.m4 107 | /locale-fr.m4 108 | /locale_h.m4 109 | /locale-ja.m4 110 | /localename.m4 111 | /locale-zh.m4 112 | /lock.m4 113 | /logl.m4 114 | /log.m4 115 | /longlong.m4 116 | /lseek.m4 117 | /malloca.m4 118 | /malloc.m4 119 | /mathfunc.m4 120 | /math_h.m4 121 | /mbchar.m4 122 | /mbiter.m4 123 | /mbrtowc.m4 124 | /mbsinit.m4 125 | /mbslen.m4 126 | /mbstate_t.m4 127 | /md5.m4 128 | /memchr.m4 129 | /minmax.m4 130 | /mmap-anon.m4 131 | /mode_t.m4 132 | /modfl.m4 133 | /modf.m4 134 | /msvc-inval.m4 135 | /msvc-nothrow.m4 136 | /multiarch.m4 137 | /netinet_in_h.m4 138 | /nls.m4 139 | /nocrash.m4 140 | /off_t.m4 141 | /open-cloexec.m4 142 | /open.m4 143 | /open-slash.m4 144 | /pathmax.m4 145 | /pclose.m4 146 | /pid_t.m4 147 | /po.m4 148 | /popen.m4 149 | /printf.m4 150 | /printf-posix.m4 151 | /progtest.m4 152 | /pthread_rwlock_rdlock.m4 153 | /quotearg.m4 154 | /random.m4 155 | /random_r.m4 156 | /reallocarray.m4 157 | /realloc.m4 158 | /roundl.m4 159 | /round.m4 160 | /setlocale.m4 161 | /setlocale_null.m4 162 | /sha1.m4 163 | /sha256.m4 164 | /sha512.m4 165 | /signbit.m4 166 | /size_max.m4 167 | /snprintf.m4 168 | /socklen.m4 169 | /sockpfaf.m4 170 | /sqrtl.m4 171 | /sqrt.m4 172 | /ssize_t.m4 173 | /stat.m4 174 | /stat-time.m4 175 | /stdalign.m4 176 | /stdarg.m4 177 | /stdbool.m4 178 | /stddef_h.m4 179 | /std-gnu11.m4 180 | /stdint_h.m4 181 | /stdint.m4 182 | /stdio_h.m4 183 | /stdlib_h.m4 184 | /stdnoreturn.m4 185 | /stpcpy.m4 186 | /strcasecmp.m4 187 | 
/strdup.m4 188 | /strerror.m4 189 | /string_h.m4 190 | /strings_h.m4 191 | /strncasecmp.m4 192 | /strndup.m4 193 | /strnlen.m4 194 | /strsep.m4 195 | /strtod.m4 196 | /strtold.m4 197 | /strtoll.m4 198 | /strtoull.m4 199 | /strtoumax.m4 200 | /sys_random_h.m4 201 | /sys_socket_h.m4 202 | /sys_stat_h.m4 203 | /sys_types_h.m4 204 | /sys_uio_h.m4 205 | /sys_wait_h.m4 206 | /threadlib.m4 207 | /time_h.m4 208 | /truncl.m4 209 | /trunc.m4 210 | /uintmax_t.m4 211 | /ungetc.m4 212 | /unistd_h.m4 213 | /unlocked-io.m4 214 | /vararrays.m4 215 | /vasnprintf.m4 216 | /version-etc.m4 217 | /visibility.m4 218 | /waitpid.m4 219 | /warnings.m4 220 | /warn-on-use.m4 221 | /wchar_h.m4 222 | /wchar_t.m4 223 | /wctype_h.m4 224 | /wcwidth.m4 225 | /wint_t.m4 226 | /xalloc.m4 227 | /xsize.m4 228 | /xstrndup.m4 229 | /xstrtol.m4 230 | /year2038.m4 231 | /zzgnulib.m4 232 | /fseterr.m4 233 | /gettext_h.m4 234 | /lstat.m4 235 | /stdckdint_h.m4 236 | /stringeq.m4 237 | /unitypes_h.m4 238 | /assert_h.m4 239 | /build-to-host.m4 240 | /c-bool.m4 241 | /c32rtomb.m4 242 | /error_h.m4 243 | /extensions-aix.m4 244 | /gnulib-i18n.m4 245 | /iswpunct.m4 246 | /locale-en.m4 247 | /mbrtoc32.m4 248 | /mempcpy.m4 249 | /musl.m4 250 | /off64_t.m4 251 | /once.m4 252 | /pthread-once.m4 253 | /pthread-spin.m4 254 | /pthread_h.m4 255 | /sched_h.m4 256 | /sys_cdefs_h.m4 257 | /uchar_h.m4 258 | /unicase_h.m4 259 | /unictype_h.m4 260 | /uninorm_h.m4 261 | -------------------------------------------------------------------------------- /bootstrap.conf: -------------------------------------------------------------------------------- 1 | # Bootstrap configuration. 2 | 3 | # Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2017 4 | # Free Software Foundation, Inc. 
5 | 6 | # Modifications for GNU Datamash are 7 | # Copyright (C) 2014-2021 Assaf Gordon 8 | 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation; either version 3 of the License, or 12 | # (at your option) any later version. 13 | 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | # 22 | 23 | # gnulib modules used by this package. 24 | gnulib_modules=" 25 | announce-gen 26 | assert 27 | base64 28 | calloc-gnu 29 | c-ctype 30 | ceill 31 | closeout 32 | configmake 33 | crypto/sha1 34 | crypto/sha256 35 | crypto/sha512 36 | crypto/md5 37 | dirname 38 | do-release-commit-and-tag 39 | dup2 40 | errno 41 | error 42 | expl 43 | extensions 44 | fabsl 45 | floorl 46 | fpucw 47 | gendocs 48 | getopt-gnu 49 | getrandom 50 | gettext-h 51 | gitlog-to-changelog 52 | git-version-gen 53 | gnupload 54 | gnu-web-doc-update 55 | hard-locale 56 | hash 57 | hash-pjw 58 | hash-pjw-bare 59 | ignore-value 60 | inet_pton 61 | inline 62 | inttypes 63 | intprops 64 | inttostr 65 | isblank 66 | isnanl 67 | linebuffer 68 | locale 69 | localeconv 70 | logl 71 | maintainer-makefile 72 | minmax 73 | modfl 74 | isnanl 75 | netinet_in 76 | pclose 77 | pmccabe2html 78 | popen 79 | progname 80 | propername 81 | random 82 | readme-release 83 | realloc-gnu 84 | roundl 85 | setlocale 86 | signbit 87 | sh-quote 88 | size_max 89 | snprintf 90 | sqrtl 91 | std-gnu11 92 | stdbool 93 | stdint 94 | stdnoreturn 95 | stpcpy 96 | strcase 97 | strdup-posix 98 | strsep 99 | strtold 100 | strtoll 101 | sys_random 102 | sys_socket 103 | unlocked-io 104 
| update-copyright 105 | version-etc 106 | warnings 107 | waitpid 108 | xalloc 109 | xstrtol 110 | xstrtol-error 111 | xstrtoumax 112 | " 113 | 114 | # Additional xgettext options to use. Use "\\\newline" to break lines. 115 | XGETTEXT_OPTIONS=$XGETTEXT_OPTIONS'\\\ 116 | --from-code=UTF-8\\\ 117 | --flag=asprintf:2:c-format --flag=vasprintf:2:c-format\\\ 118 | --flag=asnprintf:3:c-format --flag=vasnprintf:3:c-format\\\ 119 | --flag=wrapf:1:c-format\\\ 120 | ' 121 | 122 | # If "AM_GNU_GETTEXT(external" or "AM_GNU_GETTEXT([external]" 123 | # appears in configure.ac, exclude some unnecessary files. 124 | # Without grep's -E option (not portable enough, pre-configure), 125 | # the following test is ugly. Also, this depends on the existence 126 | # of configure.ac, not the obsolescent-named configure.in. But if 127 | # you're using this infrastructure, you should care about such things. 128 | 129 | gettext_external=0 130 | grep '^[ ]*AM_GNU_GETTEXT(external\>' configure.ac > /dev/null && 131 | gettext_external=1 132 | grep '^[ ]*AM_GNU_GETTEXT(\[external\]' configure.ac > /dev/null && 133 | gettext_external=1 134 | 135 | if test $gettext_external = 1; then 136 | # Gettext supplies these files, but we don't need them since 137 | # we don't have an intl subdirectory. 138 | excluded_files=' 139 | m4/glibc2.m4 140 | m4/intdiv0.m4 141 | m4/lcmessage.m4 142 | m4/lock.m4 143 | m4/printf-posix.m4 144 | m4/size_max.m4 145 | m4/uintmax_t.m4 146 | m4/ulonglong.m4 147 | m4/visibility.m4 148 | m4/xsize.m4 149 | ' 150 | fi 151 | 152 | gnulib_tool_option_extras="--makefile-name=gnulib.mk --automake-subdir" 153 | 154 | # Build prerequisites 155 | buildreq="\ 156 | autoconf 2.69 157 | automake 1.11.1 158 | autopoint 0.19.4 159 | git 1.5.5 160 | gettext 0.19.4 161 | gperf - 162 | gzip - 163 | makeinfo - 164 | perl 5.8 165 | pkg-config 0.28 166 | tar - 167 | " 168 | 169 | bootstrap_post_import_hook () 170 | { 171 | # Automake requires that ChangeLog exist. 
172 | touch ChangeLog || return 1 173 | } 174 | 175 | # File that should exist in the top directory of a checked out hierarchy, 176 | # but not in a distribution tarball. 177 | checkout_only_file=HACKING.md 178 | -------------------------------------------------------------------------------- /tests/datamash-output-format.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | =pod 3 | Unit Tests for GNU Datamash - perform simple calculation on input data 4 | 5 | Copyright (C) 2018-2021 Assaf Gordon 7 | 8 | This file is part of GNU Datamash. 9 | 10 | GNU Datamash is free software: you can redistribute it and/or modify 11 | it under the terms of the GNU General Public License as published by 12 | the Free Software Foundation, either version 3 of the License, or 13 | (at your option) any later version. 14 | 15 | GNU Datamash is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | GNU General Public License for more details. 19 | 20 | You should have received a copy of the GNU General Public License 21 | along with GNU Datamash. If not, see . 22 | 23 | Written by Assaf Gordon. 24 | =cut 25 | use strict; 26 | use warnings; 27 | 28 | ## 29 | ## This script tests output format options 30 | ## 31 | 32 | 33 | # Until a better way comes along to auto-use Coreutils Perl modules 34 | # as in the coreutils' autotools system. 35 | use Coreutils; 36 | use CuSkip; 37 | use CuTmpdir qw(datamash); 38 | 39 | (my $program_name = $0) =~ s|.*/||; 40 | my $prog = 'datamash'; 41 | 42 | # TODO: add localization tests with "grouping" 43 | # Turn off localization of executable's output. 
44 | @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3; 45 | 46 | my $in1=<<'EOF'; 47 | 1.000004 48 | 0.000005 49 | EOF 50 | 51 | my @Tests = 52 | ( 53 | # Test Rouding 54 | ['r1', 'sum 1' , {IN_PIPE=>$in1}, {OUT => "1.000009\n"}], 55 | ['r2', '--round 1 sum 1' , {IN_PIPE=>$in1}, {OUT => "1.0\n"}], 56 | ['r3', '--round 3 sum 1' , {IN_PIPE=>$in1}, {OUT => "1.000\n"}], 57 | ['r4', '--round 5 sum 1' , {IN_PIPE=>$in1}, {OUT => "1.00001\n"}], 58 | ['r5', '--round 6 sum 1' , {IN_PIPE=>$in1}, {OUT => "1.000009\n"}], 59 | ['r6', '--round 7 sum 1' , {IN_PIPE=>$in1}, {OUT => "1.0000090\n"}], 60 | 61 | # Test short rounding option 62 | ['r7', '-R 7 sum 1', {IN_PIPE=>$in1}, {OUT => "1.0000090\n"}], 63 | 64 | # Test multiple rounding options 65 | ['r8', '--round 3 -R 7 sum 1', {IN_PIPE=>$in1}, {OUT => "1.0000090\n"}], 66 | ['r9', '--round 7 -R 3 sum 1', {IN_PIPE=>$in1}, {OUT => "1.000\n"}], 67 | 68 | 69 | # Test Custom formats: %f 70 | ['f1', '--format "%07.3f" sum 1', {IN_PIPE=>$in1}, {OUT => "001.000\n"}], 71 | ['f2', '--format "%.7f" sum 1', {IN_PIPE=>$in1}, {OUT => "1.0000090\n"}], 72 | ['f3', '--format "%10f" sum 1', {IN_PIPE=>$in1}, {OUT => " 1.000009\n"}], 73 | ['f4', '--format "%-10f" sum 1', {IN_PIPE=>$in1}, {OUT => "1.000009 \n"}], 74 | ['f5', '--format "%+10f" sum 1', {IN_PIPE=>$in1}, {OUT => " +1.000009\n"}], 75 | # Test %#f (alternate form: always show decimal point) 76 | ['f6', '--format "%.0f" sum 1', {IN_PIPE=>$in1}, {OUT => "1\n"}], 77 | ['f7', '--format "%#.0f" sum 1', {IN_PIPE=>$in1}, {OUT => "1.\n"}], 78 | 79 | # Test Custom formats: %g 80 | ['g1', '--format "%g" sum 1', {IN_PIPE=>$in1}, {OUT => "1.00001\n"}], 81 | ['g2', '--format "%10g" sum 1', {IN_PIPE=>$in1}, {OUT => " 1.00001\n"}], 82 | ['g3', '--format "%010g" sum 1', {IN_PIPE=>$in1}, {OUT => "0001.00001\n"}], 83 | ['g4', '--format "%.10g" sum 1', {IN_PIPE=>$in1}, {OUT => "1.000009\n"}], 84 | ['g5', '--format "%.3g" sum 1', {IN_PIPE=>$in1}, {OUT => "1\n"}], 85 | # Test %#g (alternate form: don't trim zero 
decimal digits) 86 | ['g6', '--format "%.4g" sum 1', {IN_PIPE=>$in1}, {OUT => "1\n"}], 87 | ['g7', '--format "%#.4g" sum 1', {IN_PIPE=>$in1}, {OUT => "1.000\n"}], 88 | 89 | # Test Custom formats: %e 90 | ['e1', '--format "%e" sum 1', {IN_PIPE=>$in1}, {OUT=>"1.000009e+00\n"}], 91 | ['e2', '--format "%.3e" sum 1', {IN_PIPE=>$in1}, {OUT=>"1.000e+00\n"}], 92 | 93 | # Test Custom formats: %a 94 | # Disable the test for now. Valid output can differ (e.g. 0x8.000p-3 and 95 | # 0x1.000p0 ). 96 | # ['a1', '--format "%0.3a" sum 1', {IN_PIPE=>$in1}, {OUT=>"0x8.000p-3\n"}], 97 | 98 | 99 | # Custom formats can use lots of memory 100 | ['m1', '--format "%04000.0f" sum 1', {IN_PIPE=>$in1}, 101 | {OUT => "0" x 3999 . "1\n"}], 102 | 103 | # due to binary floating representation, some decimal point digits won't be 104 | # zero (e.g. 1.0000090000000000000000000000000523453254320000000... or 105 | # 1.000008999999...). 106 | # The OUT_SUBST replaces exactly 3995 digits (as expected from the format) 107 | # with an "X". 108 | ['m2', '--format "%.4000f" sum 1', {IN_PIPE=>$in1}, 109 | {OUT => "1.00000X\n"}, 110 | {OUT_SUBST => 's/^(1\.00000)([0-9]{3995})$/\1X/'}], 111 | ); 112 | 113 | 114 | my $save_temps = $ENV{SAVE_TEMPS}; 115 | my $verbose = $ENV{VERBOSE}; 116 | 117 | my $fail = run_tests ($program_name, $prog, \@Tests, $save_temps, $verbose); 118 | exit $fail; 119 | -------------------------------------------------------------------------------- /src/op-defs.h: -------------------------------------------------------------------------------- 1 | /* GNU Datamash - perform simple calculation on input data 2 | 3 | Copyright (C) 2013-2021 Assaf Gordon 4 | Copyright (C) 2022-2025 Timothy Rice 5 | 6 | This file is part of GNU Datamash. 
7 | 8 | GNU Datamash is free software: you can redistribute it and/or modify 9 | it under the terms of the GNU General Public License as published by 10 | the Free Software Foundation, either version 3 of the License, or 11 | (at your option) any later version. 12 | 13 | GNU Datamash is distributed in the hope that it will be useful, 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | GNU General Public License for more details. 17 | 18 | You should have received a copy of the GNU General Public License 19 | along with GNU Datamash. If not, see . 20 | */ 21 | 22 | /* Written by Assaf Gordon */ 23 | #ifndef __OPERATION_DEFINITONS_H__ 24 | #define __OPERATION_DEFINITONS_H__ 25 | 26 | enum field_operation 27 | { 28 | OP_INVALID = -1, 29 | OP_COUNT = 0, 30 | OP_SUM, 31 | OP_MIN, 32 | OP_MAX, 33 | OP_ABSMIN, 34 | OP_ABSMAX, 35 | OP_RANGE, 36 | OP_FIRST, 37 | OP_LAST, 38 | OP_RAND, 39 | OP_MEAN, 40 | OP_GEOMEAN, 41 | OP_HARMMEAN, 42 | OP_MS, 43 | OP_RMS, 44 | OP_MEDIAN, 45 | OP_QUARTILE_1, 46 | OP_QUARTILE_3, 47 | OP_IQR, /* Inter-quartile range */ 48 | OP_PERCENTILE, 49 | OP_PSTDEV, /* Population Standard Deviation */ 50 | OP_SSTDEV, /* Sample Standard Deviation */ 51 | OP_PVARIANCE, /* Population Variance */ 52 | OP_SVARIANCE, /* Sample Variance */ 53 | OP_MAD, /* MAD - Median Absolute Deviation, with adjustment constant of 54 | 1.4826 for normal distribution */ 55 | OP_MADRAW, /* MAD (same as above), with constant=1 */ 56 | OP_S_SKEWNESS,/* Sample Skewness */ 57 | OP_P_SKEWNESS,/* Population Skewness */ 58 | OP_S_EXCESS_KURTOSIS, /* Sample Excess Kurtosis */ 59 | OP_P_EXCESS_KURTOSIS, /* Population Excess Kurtosis */ 60 | OP_JARQUE_BERA, /* Jarque-Bera test of normality */ 61 | OP_DP_OMNIBUS, /* D'Agostino-Pearson omnibus test of normality */ 62 | OP_MODE, 63 | OP_ANTIMODE, 64 | OP_UNIQUE, /* Collapse Unique string into comma separated values */ 65 | OP_COLLAPSE, /* Collapse strings 
into comma separated values */ 66 | OP_COUNT_UNIQUE, /* count number of unique values */ 67 | OP_BASE64, /* Encode Field to Base64 */ 68 | OP_DEBASE64, /* Decode Base64 field */ 69 | OP_MD5, /* Calculate MD5 of a field */ 70 | OP_SHA1, /* Calculate SHA1 of a field */ 71 | OP_SHA224, /* Calculate SHA224 of a field */ 72 | OP_SHA256, /* Calculate SHA256 of a field */ 73 | OP_SHA384, /* Calculate SHA384 of a field */ 74 | OP_SHA512, /* Calculate SHA512 of a field */ 75 | OP_P_COVARIANCE, /* Population Covariance */ 76 | OP_S_COVARIANCE, /* Sample Covariance */ 77 | OP_P_PEARSON_COR, /* Pearson Correlation Coefficient (population) */ 78 | OP_S_PEARSON_COR, /* Pearson Correlation Coefficient (sample) */ 79 | OP_DOT_PRODUCT, /* Scalar Product */ 80 | OP_BIN_BUCKETS, /* numeric binning operation */ 81 | OP_STRBIN, /* String hash/binning */ 82 | OP_FLOOR, /* Floor */ 83 | OP_CEIL, /* Ceiling */ 84 | OP_ROUND, /* Round */ 85 | OP_TRUNCATE, /* Truncate */ 86 | OP_FRACTION, /* Fraction */ 87 | OP_TRIMMED_MEAN, /* Trimmed Mean */ 88 | OP_DIRNAME, /* like dirname (1) */ 89 | OP_BASENAME, /* like basename (1) */ 90 | OP_EXTNAME, /* guess extension of file name */ 91 | OP_BARENAME, /* like basename without the guessed extension */ 92 | OP_GETNUM, /* Extract a number from a string */ 93 | OP_CUT /* like cut (1) */ 94 | }; 95 | 96 | enum processing_mode 97 | { 98 | MODE_INVALID = -1, 99 | MODE_GROUPBY = 0, /* Group By similar keys */ 100 | MODE_TRANSPOSE, /* transpose */ 101 | MODE_REVERSE, /* reverse fields in each line */ 102 | MODE_PER_LINE, /* Operations on each line, no grouping */ 103 | MODE_REMOVE_DUPS, /* Remove duplicated keys from a file */ 104 | MODE_CROSSTAB, /* Cross tabulation (aka pivot tables) */ 105 | MODE_TABULAR_CHECK, /* Verif the file has tabular format */ 106 | MODE_NOOP /* Do nothing. Used for testing and profiling */ 107 | }; 108 | 109 | /* Given a text string, returns the matching operation, or OP_INVALID. 
110 | if 'mode' is not NULL, stores the implied processing mode 111 | (e.g. sum=>MODE_GROUPBY, md5=>MODE_PER_LINE). */ 112 | enum field_operation 113 | get_field_operation (const char* s, enum processing_mode* /*out*/ mode); 114 | 115 | const char* 116 | get_field_operation_name (enum field_operation op); 117 | 118 | /* Given a text string, 119 | returns the matching processing mode, or MODE_INVALID. */ 120 | enum processing_mode 121 | get_processing_mode (const char* s); 122 | 123 | const char* 124 | get_processing_mode_name (enum processing_mode m); 125 | 126 | #endif 127 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Copyright (C) 2005, 2006, 2007, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2 | 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023 Free Software Foundation, Inc. 3 | 4 | Copyright (C) 2013-2021 Assaf Gordon 5 | Copyright (C) 2022-2025 Timothy Rice 6 | 7 | Copying and distribution of this file, with or without modification, 8 | are permitted in any medium without royalty provided the copyright 9 | notice and this notice are preserved. 10 | 11 | GNU Datamash 12 | ============ 13 | 14 | GNU Datamash is a command-line program which performs basic 15 | numeric,textual and statistical operations on input textual data files. 16 | 17 | it is designed to be portable and reliable, and aid researchers 18 | to easily automate analysis pipelines, without writing code or even 19 | short scripts. 20 | 21 | Home page: https://www.gnu.org/software/datamash 22 | 23 | 24 | Usage 25 | ===== 26 | 27 | See `datamash --help` for basic usage information. 28 | 29 | See `man datamash` for examples and operation details. 30 | 31 | For the instrucions manual, see `info datamash` or visit 32 | https://www.gnu.org/software/datamash/manual/ 33 | 34 | 35 | 36 | Examples 37 | ======== 38 | 39 | What's the sum and mean of the values in field 1 ? 
40 | 41 | $ seq 10 | datamash sum 1 mean 1 42 | 55 5.5 43 | 44 | Given a file with three columns (Name, College Major, Score), 45 | what is the average, grouped by college major? 46 | 47 | $ cat scores.txt 48 | John Life-Sciences 91 49 | Dilan Health-Medicine 84 50 | Nathaniel Arts 88 51 | Antonio Engineering 56 52 | Kerris Business 82 53 | ... 54 | 55 | 56 | # Sort input and group by column 2, calculate average on column 3: 57 | 58 | $ datamash --sort --group 2 mean 3 < scores.txt 59 | Arts 68.9474 60 | Business 87.3636 61 | Health-Medicine 90.6154 62 | Social-Sciences 60.2667 63 | Life-Sciences 55.3333 64 | Engineering 66.5385 65 | 66 | See more examples at https://www.gnu.org/software/datamash/examples/ 67 | 68 | 69 | Download and Installation 70 | ========================= 71 | 72 | Download the latest source code at https://www.gnu.org/software/datamash . 73 | 74 | General installation commands: 75 | 76 | $ tar -xzf datamash-[VERSION].tar.gz 77 | $ cd datamash-[VERSION] 78 | $ ./configure 79 | $ make 80 | $ make check 81 | $ sudo make install 82 | 83 | Also see INSTALL. 84 | 85 | See Platform/OS-specific download instructions at 86 | https://www.gnu.org/software/datamash/download/ 87 | 88 | 89 | To build from latest git sources, see the HACKING.md file. This file is 90 | available when cloning from git, but is not distributed in the tar archive. 91 | To clone the git repository run 92 | git clone git://git.savannah.gnu.org/datamash.git 93 | It is also available online at 94 | https://git.savannah.gnu.org/cgit/datamash.git/tree/HACKING.md 95 | 96 | 97 | BASH Auto-completion 98 | ==================== 99 | 100 | The datamash package includes a bash auto-completion script. 101 | The installation location can be controlled using 102 | 103 | ./configure --with-bash-completion-dir=[no|local|global|PATH] 104 | 105 | The options are: 106 | 107 | * local - install under the package's $PREFIX path. 
108 | typically `/usr/local/share/datamash/bash-completion.d/` , 109 | but can be changed with `./configure --prefix`. 110 | This is the default. 111 | 112 | * no - do not install the bash completion script. 113 | 114 | * [PATH] - install into the PATH specified on the command line, e.g. 115 | `./configure --with-bash-completion-dir=/for/bar/bash-completion.d/` 116 | 117 | * global - install into the system's global bash-completion directory, 118 | as reported by `pkg-config`. This will be the result of: 119 | `pkg-config --variable=completionsdir bash-completion` 120 | Which is commonly `/usr/share/bash-completion/completions` 121 | or `/etc/bash.d`. 122 | If `pkg-config` is not found or if `pkg-config` does not have 123 | the config (.pc) file for the bash-completion package, 124 | defaults to 'local'. 125 | 126 | `local` is the default, and should be used particularly if installing under 127 | a non-default `--prefix` without root permissions. 128 | `global` should be used if you are installing to default location (/usr/local) 129 | and have root permissions (e.g. `sudo make install`). 130 | Using custom PATH or `global` should be used when packaging datamash for 131 | further distribution. 132 | 133 | Questions and Bug Reports 134 | ========================= 135 | 136 | - Please send questions and bug reports to bug-datamash@gnu.org . 137 | - Searchable archive at https://lists.gnu.org/archive/html/bug-datamash . 138 | - Subscribe at https://lists.gnu.org/mailman/listinfo/bug-datamash . 139 | 140 | 141 | Copyright and License 142 | ===================== 143 | Copyright (C) 2013-2021 Assaf Gordon 144 | 145 | License: GPL Version 3 (or later). See COPYING. 146 | 147 | For any copyright year range specified as YYYY-ZZZZ in this package 148 | note that the range specifies every single year in that closed interval. 
149 | -------------------------------------------------------------------------------- /tests/datamash-tests-deprecated.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | =pod 3 | Unit Tests for GNU Datamash - perform simple calculation on input data 4 | 5 | Copyright (C) 2013-2021 Assaf Gordon 6 | Copyright (C) 2022-2025 Timothy Rice 7 | 8 | This file is part of GNU Datamash. 9 | 10 | GNU Datamash is free software: you can redistribute it and/or modify 11 | it under the terms of the GNU General Public License as published by 12 | the Free Software Foundation, either version 3 of the License, or 13 | (at your option) any later version. 14 | 15 | GNU Datamash is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | GNU General Public License for more details. 19 | 20 | You should have received a copy of the GNU General Public License 21 | along with GNU Datamash. If not, see . 22 | 23 | Written by Assaf Gordon. 24 | =cut 25 | use strict; 26 | use warnings; 27 | 28 | # Until a better way comes along to auto-use Coreutils Perl modules 29 | # as in the coreutils' autotools system. 30 | use Coreutils; 31 | use CuSkip; 32 | use CuTmpdir qw(datamash); 33 | use MIME::Base64 ; 34 | 35 | (my $program_name = $0) =~ s|.*/||; 36 | my $prog_bin = 'datamash'; 37 | 38 | ## Cross-Compiling portability hack: 39 | ## under qemu/binfmt, argv[0] (which is used to report errors) will contain 40 | ## the full path of the binary, if the binary is on the $PATH. 41 | ## So we try to detect what is the actual returned value of the program 42 | ## in case of an error. 43 | my $prog = `$prog_bin ---print-progname`; 44 | $prog = $prog_bin unless $prog; 45 | 46 | ## Portability hack 47 | ## Check if the system's sort supports stable sorting ('-s'). 
48 | ## If it doesn't - skip some tests 49 | my $rc = system("sort -s < /dev/null > /dev/null 2>/dev/null"); 50 | die "testing framework failure: failed to execute sort -s" 51 | if ( ($rc == -1) || ($rc & 127) ); 52 | my $sort_exit_code = ($rc >> 8); 53 | my $have_stable_sort = ($sort_exit_code==0); 54 | 55 | 56 | # TODO: add localization tests with "grouping" 57 | # Turn off localization of executable's output. 58 | @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3; 59 | 60 | my $in_g3=<<'EOF'; 61 | A 3 W 62 | A 5 W 63 | A 7 W 64 | A 11 X 65 | A 13 X 66 | B 17 Y 67 | B 19 Z 68 | C 23 Z 69 | EOF 70 | 71 | my $in_g4=<<'EOF'; 72 | A 5 73 | K 6 74 | P 2 75 | EOF 76 | 77 | my $in_hdr1=<<'EOF'; 78 | x y z 79 | A 1 10 80 | A 2 10 81 | A 3 10 82 | A 4 10 83 | A 4 10 84 | B 5 10 85 | B 6 20 86 | B 7 30 87 | C 8 11 88 | C 9 22 89 | C 1 33 90 | C 2 44 91 | EOF 92 | 93 | my $in_hdr_only=<<'EOF'; 94 | X:Y:Z 95 | EOF 96 | 97 | my $full_deprecation = "$prog: Using -f/--full with non-linewise operations " . 98 | "is deprecated and will be disabled in a future release.\n"; 99 | 100 | my @Tests = 101 | ( 102 | # empty input = empty output, regardless of options 103 | ['emp2dep', '--full count 2', {IN_PIPE=>""},{OUT=>""}, 104 | {ERR=>"$full_deprecation"}], 105 | ['emp5dep', '--full --header-in count 2', {IN_PIPE=>""},{OUT=>""}, 106 | {ERR=>"$full_deprecation"}], 107 | ['emp6dep', '--full --header-out count 2', {IN_PIPE=>""},{OUT=>""}, 108 | {ERR=>"$full_deprecation"}], 109 | ['emp7dep', '--full --header-in --header-out count 2', 110 | {IN_PIPE=>""},{OUT=>""}, 111 | {ERR=>"$full_deprecation"}], 112 | ['emp8dep', '-g3,4 --full --header-in --header-out count 2', 113 | {IN_PIPE=>""},{OUT=>""}, 114 | {ERR=>"$full_deprecation"}], 115 | 116 | # --full option - without grouping, returns the first line 117 | ['fl1dep', '-t" " --full sum 2', {IN_PIPE=>$in_g3}, 118 | {OUT=>"A 3 W 98\n"}, 119 | {ERR=>"$full_deprecation"}], 120 | # --full with grouping - print entire line of each group 121 | ['fl2dep', 
'-t" " --full -g3 sum 2', {IN_PIPE=>$in_g3}, 122 | {OUT=>"A 3 W 15\nA 11 X 24\nB 17 Y 17\nB 19 Z 42\n"}, 123 | {ERR=>"$full_deprecation"}], 124 | 125 | # Input and output header, with full line 126 | ['hdr3dep', '-t" " -g 1 --full --header-in --header-out count 2', 127 | {IN_PIPE=>$in_hdr1}, 128 | {OUT=>"x y z count(y)\nA 1 10 5\nB 5 10 3\nC 8 11 4\n"}, 129 | {ERR=>"$full_deprecation"}], 130 | 131 | # Output Header with --full 132 | ['hdr5dep', '-t" " -g 1 --full --header-out count 2', {IN_PIPE=>$in_g3}, 133 | {OUT=>"field-1 field-2 field-3 count(field-2)\n" . 134 | "A 3 W 5\nB 17 Y 2\nC 23 Z 1\n"}, 135 | {ERR=>"$full_deprecation"}], 136 | 137 | # Input has only one header line (no data lines), and the user requested 138 | # header-in and header-out => header line should be printed 139 | ['hdr15dep', '-t: --full -H sum 1', {IN_PIPE=>$in_hdr_only}, 140 | {OUT=>"X:Y:Z:sum(X)\n"}, 141 | {ERR=>"$full_deprecation"}], 142 | ['hdr17dep', '-t: --full -s -g1 -H sum 2', {IN_PIPE=>$in_hdr_only}, 143 | {OUT=>"X:Y:Z:sum(Y)\n"}, 144 | {ERR=>"$full_deprecation"}], 145 | 146 | # Test single line per group 147 | ['sl2dep', '-t" " --full -g 1 mean 2', {IN_PIPE=>$in_g4}, 148 | {OUT=>"A 5 5\nK 6 6\nP 2 2\n"}, 149 | {ERR=>"$full_deprecation"}], 150 | ); 151 | 152 | my $save_temps = $ENV{SAVE_TEMPS}; 153 | my $verbose = $ENV{VERBOSE}; 154 | 155 | my $fail = run_tests ($program_name, $prog, \@Tests, $save_temps, $verbose); 156 | exit $fail; 157 | -------------------------------------------------------------------------------- /doc/datamash-texinfo.css: -------------------------------------------------------------------------------- 1 | /* 2 | CSS for TexInfo/HTML files. 
3 | 4 | Copyright (C) 2015-2021 Assaf Gordon (assafgordon@gmail.com) 5 | 6 | License: 7 | GNU All Permissive License 8 | https://www.gnu.org/prep/maintain/html_node/License-Notices-for-Other-Files.html 9 | 10 | Copying and distribution of this file, with or without modification, 11 | are permitted in any medium without royalty provided the copyright 12 | notice and this notice are preserved. This file is offered as-is, 13 | without any warranty. 14 | 15 | 16 | The used tags/classes were collected from a Texinfo-generated HTML using: 17 | 18 | cd coreutils 19 | makeinfo --html --no-split -o coreutils.html doc/coreutils.texi 20 | cat coreutils.html | sed 's/.*;>;g' \ 21 | | grep '^<' | grep 'class=' | sort -u \ 22 | | perl -lane 'm/<(\w+) .*class="([-\w]+)"/ ; print $1, "\t", $2' \ 23 | | sort -u 24 | 25 | */ 26 | body { 27 | font-family: sans-serif; 28 | font-size: 16px; 29 | margin: 1em; 30 | 31 | overflow-x: hidden; /* Coupled with the div.header trick, 32 | this will extend the header lines 33 | access the entire page width without causing 34 | a horizontal scroll bar to appear. 
*/ 35 | } 36 | 37 | 38 | a { 39 | text-decoration: none; 40 | outline-style: none; 41 | color: blue; 42 | } 43 | a:visited { 44 | color: rgb(16,0,112); 45 | } 46 | a:hover { 47 | text-decoration: underline; 48 | } 49 | 50 | 51 | /***************************************************** 52 | Titles / Headers 53 | ******************************************************/ 54 | 55 | /* @settitle: 56 | The title of the document at the top of the document/header */ 57 | h1.settitle { 58 | color: rgb(51,70,131); 59 | text-shadow: rgb(153,153,153) 1px 1px 0px; 60 | } 61 | 62 | /* The title at the beginning of the document, before the @menu */ 63 | h1.top { 64 | color: rgb(51,70,131); 65 | text-shadow: rgb(153,153,153) 1px 1px 0px; 66 | } 67 | 68 | /* @chapter */ 69 | h2.chapter { 70 | } 71 | 72 | h2.appendix { } 73 | h2.unnumbered { } 74 | 75 | /* @section */ 76 | h3.section { 77 | } 78 | /* @unnumberedsec */ 79 | h3.unnumberedsec { 80 | } 81 | /* @heading (seems to be only used in fdl.texi) */ 82 | h3.heading { 83 | } 84 | 85 | /* @subsection */ 86 | h4.subsection { 87 | } 88 | 89 | 90 | /************************************************** 91 | Short Contents (if @shortcontents command is used) 92 | ***************************************************/ 93 | h2.shortcontents-heading { } 94 | div.shortcontents { } 95 | div.shortcontents ul { } 96 | div.shortcontents ul li { } 97 | 98 | 99 | /************************************************** 100 | Contents (if @contents command is used) 101 | ***************************************************/ 102 | h2.contents-heading { } 103 | div.contents { } 104 | div.contents ul { } 105 | div.contents ul li { } 106 | 107 | 108 | /* The @menu table */ 109 | table.menu { } 110 | pre.menu-comment {} 111 | 112 | 113 | 114 | /************************************ 115 | @example and @verbatim 116 | ************************************/ 117 | div.example { 118 | margin-left: 2em; 119 | margin-right: 2em; 120 | } 121 | div.example pre.example { 122 | 
/* Round Corners */ 123 | -webkit-border-radius: 3px; 124 | -moz-border-radius: 3px; 125 | border-radius: 3px; 126 | border: 1px solid #c0c0c0; 127 | 128 | padding: 1ex; 129 | background-color: #f3f3f3; 130 | } 131 | 132 | /* Note: @verbatim is also rendered inside a 'div.example' */ 133 | div.example pre.verbatim { 134 | /* Round Corners */ 135 | -webkit-border-radius: 3px; 136 | -moz-border-radius: 3px; 137 | border-radius: 3px; 138 | border: 1px solid #c0c0c0; 139 | 140 | padding: 1ex; 141 | background-color: #f3f3f3; 142 | } 143 | 144 | 145 | 146 | /************************************ 147 | @smallexample 148 | ************************************/ 149 | div.smallexample { 150 | } 151 | div.smallexample pre.smallexample { 152 | } 153 | 154 | /*********************************** 155 | @display 156 | ***********************************/ 157 | div.display { 158 | } 159 | div.display pre.display { 160 | } 161 | 162 | 163 | 164 | /************************************** 165 | @footnote 166 | **************************************/ 167 | div.footnote { } 168 | h4.footnotes-heading { } 169 | 170 | /************************************** 171 | The header at the top of each page / section 172 | (the next/previous/top/up links) 173 | **************************************/ 174 | div.header { 175 | padding-top: 0.5ex; 176 | padding-bottom: 0.5ex; 177 | background-color: #ddddff; 178 | 179 | /* This will extend the background color of the header 180 | bar to the entire width of the page (and beyond), 181 | requires 'overflow-x: hidden' in the 'body'. */ 182 | padding-left: 3000px; 183 | margin-left: -3000px; 184 | padding-right: 3000px; 185 | margin-right: -3000px; 186 | } 187 | 188 | /* Disable any additional margins */ 189 | div.header p { 190 | margin: 0; 191 | } 192 | div.header p a { 193 | color: blue; 194 | } 195 | 196 | 197 | /************************************** 198 | @table is rendered as
(definition list), 199 | @item is rendered as
(definition term), 200 | text is rendered as
(definition description) 201 | **************************************/ 202 | dl { 203 | margin: 0 1em; 204 | } 205 | dl dt { 206 | margin: 1em 0; 207 | } 208 | dl dd { 209 | margin-left: 2em; 210 | } 211 | 212 | /******************************************************* 213 | Text Styles 214 | *******************************************************/ 215 | 216 | /* @var{} */ 217 | var { 218 | color: #CC0000; 219 | } 220 | 221 | /* @samp{} */ 222 | samp { 223 | color: #6600CC; 224 | } 225 | 226 | /* @env{} will result in

X

*/ 227 | p code { 228 | color: #532c14; 229 | } 230 | 231 | /* @option{} */ 232 | span.nocodebreak { 233 | color: #5D4C46; 234 | } 235 | -------------------------------------------------------------------------------- /src/key-compare.h: -------------------------------------------------------------------------------- 1 | /* Key Comparison functions 2 | 3 | Copyright (C) 2014 Free Software Foundation, Inc. 4 | Copyright (C) 2022-2025 Timothy Rice 5 | 6 | This program is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | This program is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with this program. If not, see . */ 18 | 19 | #ifndef KEY_COMPARE_H 20 | # define KEY_COMPARE_H 21 | 22 | #define UCHAR_LIM (UCHAR_MAX + 1) 23 | 24 | 25 | /* The representation of the decimal point in the current locale. */ 26 | extern int decimal_point; 27 | 28 | /* Thousands separator; if -1, then there isn't one. */ 29 | extern int thousands_sep; 30 | 31 | /* Nonzero if the corresponding locales are hard. */ 32 | extern bool hard_LC_COLLATE; 33 | #if HAVE_NL_LANGINFO 34 | extern bool hard_LC_TIME; 35 | #endif 36 | 37 | #define NONZERO(x) ((x) != 0) 38 | 39 | /* The kind of blanks for '-b' to skip in various options. */ 40 | enum blanktype { bl_start, bl_end, bl_both }; 41 | 42 | /* Lines are held in core as counted strings. */ 43 | struct line 44 | { 45 | char *text; /* Text of the line. */ 46 | size_t length; /* Length including final newline. */ 47 | char *keybeg; /* Start of first key. 
*/ 48 | char *keylim; /* Limit of first key. */ 49 | }; 50 | 51 | /* Sort key. */ 52 | struct keyfield 53 | { 54 | size_t sword; /* Zero-origin 'word' to start at. */ 55 | size_t schar; /* Additional characters to skip. */ 56 | size_t eword; /* Zero-origin last 'word' of key. */ 57 | size_t echar; /* Additional characters in field. */ 58 | bool const *ignore; /* Boolean array of characters to ignore. */ 59 | char const *translate; /* Translation applied to characters. */ 60 | bool skipsblanks; /* Skip leading blanks when finding start. */ 61 | bool skipeblanks; /* Skip leading blanks when finding end. */ 62 | bool numeric; /* Flag for numeric comparison. Handle 63 | strings of digits with optional decimal 64 | point, but no exponential notation. */ 65 | #ifdef KEY_COMPARE_RANDOM 66 | bool random; /* Sort by random hash of key. */ 67 | #endif 68 | bool general_numeric; /* Flag for general, numeric comparison. 69 | Handle numbers in exponential notation. */ 70 | #ifdef KEY_COMPARE_HUMAN_NUMERIC 71 | bool human_numeric; /* Flag for sorting by human readable 72 | units with either SI xor IEC prefixes. */ 73 | #endif 74 | #ifdef KEY_COMPARE_MONTH 75 | bool month; /* Flag for comparison by month name. */ 76 | #endif 77 | #ifdef KEY_COMPARE_REVERSE 78 | bool reverse; /* Reverse the sense of comparison. */ 79 | #endif 80 | #ifdef KEY_COMPARE_VERSION 81 | bool version; /* sort by version number */ 82 | #endif 83 | #ifdef KEY_COMPARE_DECORATION 84 | bool (*decorate_fn)(const char* in); 85 | const char* decorate_cmd; 86 | #endif 87 | bool traditional_used; /* Traditional key option format is used. */ 88 | struct keyfield *next; /* Next keyfield to try. */ 89 | }; 90 | 91 | /* If TAB has this value, blanks separate fields. */ 92 | enum { TAB_DEFAULT = CHAR_MAX + 1 }; 93 | 94 | /* Tab character separating fields. If TAB_DEFAULT, then fields are 95 | separated by the empty string between a non-blank character and a blank 96 | character. 
*/ 97 | extern int tab; 98 | 99 | /* List of key field comparisons to be tried. */ 100 | extern struct keyfield *keylist; 101 | 102 | /* Return a pointer to the first character of the field specified 103 | by KEY in LINE. */ 104 | 105 | char * 106 | begfield (struct line const *line, struct keyfield const *key); 107 | 108 | /* Return the limit of (a pointer to the first character after) the field 109 | in LINE specified by KEY. */ 110 | 111 | char * 112 | limfield (struct line const *line, struct keyfield const *key); 113 | 114 | /* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */ 115 | 116 | extern struct keyfield* 117 | insertkey (struct keyfield *key_arg); 118 | 119 | /* Report a bad field specification SPEC, with extra info MSGID. */ 120 | void badfieldspec (char const *, char const *) 121 | ATTRIBUTE_NORETURN; 122 | 123 | /* Parse the leading integer in STRING and store the resulting value 124 | (which must fit into size_t) into *VAL. Return the address of the 125 | suffix after the integer. If the value is too large, silently 126 | substitute SIZE_MAX. If MSGID is NULL, return NULL after 127 | failure; otherwise, report MSGID and exit on failure. */ 128 | 129 | char const * 130 | parse_field_count (char const *string, size_t *val, char const *msgid); 131 | 132 | /* Set the ordering options for KEY specified in S. 133 | Return the address of the first character in S that 134 | is not a valid ordering option. 135 | BLANKTYPE is the kind of blanks that 'b' should skip. */ 136 | 137 | char * 138 | set_ordering (char const *s, struct keyfield *key, enum blanktype blanktype); 139 | 140 | /* Initialize KEY. 
*/ 141 | struct keyfield * 142 | key_init (struct keyfield *key); 143 | 144 | /* print the key spec as a parameter */ 145 | void 146 | debug_keylist (FILE* stream); 147 | 148 | char* 149 | debug_keyfield (const struct keyfield *key); 150 | 151 | 152 | 153 | /* Initializes 'common' key-comparison global variables: 154 | thousand_sep 155 | decimal_point 156 | hard_LC_COLLATE 157 | hard_LC_TIME 158 | blanks, months, nonprintable tables (calls inittables). 159 | 160 | This function should be called once from main . 161 | */ 162 | void 163 | init_key_spec (void); 164 | 165 | #endif /* KEY_COMPARE_H */ 166 | -------------------------------------------------------------------------------- /src/field-ops.h: -------------------------------------------------------------------------------- 1 | /* GNU Datamash - perform simple calculation on input data 2 | 3 | Copyright (C) 2013-2021 Assaf Gordon 4 | Copyright (C) 2022-2025 Timothy Rice 5 | 6 | This file is part of GNU Datamash. 7 | 8 | GNU Datamash is free software: you can redistribute it and/or modify 9 | it under the terms of the GNU General Public License as published by 10 | the Free Software Foundation, either version 3 of the License, or 11 | (at your option) any later version. 12 | 13 | GNU Datamash is distributed in the hope that it will be useful, 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | GNU General Public License for more details. 17 | 18 | You should have received a copy of the GNU General Public License 19 | along with GNU Datamash. If not, see . 
20 | */ 21 | 22 | /* Written by Assaf Gordon */ 23 | #ifndef __FIELD_OPS_H__ 24 | #define __FIELD_OPS_H__ 25 | 26 | /* 27 | Operations Module 28 | */ 29 | 30 | enum accumulation_type 31 | { 32 | NUMERIC_SCALAR = 0, 33 | NUMERIC_VECTOR, 34 | STRING_SCALAR, 35 | STRING_VECTOR 36 | }; 37 | 38 | enum operation_result_type 39 | { 40 | NUMERIC_RESULT = 0, 41 | STRING_RESULT 42 | }; 43 | 44 | enum operation_first_value 45 | { 46 | AUTO_SET_FIRST = true, 47 | IGNORE_FIRST = false 48 | }; 49 | 50 | enum FIELD_OP_COLLECT_RESULT 51 | { 52 | FLOCR_OK = 0, 53 | FLOCR_OK_KEEP_LINE, 54 | FLOCR_OK_SKIPPED, 55 | FLOCR_INVALID_NUMBER, 56 | FLOCR_INVALID_BASE64 57 | }; 58 | 59 | struct operation_data 60 | { 61 | enum accumulation_type acc_type; 62 | enum operation_first_value auto_first; 63 | enum operation_result_type res_type; 64 | }; 65 | 66 | /* Operation on a field */ 67 | struct fieldop 68 | { 69 | /* operation 'class' information */ 70 | enum field_operation op; 71 | enum accumulation_type acc_type; 72 | enum operation_result_type res_type; 73 | bool numeric; 74 | bool auto_first; /* if true, automatically set 'value' if 'first' */ 75 | bool master; /* if true, this field_op uses another as a slave */ 76 | bool slave; /* if true, not used directly, but referenced by 77 | another field_op */ 78 | size_t slave_idx; 79 | struct fieldop* slave_op; 80 | 81 | /* Instance information */ 82 | size_t field; /* field number. 1 = first field in input file. 
*/ 83 | bool field_by_name; /* if true, user gave field name (instead of number), 84 | which needs to be resolved AFTER the header line 85 | is loaded */ 86 | char* field_name; 87 | 88 | union { 89 | long double bin_bucket_size; 90 | size_t strbin_bucket_size; 91 | size_t percentile; 92 | long double trimmed_mean; 93 | enum extract_number_type get_num_type; 94 | } params; 95 | 96 | /* Collected Data */ 97 | bool first; /* true if this is the first item in a new group */ 98 | 99 | /* NUMERIC_SCALAR operations */ 100 | size_t count; /* number of items collected so far in a group */ 101 | long double value; /* for single-value operations (sum, min, max, absmin, 102 | absmax, mean) - this is the accumulated value */ 103 | 104 | /* NUMERIC_VECTOR operations */ 105 | long double *values; /* array for multi-valued ops (median,mode,stdev) */ 106 | size_t num_values; /* number of used values */ 107 | size_t alloc_values;/* number of allocated values */ 108 | 109 | /* String buffer for STRING_VECTOR operations */ 110 | char *str_buf; /* points to the beginning of the buffer */ 111 | size_t str_buf_used; /* number of bytes used in the buffer */ 112 | size_t str_buf_alloc; /* number of bytes allocated in the buffer */ 113 | 114 | /* Output buffer containing the final results of an operation, 115 | set by 'summarize' functions. 116 | also used for line operations (md5/sha1/256/512/base64). */ 117 | char *out_buf; 118 | size_t out_buf_used; 119 | size_t out_buf_alloc; 120 | }; 121 | 122 | /* Initializes a new field-op, using an *existing* (pre-allocated) struct. */ 123 | void 124 | field_op_init (struct fieldop* /*in-out*/ op, 125 | enum field_operation oper, 126 | bool by_name, size_t num, const char* name); 127 | 128 | /* Frees the internal structures in the field-op. 129 | Does *not* free 'op' itself */ 130 | void 131 | field_op_free (struct fieldop* op); 132 | 133 | /* Add a value (from input) to the current field operation. 134 | 'str' does not need to be null-terminated. 
135 | 136 | Returns a FLOCR_* status code; test it with field_op_ok (). 137 | FLOCR_INVALID_NUMBER / FLOCR_INVALID_BASE64 report invalid input. 138 | */ 139 | enum FIELD_OP_COLLECT_RESULT 140 | field_op_collect (struct fieldop *op, const char* str, size_t slen); 141 | 142 | /* Evaluates to true/false depending if the value returned from 143 | field_op_collect represents a successful operation. */ 144 | #define field_op_ok(X) \ 145 | (((X)==FLOCR_OK)||((X)==FLOCR_OK_KEEP_LINE)||((X)==FLOCR_OK_SKIPPED)) 146 | 147 | /* If field_op_ok returned false, this function will return a textual 148 | error message of the error. The returned value is a static string, 149 | do not free it. */ 150 | const char* 151 | field_op_collect_result_name (const enum FIELD_OP_COLLECT_RESULT flocr); 152 | 153 | 154 | /* Called after all values in a group are collected in a field-op, 155 | to perform any (optional) finalizing steps 156 | (e.g. in OP_MEAN, calculate the mean). 157 | Result will be stored in op->out_buf. */ 158 | void 159 | field_op_summarize (struct fieldop *op); 160 | 161 | /* Resets internal variables, should be called when starting a new 162 | group of values. */ 163 | void 164 | field_op_reset (struct fieldop *op); 165 | 166 | /* Output precision, to be used with "printf ("%.*Lg",)" */ 167 | extern int field_op_output_precision; 168 | 169 | /* Helper function to print to stdout the 'empty value' of a numeric 170 | operation (e.g. what's printed by 'OP_MEAN' with empty input). 171 | Used in some of the tests.
*/ 172 | void 173 | field_op_print_empty_value (enum field_operation mode); 174 | 175 | #endif 176 | -------------------------------------------------------------------------------- /tests/datamash-tests-2-deprecated.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | =pod 3 | Unit Tests for GNU Datamash - perform simple calculation on input data 4 | 5 | Copyright (C) 2013-2021 Assaf Gordon 6 | Copyright (C) 2022-2025 Timothy Rice 7 | 8 | This file is part of GNU Datamash. 9 | 10 | GNU Datamash is free software: you can redistribute it and/or modify 11 | it under the terms of the GNU General Public License as published by 12 | the Free Software Foundation, either version 3 of the License, or 13 | (at your option) any later version. 14 | 15 | GNU Datamash is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | GNU General Public License for more details. 19 | 20 | You should have received a copy of the GNU General Public License 21 | along with GNU Datamash. If not, see . 22 | 23 | Written by Assaf Gordon. 24 | =cut 25 | use strict; 26 | use warnings; 27 | 28 | ## 29 | ## This is a continuation of 'datamash-tests.pl' 30 | ## split into two files, as it was getting too large. 31 | ## 32 | 33 | # Until a better way comes along to auto-use Coreutils Perl modules 34 | # as in the coreutils' autotools system. 35 | use Coreutils; 36 | use CuSkip; 37 | use CuTmpdir qw(datamash); 38 | use MIME::Base64 ; 39 | 40 | (my $program_name = $0) =~ s|.*/||; 41 | my $prog_bin = 'datamash'; 42 | 43 | ## Cross-Compiling portability hack: 44 | ## under qemu/binfmt, argv[0] (which is used to report errors) will contain 45 | ## the full path of the binary, if the binary is on the $PATH. 46 | ## So we try to detect what is the actual returned value of the program 47 | ## in case of an error. 
48 | my $prog = `$prog_bin ---print-progname`; 49 | $prog = $prog_bin unless $prog; 50 | 51 | ## Portability hack: 52 | ## find the exact wording of 'nan' (not-a-number) and 'inf' (infinity). 53 | ## It's lower case in GNU/Linux,FreeBSD,OpenBSD, 54 | ## but is "NaN" on Illumos/OpenSolaris 55 | my $nan = `$prog_bin ---print-nan`; 56 | die "test infrastructure failed: can't determine 'nan' string" unless $nan; 57 | my $inf = `$prog_bin ---print-inf`; 58 | die "test infrastructure failed: can't determine 'inf' string" unless $inf; 59 | 60 | ## Portability hack 61 | ## Check if the system's sort supports stable sorting ('-s'). 62 | ## If it doesn't - skip some tests 63 | my $rc = system("sort -s < /dev/null > /dev/null 2>/dev/null"); 64 | die "testing framework failure: failed to execute sort -s" 65 | if ( ($rc == -1) || ($rc & 127) ); 66 | my $sort_exit_code = ($rc >> 8); 67 | my $have_stable_sort = ($sort_exit_code==0); 68 | 69 | 70 | # TODO: add localization tests with "grouping" 71 | # Turn off localization of executable's output. 72 | @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3; 73 | 74 | 75 | # Test the selection operation (first/last/min/max) together with "FULL": 76 | # group by column 1 ("A" or "B"), and operate on column 2 (numeric). 77 | # Ensure the matching "iX" is displayed, despite not being part of the 78 | # operation (example: if 'min(2)' is the operation, then "B 0" should be 79 | # selected, and "i6" must be displayed with "--full" (because "i6" is on the 80 | # same line as the min(2) value zero). 81 | my $in_full1=<<'EOF'; 82 | A 4 i1 83 | A 3 i2 84 | A 5 i3 85 | B 1 i4 86 | B 8 i5 87 | B 0 i6 88 | B 3 i7 89 | EOF 90 | 91 | my $full_deprecation = "$prog: Using -f/--full with non-linewise operations " .
92 | "is deprecated and will be disabled in a future release.\n"; 93 | 94 | my @Tests = 95 | ( 96 | # Test 'min' + --full 97 | # Test with "--full", "i2" and "i6" should be displayed 98 | ['slct2dep', '-t" " -f -g1 min 2', {IN_PIPE=>$in_full1}, 99 | {OUT=>"A 3 i2 3\nB 0 i6 0\n"}, 100 | {ERR=>"$full_deprecation"}], 101 | # --full with --sort => should not change results 102 | ['slct3dep', '-s -t" " -f -g1 min 2', {IN_PIPE=>$in_full1}, 103 | {OUT=>"A 3 i2 3\nB 0 i6 0\n"}, 104 | {ERR=>"$full_deprecation"}], 105 | 106 | # Test 'max' + --full 107 | # Test with "--full", "i3" and "i5" should be displayed 108 | ['slct5dep', '-t" " -f -g1 max 2', {IN_PIPE=>$in_full1}, 109 | {OUT=>"A 5 i3 5\nB 8 i5 8\n"}, 110 | {ERR=>"$full_deprecation"}], 111 | # --full with --sort => should not change results 112 | ['slct6dep', '-s -t" " -f -g1 max 2', {IN_PIPE=>$in_full1}, 113 | {OUT=>"A 5 i3 5\nB 8 i5 8\n"}, 114 | {ERR=>"$full_deprecation"}], 115 | 116 | # Test 'first' + --full 117 | # Test with "--full", "i1" and "i4" should be displayed 118 | ['slct8dep', '-t" " -f -g1 first 2', {IN_PIPE=>$in_full1}, 119 | {OUT=>"A 4 i1 4\nB 1 i4 1\n"}, 120 | {ERR=>"$full_deprecation"}], 121 | # more --full with --sort => see test 'sortslct1dep' below 122 | 123 | # Test 'last' + --full 124 | # Test with "--full", "i3" and "i7" should be displayed 125 | ['slct10dep', '-t" " -f -g1 last 2', {IN_PIPE=>$in_full1}, 126 | {OUT=>"A 5 i3 5\nB 3 i7 3\n"}, 127 | {ERR=>"$full_deprecation"}], 128 | ); 129 | 130 | if ($have_stable_sort) { 131 | push @Tests, ( 132 | # Test 'first' + --full + --sort 133 | # NOTE: This is subtle: 134 | # Sorting should be stable: only ordering the column which is used 135 | # for grouping (column 1 in this test). This means that the second 136 | # column (containing numbers) should NOT affect sorting, and the order 137 | # of the lines should not change. The results of this test 138 | # should be the same as 'slct8dep'.
If the system doesn't have stable 139 | # 'sort', then the order will change. 140 | ['sortslct1dep', '-s -t" " -f -g1 first 2', {IN_PIPE=>$in_full1}, 141 | {OUT=>"A 4 i1 4\nB 1 i4 1\n"}, 142 | {ERR=>"$full_deprecation"}], 143 | # Test 'last' + --full + --sort 144 | # See note above regarding 'first' - applies to 'last' as well. 145 | ['sortslct2dep', '-s -t" " -f -g1 last 2', {IN_PIPE=>$in_full1}, 146 | {OUT=>"A 5 i3 5\nB 3 i7 3\n"}, 147 | {ERR=>"$full_deprecation"}], 148 | ) 149 | } 150 | 151 | my $save_temps = $ENV{SAVE_TEMPS}; 152 | my $verbose = $ENV{VERBOSE}; 153 | 154 | my $fail = run_tests ($program_name, $prog, \@Tests, $save_temps, $verbose); 155 | exit $fail; 156 | -------------------------------------------------------------------------------- /src/decorate-functions.c: -------------------------------------------------------------------------------- 1 | /* Decorate functions 2 | 3 | Copyright (C) 2020-2021 Assaf Gordon 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see . 
*/ 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | #include "system.h" 29 | #include "die.h" 30 | #include "decorate-functions.h" 31 | 32 | 33 | static bool 34 | decorate_as_is (const char* in) 35 | { 36 | fprintf (stdout, "%s", in); 37 | return true; 38 | } 39 | 40 | static bool 41 | decorate_strlen (const char* in) 42 | { 43 | uintmax_t u = (uintmax_t)strlen (in); 44 | printf ("%0*"PRIuMAX, (int)INT_BUFSIZE_BOUND (u), u); 45 | return true; 46 | } 47 | 48 | static _GL_ATTRIBUTE_CONST int 49 | roman_numeral_to_value (char c) 50 | { 51 | switch (c) 52 | { 53 | case 'M': return 1000; 54 | case 'D': return 500; 55 | case 'C': return 100; 56 | case 'L': return 50; 57 | case 'X': return 10; 58 | case 'V': return 5; 59 | case 'I': return 1; 60 | default: return 0; 61 | } 62 | } 63 | 64 | 65 | /* Naive implementation of Roman numerals conversion. 66 | Does not support alternative forms such as 67 | XIIX,IIXX for 18, 68 | IIC for 98. 
*/ 69 | static bool 70 | decorate_roman_numerals (const char* in) 71 | { 72 | intmax_t result = 0; 73 | intmax_t cur,last = 0; 74 | if (*in=='\0') 75 | { 76 | error (0, 0, _("invalid empty roman numeral")); 77 | return false; 78 | } 79 | while (*in) 80 | { 81 | cur = roman_numeral_to_value (*in); 82 | if (!cur) 83 | { 84 | error (0, 0, _("invalid roman numeral '%c' in %s"), *in, quote (in)); 85 | return false; 86 | } 87 | 88 | if (last) 89 | { 90 | if (last >= cur) 91 | { 92 | result += last; 93 | } 94 | else 95 | { 96 | result += (cur - last); 97 | cur = 0; 98 | } 99 | } 100 | 101 | last = cur; 102 | ++in; 103 | } 104 | 105 | result += last; 106 | printf ("%0*"PRIiMAX, (int)INT_BUFSIZE_BOUND (result), result); 107 | return true; 108 | } 109 | 110 | static bool 111 | decorate_ipv4_inet_addr (const char* in) 112 | { 113 | struct in_addr adr; 114 | int s; 115 | 116 | s = inet_aton (in, &adr); 117 | 118 | if (s == 0) 119 | { 120 | error (0, 0, _("invalid IPv4 address %s"), quote (in)); 121 | return false; 122 | } 123 | 124 | 125 | printf ("%08X", ntohl (adr.s_addr)); 126 | return true; 127 | } 128 | 129 | 130 | static bool 131 | decorate_ipv4_dot_decimal (const char* in) 132 | { 133 | struct in_addr adr; 134 | int s; 135 | 136 | s = inet_pton (AF_INET, in, &adr); 137 | 138 | if (s < 0) 139 | die (SORT_FAILURE, errno, _("inet_pton (AF_INET) failed")); 140 | 141 | if (s == 0) 142 | { 143 | error (0, 0, _("invalid dot-decimal IPv4 address %s"), quote (in)); 144 | return false; 145 | } 146 | 147 | printf ("%08X", ntohl (adr.s_addr)); 148 | return true; 149 | } 150 | 151 | 152 | static bool 153 | decorate_ipv6 (const char* in) 154 | { 155 | struct in6_addr adr; 156 | int s; 157 | 158 | s = inet_pton (AF_INET6, in, &adr); 159 | 160 | if (s < 0) 161 | die (SORT_FAILURE, errno, _("inet_pton (AF_INET6) failed")); 162 | 163 | if (s == 0) 164 | { 165 | error (0, 0, _("invalid IPv6 address %s"), quote (in)); 166 | return false; 167 | } 168 | 169 | /* A portable way to print IPv6 
binary representation. */ 170 | for (int i=0;i<16;i+=2) 171 | { 172 | printf ("%02X%02X", adr.s6_addr[i], adr.s6_addr[i+1]); 173 | if (i != 14) 174 | fputc (':', stdout); 175 | } 176 | 177 | return true; 178 | } 179 | 180 | 181 | static bool 182 | decorate_ipv6_ipv4 (const char* in, uint32_t mapping) 183 | { 184 | struct in_addr adr4; 185 | struct in6_addr adr6; 186 | int s4, s6; 187 | 188 | s4 = inet_pton (AF_INET, in, &adr4); 189 | s6 = inet_pton (AF_INET6, in, &adr6); 190 | 191 | if (s4 < 0 && s6 < 0) 192 | die (SORT_FAILURE, errno, _("inet_pton failed for AF_INET and AF_INET6")); 193 | 194 | if (!(s4 > 0 || s6 > 0)) 195 | { 196 | error (0, 0, _("invalid IP address %s"), quote (in)); 197 | return false; 198 | } 199 | 200 | if (s6) 201 | for (int i=0;i<16;++i) 202 | printf ("%02X", adr6.s6_addr[i]); 203 | else 204 | printf ("%024X%08X", mapping, ntohl (adr4.s_addr)); 205 | 206 | return true; 207 | } 208 | 209 | 210 | static bool 211 | decorate_ipv6_ipv4_mapped (const char* in) 212 | { 213 | return decorate_ipv6_ipv4 (in, 0xFFFF); 214 | } 215 | 216 | 217 | static bool 218 | decorate_ipv6_ipv4_compat (const char* in) 219 | { 220 | return decorate_ipv6_ipv4 (in, 0); 221 | } 222 | 223 | 224 | 225 | struct conversions_t builtin_conversions[] = { 226 | { "as-is", "copy as-is", decorate_as_is }, /* for debugging */ 227 | { "roman", "roman numerals", decorate_roman_numerals }, 228 | { "strlen", "length (in bytes) of the specified field", decorate_strlen }, 229 | { "ipv4", "dotted-decimal IPv4 addresses", decorate_ipv4_dot_decimal }, 230 | { "ipv6", "IPv6 addresses", decorate_ipv6 }, 231 | { "ipv4inet", "number-and-dots IPv4 addresses (incl. 
octal, hex values)", 232 | decorate_ipv4_inet_addr }, 233 | { "ipv6v4map", "IPv6 and IPv4 (as IPv4-Mapped IPv6) addresses", 234 | decorate_ipv6_ipv4_mapped}, 235 | { "ipv6v4comp", "IPv6 and IPv4 (as IPv4-Compatible IPv6) addresses", 236 | decorate_ipv6_ipv4_compat}, 237 | { NULL, NULL, 0 } 238 | }; 239 | -------------------------------------------------------------------------------- /src/crosstab.c: -------------------------------------------------------------------------------- 1 | /* GNU Datamash - perform simple calculation on input data 2 | 3 | Copyright (C) 2013-2021 Assaf Gordon 4 | 5 | This file is part of GNU Datamash. 6 | 7 | GNU Datamash is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | GNU Datamash is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with GNU Datamash. If not, see . 
19 | */ 20 | 21 | /* Written by Assaf Gordon */ 22 | #include 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | #include "hash.h" 30 | #include "hashcode-string2.h" 31 | #include "xalloc.h" 32 | 33 | #include "system.h" 34 | #include "crosstab.h" 35 | #include "utils.h" 36 | #include "text-options.h" 37 | 38 | 39 | static bool _GL_ATTRIBUTE_PURE 40 | str_comparator (const void* a, const void* b) 41 | { 42 | assert (a!=NULL && b!=NULL); /* LCOV_EXCL_LINE */ 43 | if (a==b) 44 | return true; 45 | return (STREQ ((const char*)a, (const char*)b)); 46 | } 47 | 48 | static size_t _GL_ATTRIBUTE_PURE 49 | hash_crosstab_data_cell (void const *x, size_t tablesize) 50 | { 51 | struct crosstab_datacell *dc = (struct crosstab_datacell*)x; 52 | 53 | const char *s; 54 | size_t h = 0; 55 | #define SIZE_BITS (sizeof (size_t) * CHAR_BIT) 56 | 57 | for (s = dc->row_name; *s; s++) 58 | h = *s + ((h << 9) | (h >> (SIZE_BITS - 9))); 59 | for (s = dc->col_name; *s; s++) 60 | h = *s + ((h << 9) | (h >> (SIZE_BITS - 9))); 61 | 62 | return h % tablesize; 63 | } 64 | 65 | static bool _GL_ATTRIBUTE_PURE 66 | crosstab_datacell_comparator (const void* a, const void* b) 67 | { 68 | assert (a!=NULL && b!=NULL); /* LCOV_EXCL_LINE */ 69 | if (a==b) 70 | return true; 71 | const struct crosstab_datacell *da = (struct crosstab_datacell*)a; 72 | const struct crosstab_datacell *db = (struct crosstab_datacell*)b; 73 | return (STREQ (da->row_name, db->row_name) 74 | && STREQ (da->col_name, db->col_name)); 75 | } 76 | 77 | 78 | static struct crosstab_datacell* 79 | new_datacell (const char* row, const char* col, const char* data) 80 | { 81 | struct crosstab_datacell *dc = xmalloc (sizeof (struct crosstab_datacell)); 82 | dc->row_name = row; 83 | dc->col_name = col; 84 | dc->data = xstrdup (data); 85 | return dc; 86 | } 87 | 88 | static void 89 | crosstab_datacell_free (void *a) 90 | { 91 | struct crosstab_datacell *dc = (struct crosstab_datacell*)a; 92 | dc->row_name = NULL; 93 | 
dc->col_name = NULL; 94 | /* syntax-check doesn't like casting the argument to free; free 95 | doesn't like const values passed to it. */ 96 | void *data = (void*)dc->data; 97 | free (data); 98 | dc->data = NULL; 99 | free (dc); 100 | } 101 | 102 | /* Setup needed variables for the cross-tabulation */ 103 | struct crosstab* 104 | crosstab_init () 105 | { 106 | struct crosstab *ct = XMALLOC (struct crosstab); 107 | 108 | ct->rows = hash_initialize (1000,NULL,hash_pjw,str_comparator,free); 109 | ct->columns = hash_initialize (1000,NULL,hash_pjw,str_comparator,free); 110 | ct->data = hash_initialize (1000,NULL,hash_crosstab_data_cell, 111 | crosstab_datacell_comparator, 112 | crosstab_datacell_free); 113 | return ct; 114 | } 115 | 116 | void 117 | crosstab_free (struct crosstab* ct) 118 | { 119 | assert (ct!=NULL); /* LCOV_EXCL_LINE */ 120 | hash_free (ct->rows); 121 | ct->rows = NULL; 122 | hash_free (ct->columns); 123 | ct->columns = NULL; 124 | hash_free (ct->data); 125 | ct->data = NULL; 126 | free (ct); 127 | } 128 | 129 | /* Add new cross-tabulation result */ 130 | void 131 | crosstab_add_result (struct crosstab* ct, 132 | const char* row, const char* col, const char* data) 133 | { 134 | const char* r = hash_lookup (ct->rows, row); 135 | if (r==NULL) 136 | r = hash_insert (ct->rows, xstrdup (row)); 137 | 138 | const char* c = hash_lookup (ct->columns, col); 139 | if (c==NULL) 140 | c = hash_insert (ct->columns, xstrdup (col)); 141 | 142 | struct crosstab_datacell *ctdc = new_datacell (r,c,data); 143 | struct crosstab_datacell *existing_ctdc = hash_insert (ct->data, ctdc); 144 | if (ctdc != existing_ctdc) { 145 | crosstab_datacell_free (ctdc); 146 | } 147 | } 148 | 149 | 150 | /* Print table */ 151 | void 152 | crosstab_print (const struct crosstab* ct) 153 | { 154 | const size_t n_rows = hash_get_n_entries (ct->rows); 155 | char** rows_list = XNMALLOC (n_rows,char*); 156 | hash_get_entries (ct->rows, (void**)rows_list, n_rows); 157 | qsort (rows_list, n_rows, 
sizeof (char*), cmpstringp); 158 | 159 | const size_t n_cols = hash_get_n_entries (ct->columns); 160 | char** cols_list = XNMALLOC (n_cols,char*); 161 | hash_get_entries (ct->columns, (void**)cols_list, n_cols); 162 | qsort (cols_list, n_cols, sizeof (char*), cmpstringp); 163 | 164 | /* Print the header line: each sorted column name preceded by a field separator, so the row-name column is left empty */ 165 | for (size_t c = 0; c < n_cols; ++c) 166 | { 167 | print_field_separator (); 168 | fputs (cols_list[c], stdout); 169 | } 170 | print_line_separator (); 171 | 172 | /* Print one line per sorted row name: the name, then one cell per column (missing_field_filler when no cell was recorded) */ 173 | for (size_t r = 0; r < n_rows; ++r) 174 | { 175 | fputs (rows_list[r], stdout); 176 | 177 | for (size_t c = 0; c < n_cols; ++c) 178 | { 179 | struct crosstab_datacell curr; 180 | curr.row_name = rows_list[r]; 181 | curr.col_name = cols_list[c]; 182 | 183 | const struct crosstab_datacell *dc = hash_lookup (ct->data, &curr); 184 | print_field_separator (); 185 | fputs ((dc==NULL)?missing_field_filler:dc->data, stdout); 186 | } 187 | 188 | print_line_separator (); 189 | } 190 | 191 | free (rows_list); 192 | free (cols_list); 193 | } 194 | /* vim: set cinoptions=>4,n-2,{2,^-2,:2,=2,g0,h2,p5,t0,+2,(0,u0,w1,m1: */ 195 | /* vim: set shiftwidth=2: */ 196 | /* vim: set tabstop=8: */ 197 | /* vim: set expandtab: */ 198 | -------------------------------------------------------------------------------- /tests/datamash-crosstab.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | =pod 3 | Unit Tests for GNU Datamash - perform simple calculation on input data 4 | Tests for the 'crosstab' (alias 'ct') operation mode. 5 | 6 | 7 | Copyright (C) 2013-2021 Assaf Gordon 8 | Copyright (C) 2022-2025 Timothy Rice 9 | 10 | This file is part of GNU Datamash. 11 | 12 | GNU Datamash is free software: you can redistribute it and/or modify 13 | it under the terms of the GNU General Public License as published by 14 | the Free Software Foundation, either version 3 of the License, or 15 | (at your option) any later version.
16 | 17 | GNU Datamash is distributed in the hope that it will be useful, 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | GNU General Public License for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with GNU Datamash. If not, see . 24 | 25 | Written by Assaf Gordon. 26 | =cut 27 | use strict; 28 | use warnings; 29 | use List::Util qw/max/; 30 | use Data::Dumper; 31 | 32 | # Until a better way comes along to auto-use Coreutils Perl modules 33 | # as in the coreutils' autotools system. 34 | use Coreutils; 35 | use CuSkip; 36 | use CuTmpdir qw(datamash); 37 | 38 | (my $program_name = $0) =~ s|.*/||; 39 | my $prog_bin = 'datamash'; 40 | # Ask the binary how it names itself in error messages; fall back to the plain name if it prints nothing, so the expected ERR strings never start with an empty program name. 41 | my $prog = `$prog_bin ---print-progname`; 42 | $prog = $prog_bin unless $prog; 43 | # Turn off localization of executable's output. 44 | @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3; 45 | 46 | my $in1=<<'EOF'; 47 | a x 1 48 | a y 2 49 | a z 3 50 | b x 4 51 | b y 5 52 | b z 6 53 | c x 7 54 | c y 8 55 | c z 9 56 | EOF 57 | 58 | my $out1_first=<<'EOF'; 59 | x y z 60 | a 1 2 3 61 | b 4 5 6 62 | c 7 8 9 63 | EOF 64 | 65 | my $out1_count=<<'EOF'; 66 | x y z 67 | a 1 1 1 68 | b 1 1 1 69 | c 1 1 1 70 | EOF 71 | 72 | #unsorted input with duplicates 73 | my $in2=<<'EOF'; 74 | a x 1 75 | a y 2 76 | a x 3 77 | EOF 78 | 79 | # when using 'first' operation 80 | my $out2_first=<<'EOF'; 81 | x y 82 | a 1 2 83 | EOF 84 | 85 | # when using 'last' operation without sorting the data 86 | # this output is considered incorrect... 87 | # TODO: perhaps warn the user about it (like join warns of unsorted input).
88 | my $out2_last_unsorted=<<'EOF'; 89 | x y 90 | a 1 2 91 | EOF 92 | 93 | # when using 'last' operation with sorting the data, 94 | # correct value is shown 95 | my $out2_last_sorted=<<'EOF'; 96 | x y 97 | a 3 2 98 | EOF 99 | 100 | my $out2_count_unsorted=<<'EOF'; 101 | x y 102 | a 1 1 103 | EOF 104 | 105 | my $out2_count_sorted=<<'EOF'; 106 | x y 107 | a 2 1 108 | EOF 109 | 110 | # using 'sum' without sorting the data. 111 | # this output is considered incorrect... 112 | # TODO: perhaps warn the user about it (like join warns of unsorted input). 113 | my $out2_sum_unsorted=<<'EOF'; 114 | x y 115 | a 1 2 116 | EOF 117 | 118 | # using 'sum' with sorting the data 119 | my $out2_sum_sorted=<<'EOF'; 120 | x y 121 | a 4 2 122 | EOF 123 | 124 | #input with missing values (b/y is missing) 125 | my $in3=<<'EOF'; 126 | a x 1 127 | a y 2 128 | b x 3 129 | EOF 130 | 131 | # default filler is 'N/A' 132 | my $out3_na=<<'EOF'; 133 | x y 134 | a 1 2 135 | b 3 N/A 136 | EOF 137 | 138 | # custom filler 'XX' 139 | my $out3_xx=<<'EOF'; 140 | x y 141 | a 1 2 142 | b 3 XX 143 | EOF 144 | 145 | my $in4=<<'EOF'; 146 | x y 147 | 1 0.5 148 | 2 1 149 | 3 1.5 150 | 4 2 151 | EOF 152 | 153 | my $out4_hdr=<<'EOF'; 154 | GroupBy(x) GroupBy(y) count(x) 155 | 0.5 1 1.5 2 156 | 1 1 N/A N/A N/A 157 | 2 N/A 1 N/A N/A 158 | 3 N/A N/A 1 N/A 159 | 4 N/A N/A N/A 1 160 | EOF 161 | 162 | my $out4_no_hdr=<<'EOF'; 163 | 0.5 1 1.5 2 164 | 1 1 N/A N/A N/A 165 | 2 N/A 1 N/A N/A 166 | 3 N/A N/A 1 N/A 167 | 4 N/A N/A N/A 1 168 | EOF 169 | 170 | my @Tests = 171 | ( 172 | ['c1','crosstab 1,2 first 3', {IN_PIPE=>$in1}, {OUT=>$out1_first}], 173 | ['c2','ct 1,2 first 3', {IN_PIPE=>$in1}, {OUT=>$out1_first}], 174 | ['c3','ct 1,2 count 1', {IN_PIPE=>$in1}, {OUT=>$out1_count}], 175 | ['c4','ct 1-2 count 1', {IN_PIPE=>$in1}, {OUT=>$out1_count}], 176 | 177 | # Default operation is count 178 | ['c5','ct 1,2', {IN_PIPE=>$in1}, {OUT=>$out1_count}], 179 | 180 | # test unsorted input with duplicates 181 | ['c10','ct 1,2 
first 3', {IN_PIPE=>$in2}, {OUT=>$out2_first}], 182 | 183 | ['c11',' ct 1,2 last 3', {IN_PIPE=>$in2}, {OUT=>$out2_last_unsorted}], 184 | ['c12','-s ct 1,2 last 3', {IN_PIPE=>$in2}, {OUT=>$out2_last_sorted}], 185 | 186 | ['c13',' ct 1,2 sum 3', {IN_PIPE=>$in2}, {OUT=>$out2_sum_unsorted}], 187 | ['c14','-s ct 1,2 sum 3', {IN_PIPE=>$in2}, {OUT=>$out2_sum_sorted}], 188 | 189 | # test default operation (count) on unsorted data 190 | ['c15',' ct 1,2 count 3', {IN_PIPE=>$in2}, {OUT=>$out2_count_unsorted}], 191 | ['c16','-s ct 1,2 count 3', {IN_PIPE=>$in2}, {OUT=>$out2_count_sorted}], 192 | 193 | # test headers 194 | ['c17','-W --header-in --header-out ct x,y', 195 | {IN_PIPE=>$in4}, {OUT=>$out4_hdr}], 196 | ['c18','-W --header-in ct x,y', 197 | {IN_PIPE=>$in4}, {OUT=>$out4_no_hdr}], 198 | 199 | # Test missing values 200 | ['c30','ct 1,2 first 3', {IN_PIPE=>$in3}, {OUT=>$out3_na}], 201 | ['c31','--filler XX ct 1,2 first 3', {IN_PIPE=>$in3}, {OUT=>$out3_xx}], 202 | 203 | # Test wrong usage 204 | ['e1', 'ct', {IN_PIPE=>""}, {EXIT=>1}, 205 | {ERR=>"$prog: missing field for operation 'crosstab'\n"}], 206 | ['e2', 'ct 1', {IN_PIPE=>""}, {EXIT=>1}, 207 | {ERR=>"$prog: crosstab requires exactly 2 fields, found 1\n"}], 208 | ['e3', 'ct 1,2,3,4', {IN_PIPE=>""}, {EXIT=>1}, 209 | {ERR=>"$prog: crosstab requires exactly 2 fields, found 4\n"}], 210 | ['e4', 'ct 1,2 md5 4', {IN_PIPE=>""}, {EXIT=>1}, 211 | {ERR=>"$prog: conflicting operation found: expecting crosstab " . 
212 | "operations, but found line operation 'md5'\n"}], 213 | ['e5', 'ct 1,2 sum 1,2', {IN_PIPE=>""}, {EXIT=>1}, 214 | {ERR=>"$prog: crosstab supports one operation, found 2\n"}], 215 | ['e6', 'ct 1,2 min 2 max 2', {IN_PIPE=>""}, {EXIT=>1}, 216 | {ERR=>"$prog: crosstab supports one operation, found 2\n"}], 217 | ['e7', 'ct 1:2', {IN_PIPE=>""}, {EXIT=>1}, 218 | {ERR=>"$prog: invalid field pair for operation 'crosstab'\n"}], 219 | ['e8', 'ct 1-3', {IN_PIPE=>""}, {EXIT=>1}, 220 | {ERR=>"$prog: crosstab requires exactly 2 fields, found 3\n"}], 221 | ); 222 | 223 | my $save_temps = $ENV{SAVE_TEMPS}; 224 | my $verbose = $ENV{VERBOSE}; 225 | 226 | my $fail = run_tests ($program_name, $prog_bin, \@Tests, $save_temps, $verbose); 227 | exit $fail; 228 | -------------------------------------------------------------------------------- /tests/datamash-check.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | =pod 3 | Unit Tests for GNU Datamash - perform simple calculation on input data 4 | Tests for 'check' operation mode 5 | 6 | Copyright (C) 2013-2021 Assaf Gordon 7 | Copyright (C) 2022-2025 Timothy Rice 8 | 9 | This file is part of GNU Datamash. 10 | 11 | GNU Datamash is free software: you can redistribute it and/or modify 12 | it under the terms of the GNU General Public License as published by 13 | the Free Software Foundation, either version 3 of the License, or 14 | (at your option) any later version. 15 | 16 | GNU Datamash is distributed in the hope that it will be useful, 17 | but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | GNU General Public License for more details. 20 | 21 | You should have received a copy of the GNU General Public License 22 | along with GNU Datamash. If not, see . 23 | 24 | Written by Assaf Gordon. 
25 | =cut 26 | use strict; 27 | use warnings; 28 | use List::Util qw/max/; 29 | use Data::Dumper; 30 | 31 | # Until a better way comes along to auto-use Coreutils Perl modules 32 | # as in the coreutils' autotools system. 33 | use Coreutils; 34 | use CuSkip; 35 | use CuTmpdir qw(datamash); 36 | 37 | (my $program_name = $0) =~ s|.*/||; 38 | my $prog_bin = 'datamash'; 39 | 40 | ## Cross-Compiling portability hack: 41 | ## under qemu/binfmt, argv[0] (which is used to report errors) will contain 42 | ## the full path of the binary, if the binary is on the $PATH. 43 | ## So we try to detect what is the actual returned value of the program 44 | ## in case of an error. 45 | my $prog = `$prog_bin --foobar 2>&1 | head -n 1 | cut -f1 -d:`; 46 | chomp $prog if $prog; 47 | $prog = $prog_bin unless $prog; 48 | 49 | # Turn off localization of executable's output. 50 | @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3; 51 | 52 | 53 | my $in1=<<'EOF'; 54 | A 1 ! 55 | B 2 @ 56 | C 3 # 57 | D 4 $ 58 | E 5 % 59 | EOF 60 | 61 | my $in2=<<'EOF'; 62 | A 1 63 | B 64 | C 3 65 | EOF 66 | 67 | my $in3=<<'EOF'; 68 | A 69 | EOF 70 | 71 | my $in4=<<'EOF'; 72 | #comment 73 | A 74 | ;comment 75 | 76 | B 77 | EOF 78 | 79 | 80 | my @Tests = 81 | ( 82 | # Simple check, with no expected line/field counts given 83 | ['c1', 'check', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}], 84 | 85 | # Variations on command-line parsing 86 | ['c2', 'check 3 field', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}], 87 | ['c3', 'check 3 fields', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}], 88 | ['c4', 'check 3 col', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}], 89 | ['c5', 'check 3 columns', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}], 90 | ['c6', 'check 3 column', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}], 91 | 92 | ['c7', 'check field 3', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}], 93 | ['c8', 'check fields 3', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}], 94 | ['c9', 'check col 3', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3
fields\n"}], 95 | ['c10', 'check columns 3', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}], 96 | ['c11', 'check column 3', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}], 97 | 98 | ['c12', 'check 5 lines', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}], 99 | ['c13', 'check 5 line', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}], 100 | ['c14', 'check 5 rows', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}], 101 | ['c15', 'check 5 row', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}], 102 | 103 | ['c16', 'check lines 5', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}], 104 | ['c17', 'check line 5', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}], 105 | ['c18', 'check row 5', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}], 106 | ['c19', 'check rows 5', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}], 107 | 108 | 109 | # Duplicated options 110 | ['e1', 'check rows 5 lines 6', {IN_PIPE=>$in1}, {EXIT=>1}, 111 | {ERR=>"$prog: number of lines/rows already set in operation 'check'\n"}], 112 | ['e2', 'check fields 6 fields 1', {IN_PIPE=>$in1}, {EXIT=>1}, 113 | {ERR=>"$prog: number of fields/columns already set " . 
114 | "in operation 'check'\n"}], 115 | 116 | # Invalid values 117 | ['e3', 'check 0 lines', {IN_PIPE=>$in1}, {EXIT=>1}, 118 | {ERR=>"$prog: invalid value zero for lines/fields in operation 'check'\n"}], 119 | ['e4', 'check 0 fields', {IN_PIPE=>$in1}, {EXIT=>1}, 120 | {ERR=>"$prog: invalid value zero for lines/fields in operation 'check'\n"}], 121 | 122 | 123 | 124 | # Check lines 125 | ['c40', 'check 4 lines', {IN_PIPE=>$in1}, {EXIT=>1}, 126 | {ERR=>"$prog: check failed: input had 5 lines (expecting 4)\n"}], 127 | ['c41', 'check 6 lines', {IN_PIPE=>$in1}, {EXIT=>1}, 128 | {ERR=>"$prog: check failed: input had 5 lines (expecting 6)\n"}], 129 | ['c42', 'check 6 lines', {IN_PIPE=>""}, {EXIT=>1}, 130 | {ERR=>"$prog: check failed: input had 0 lines (expecting 6)\n"}], 131 | 132 | # Check fields 133 | ['c60', 'check 2 fields', {IN_PIPE=>$in1}, {EXIT=>1}, 134 | {ERR=>"line 1 (3 fields):\n" . 135 | " A\t1\t!\n" . 136 | "$prog: check failed: line 1 has 3 fields (expecting 2)\n"}], 137 | 138 | 139 | # Check matrix structure, no expected number of fields 140 | ['c61', 'check', {IN_PIPE=>$in2}, {EXIT=>1}, 141 | {ERR=>"line 1 (2 fields):\n" . 142 | " A\t1\n" . 143 | "line 2 (1 fields):\n" . 144 | " B\n" . 145 | "$prog: check failed: line 2 has 1 fields (previous line had 2)\n"}], 146 | 147 | # With expected number of fields 148 | ['c62', 'check 2 fields', {IN_PIPE=>$in2}, {EXIT=>1}, 149 | {ERR=>"line 2 (1 fields):\n" . 150 | " B\n" . 151 | "$prog: check failed: line 2 has 1 fields (expecting 2)\n"}], 152 | 153 | # no special treatment for comments or empty lines by default 154 | ['c63', 'check', {IN_PIPE=>$in4}, {EXIT=>1}, 155 | {ERR=>"line 3 (1 fields):\n" . 156 | " ;comment\n" . 157 | "line 4 (0 fields):\n" . 158 | " \n" . 159 | "$prog: check failed: line 4 has 0 fields (previous line had 1)\n"}], 160 | # --skip-comments skips only comment lines, but not empty lines 161 | ['c64', '--skip-comments check', {IN_PIPE=>$in4}, {EXIT=>1}, 162 | {ERR=>"line 1 (1 fields):\n" . 
163 | " A\n" . 164 | "line 2 (0 fields):\n" . 165 | " \n" . 166 | "$prog: check failed: line 2 has 0 fields (previous line had 1)\n"}], 167 | # --vnlog skips both comment and empty lines, but only '#' starts a comment 168 | # (the first line is actually the vnlog header and requires the same number 169 | # of fields as the data lines, but this test is about ignoring empty lines) 170 | ['c65', '--vnlog check', {IN_PIPE=>$in4}, {OUT=>"3 lines, 1 field\n"}], 171 | ); 172 | 173 | my $save_temps = $ENV{SAVE_TEMPS}; 174 | my $verbose = $ENV{VERBOSE}; 175 | 176 | my $fail = run_tests ($program_name, $prog_bin, \@Tests, $save_temps, $verbose); 177 | exit $fail; 178 | -------------------------------------------------------------------------------- /tests/decorate-sort-tests.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | =pod 3 | Unit Tests for decorate 4 | 5 | Copyright (C) 2020-2021 Assaf Gordon 6 | Copyright (C) 2022-2025 Timothy Rice 7 | 8 | This file is part of GNU Datamash. 9 | 10 | GNU Datamash is free software: you can redistribute it and/or modify 11 | it under the terms of the GNU General Public License as published by 12 | the Free Software Foundation, either version 3 of the License, or 13 | (at your option) any later version. 14 | 15 | GNU Datamash is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | GNU General Public License for more details. 19 | 20 | You should have received a copy of the GNU General Public License 21 | along with GNU Datamash. If not, see . 22 | 23 | Written by Assaf Gordon. 24 | =cut 25 | use strict; 26 | use warnings; 27 | 28 | # Until a better way comes along to auto-use Coreutils Perl modules 29 | # as in the coreutils' autotools system. 
30 | use Coreutils; 31 | use CuSkip; 32 | use CuTmpdir qw(decorate); 33 | use MIME::Base64 ; 34 | 35 | (my $program_name = $0) =~ s|.*/||; 36 | my $prog = 'decorate'; 37 | $prog .= " --sort-cmd=/usr/bin/sort" if $^O eq "netbsd"; 38 | 39 | # TODO: add localization tests with "grouping" 40 | # Turn off localization of executable's output. 41 | @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3; 42 | 43 | 44 | my $in1=<<'EOF'; 45 | I 1.20.30.41 46 | II 1.20.30.1 47 | IV 1.10.30.14 48 | M 1.2.10.3 49 | M 192.168.43.1 50 | II 1.20.30.41 51 | D 192.168.17.8 52 | C 1.2.10.3 53 | L 192.168.17.10 54 | EOF 55 | 56 | 57 | my $out1_dec_roman=<<'EOF'; 58 | I 1.20.30.41 59 | II 1.20.30.1 60 | II 1.20.30.41 61 | IV 1.10.30.14 62 | L 192.168.17.10 63 | C 1.2.10.3 64 | D 192.168.17.8 65 | M 1.2.10.3 66 | M 192.168.43.1 67 | EOF 68 | 69 | my $out1_dec_ipv4=<<'EOF'; 70 | C 1.2.10.3 71 | M 1.2.10.3 72 | IV 1.10.30.14 73 | II 1.20.30.1 74 | I 1.20.30.41 75 | II 1.20.30.41 76 | D 192.168.17.8 77 | L 192.168.17.10 78 | M 192.168.43.1 79 | EOF 80 | 81 | my $out1_dec_ipv4_stable=<<'EOF'; 82 | M 1.2.10.3 83 | C 1.2.10.3 84 | IV 1.10.30.14 85 | II 1.20.30.1 86 | I 1.20.30.41 87 | II 1.20.30.41 88 | D 192.168.17.8 89 | L 192.168.17.10 90 | M 192.168.43.1 91 | EOF 92 | 93 | 94 | my $out1_dec_ipv4_rev=<<'EOF'; 95 | M 192.168.43.1 96 | L 192.168.17.10 97 | D 192.168.17.8 98 | I 1.20.30.41 99 | II 1.20.30.41 100 | II 1.20.30.1 101 | IV 1.10.30.14 102 | C 1.2.10.3 103 | M 1.2.10.3 104 | EOF 105 | 106 | my $out1_dec_ipv4_rev_header1=<<'EOF'; 107 | I 1.20.30.41 108 | M 192.168.43.1 109 | L 192.168.17.10 110 | D 192.168.17.8 111 | II 1.20.30.41 112 | II 1.20.30.1 113 | IV 1.10.30.14 114 | C 1.2.10.3 115 | M 1.2.10.3 116 | EOF 117 | 118 | my $out1_dec_ipv4_rev_header2=<<'EOF'; 119 | I 1.20.30.41 120 | II 1.20.30.1 121 | M 192.168.43.1 122 | L 192.168.17.10 123 | D 192.168.17.8 124 | II 1.20.30.41 125 | IV 1.10.30.14 126 | C 1.2.10.3 127 | M 1.2.10.3 128 | EOF 129 | 130 | 131 | my 
$out1_dec_roman_ipv4rev=<<'EOF'; 132 | I 1.20.30.41 133 | II 1.20.30.41 134 | II 1.20.30.1 135 | IV 1.10.30.14 136 | L 192.168.17.10 137 | C 1.2.10.3 138 | D 192.168.17.8 139 | M 192.168.43.1 140 | M 1.2.10.3 141 | EOF 142 | 143 | my $out1_dec_ipv4_romanrev=<<'EOF'; 144 | M 1.2.10.3 145 | C 1.2.10.3 146 | IV 1.10.30.14 147 | II 1.20.30.1 148 | II 1.20.30.41 149 | I 1.20.30.41 150 | D 192.168.17.8 151 | L 192.168.17.10 152 | M 192.168.43.1 153 | EOF 154 | 155 | 156 | 157 | my $out1_dec_roman_k2=<<'EOF'; 158 | I 1.20.30.41 159 | II 1.20.30.1 160 | II 1.20.30.41 161 | IV 1.10.30.14 162 | L 192.168.17.10 163 | C 1.2.10.3 164 | D 192.168.17.8 165 | M 1.2.10.3 166 | M 192.168.43.1 167 | EOF 168 | 169 | my $out1_dec_k2n_roman=<<'EOF'; 170 | IV 1.10.30.14 171 | I 1.20.30.41 172 | II 1.20.30.1 173 | II 1.20.30.41 174 | C 1.2.10.3 175 | M 1.2.10.3 176 | L 192.168.17.10 177 | D 192.168.17.8 178 | M 192.168.43.1 179 | EOF 180 | 181 | my $in2=<<'EOF'; 182 | 203.0.113.47 183 | 192.0.2.33 184 | 203.0.113.0 185 | 192.0.2.3 186 | 0.0.0.0 187 | ::ffff:192.0.2.42 188 | 2001:db8:6:5:4:3:2:1 189 | 2001:Db8:: 190 | ::192.0.2.41 191 | ::1 192 | EOF 193 | 194 | my $out2_ipv6v4map=<<'EOF'; 195 | ::1 196 | ::192.0.2.41 197 | 0.0.0.0 198 | 192.0.2.3 199 | 192.0.2.33 200 | ::ffff:192.0.2.42 201 | 203.0.113.0 202 | 203.0.113.47 203 | 2001:Db8:: 204 | 2001:db8:6:5:4:3:2:1 205 | EOF 206 | 207 | my $out2_ipv6v4comp=<<'EOF'; 208 | 0.0.0.0 209 | ::1 210 | 192.0.2.3 211 | 192.0.2.33 212 | ::192.0.2.41 213 | 203.0.113.0 214 | 203.0.113.47 215 | ::ffff:192.0.2.42 216 | 2001:Db8:: 217 | 2001:db8:6:5:4:3:2:1 218 | EOF 219 | 220 | 221 | my @Tests = 222 | ( 223 | ['s1', '-k1,1:roman', {IN_PIPE=>$in1}, {OUT => $out1_dec_roman}], 224 | ['s2', '-k2,2:ipv4', {IN_PIPE=>$in1}, {OUT => $out1_dec_ipv4}], 225 | ['s3', '-k2,2:ipv4 --stable', {IN_PIPE=>$in1}, 226 | {OUT => $out1_dec_ipv4_stable}], 227 | ['s4', '-k2,2r:ipv4', {IN_PIPE=>$in1}, {OUT => $out1_dec_ipv4_rev}], 228 | 229 | ['s5', '-k1,1:roman -k2,2r:ipv4' 
, {IN_PIPE=>$in1}, 230 | {OUT => $out1_dec_roman_ipv4rev}], 231 | ['s6', '-k2,2:ipv4 -k1r,1:roman' , {IN_PIPE=>$in1}, 232 | {OUT => $out1_dec_ipv4_romanrev}], 233 | 234 | ['s10', '-k1,1:roman -k2,2' , {IN_PIPE=>$in1}, 235 | {OUT => $out1_dec_roman_k2}], 236 | ['s11', '-k2n,2 -k1,1:roman' , {IN_PIPE=>$in1}, 237 | {OUT => $out1_dec_k2n_roman}], 238 | 239 | ['s12', '-k1,1:ipv6v4map', {IN_PIPE=>$in2}, {OUT => $out2_ipv6v4map}], 240 | ['s13', '-k1,1:ipv6v4comp', {IN_PIPE=>$in2}, {OUT => $out2_ipv6v4comp}], 241 | 242 | 243 | # Sort with header lines 244 | ['sh1', '-H -k2,2r:ipv4', {IN_PIPE=>$in1}, {OUT=>$out1_dec_ipv4_rev_header1}], 245 | ['sh2', '--header=2 -k2,2r:ipv4', {IN_PIPE=>$in1}, 246 | {OUT => $out1_dec_ipv4_rev_header2}], 247 | # More header lines than in the input 248 | ['sh3', '--header=9 -k2,2r:ipv4', {IN_PIPE=>$in1}, {OUT => $in1}], 249 | ['sh4', '--header=10 -k2,2r:ipv4', {IN_PIPE=>$in1}, {OUT => $in1}], 250 | 251 | ); 252 | 253 | # Repeat all tests with --debug option, ensure it does not cause any regression 254 | my @debug_tests; 255 | foreach my $t (@Tests) 256 | { 257 | # Skip tests with EXIT!=0 or ERR_SUBST part 258 | # (as '--debug' requires its own ERR_SUBST). 259 | my $exit_val; 260 | my $have_err_subst; 261 | foreach my $e (@$t) 262 | { 263 | next unless ref $e && ref $e eq 'HASH'; 264 | $exit_val = $e->{EXIT} if defined $e->{EXIT}; 265 | $have_err_subst = 1 if defined $e->{ERR_SUBST}; 266 | } 267 | next if $exit_val || $have_err_subst; 268 | 269 | # Duplicate the test, add '--debug' argument 270 | my @newt = @$t; 271 | $newt[0] = 'dbg_' . $newt[0]; 272 | $newt[1] = '---debug ' . 
$newt[1]; 273 | 274 | # Discard all debug printouts before comparing output 275 | push @newt, {ERR_SUBST => q!s/.*\n//m!}; 276 | 277 | push @debug_tests, \@newt; 278 | } 279 | push @Tests, @debug_tests; 280 | 281 | 282 | my $save_temps = $ENV{SAVE_TEMPS}; 283 | my $verbose = $ENV{VERBOSE}; 284 | 285 | my $fail = run_tests ($program_name, $prog, \@Tests, $save_temps, $verbose); 286 | exit $fail; 287 | -------------------------------------------------------------------------------- /tests/datamash-transpose.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | =pod 3 | Unit Tests for GNU Datamash - perform simple calculation on input data 4 | Tests for 'transpose' and 'reverse' operation modes. 5 | 6 | 7 | Copyright (C) 2013-2021 Assaf Gordon 8 | Copyright (C) 2022-2025 Timothy Rice 9 | 10 | This file is part of GNU Datamash. 11 | 12 | GNU Datamash is free software: you can redistribute it and/or modify 13 | it under the terms of the GNU General Public License as published by 14 | the Free Software Foundation, either version 3 of the License, or 15 | (at your option) any later version. 16 | 17 | GNU Datamash is distributed in the hope that it will be useful, 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | GNU General Public License for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with GNU Datamash. If not, see . 24 | 25 | Written by Assaf Gordon. 26 | =cut 27 | use strict; 28 | use warnings; 29 | use List::Util qw/max/; 30 | use Data::Dumper; 31 | 32 | # Until a better way comes along to auto-use Coreutils Perl modules 33 | # as in the coreutils' autotools system. 
use Coreutils;
use CuSkip;
use CuTmpdir qw(datamash);

(my $program_name = $0) =~ s|.*/||;
my $prog_bin = 'datamash';

## Cross-Compiling portability hack:
## under qemu/binfmt, argv[0] (which is used to report errors) will contain
## the full path of the binary, if the binary is on the $PATH.
## So we try to detect what is the actual returned value of the program
## in case of an error.
my $prog = `$prog_bin --foobar 2>&1 | head -n 1 | cut -f1 -d:`;
chomp $prog if $prog;
$prog = $prog_bin unless $prog;

# Turn off localization of executable's output.
@ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;

# Reference implementation used to compute the expected output of the
# 'reverse' tests: reverse the order of the fields on every line.
#
#   $field_sep - literal field separator (NOT a regex)
#   $input     - newline-separated text
#
# Returns the text with the fields of each line reversed, newline-terminated;
# an empty input yields an empty string.
sub perl_reverse_fields
{
  my $field_sep = shift;
  my $input = shift;
  return '' unless length $input;

  # \Q..\E: the separator is matched literally, so characters such as '|'
  # or '.' are safe.  LIMIT -1: keep trailing empty fields, which split
  # drops by default.
  my @reversed = map {
    join ($field_sep, reverse (split /\Q$field_sep\E/, $_, -1))
  } split /\n/, $input;

  return join ("\n", @reversed) . "\n";
}

# Reference implementation used to compute the expected output of the
# 'transpose' tests: output line i is made of field i of every input line.
#
#   $field_sep - literal field separator (NOT a regex)
#   $filler    - value substituted when a line has no field i
#   $input     - newline-separated text
#
# Returns the transposed text, newline-terminated; an empty input yields
# an empty string.
sub perl_transpose
{
  my $field_sep = shift;
  my $filler = shift;
  my $input = shift;
  return '' unless length $input;

  my @rows = map { [ split /\Q$field_sep\E/, $_, -1 ] } split /\n/, $input;

  # The leading 0 keeps max() from returning undef when there are no rows.
  my $max_field = max (0, map { scalar (@$_) } @rows);

  my @output;
  foreach my $i (0 .. $max_field - 1) {
    # Short rows contribute the filler instead of a missing field.
    push @output,
      join ($field_sep, map { @$_ > $i ? $_->[$i] : $filler } @rows);
  }

  return join ("\n", @output) . "\n";
}

my $in1=<<'EOF';
A 1 !
92 | B 2 @ 93 | C 3 # 94 | D 4 $ 95 | E 5 % 96 | EOF 97 | 98 | my $in2 = $in1; 99 | $in2 =~ s/\t/:/gms; 100 | 101 | my $in3=<<'EOF'; 102 | A 1 103 | B 104 | C 3 105 | EOF 106 | 107 | my $in4=<<'EOF'; 108 | A 109 | B 110 | C 111 | EOF 112 | 113 | my $in5="A\tB\tC\tD\n"; 114 | 115 | my $in6="A\n"; 116 | 117 | my $in7=""; 118 | 119 | my $out1_rev = perl_reverse_fields ( "\t", $in1 ); 120 | my $out2_rev = perl_reverse_fields ( ":", $in2 ); 121 | my $out3_rev = perl_reverse_fields ( "\t", $in3 ); 122 | my $out4_rev = perl_reverse_fields ( "\t", $in4 ); 123 | my $out5_rev = perl_reverse_fields ( "\t", $in5 ); 124 | 125 | my $out1_tr = perl_transpose ( "\t", "N/A", $in1 ); 126 | my $out2_tr = perl_transpose ( ":", "N/A", $in2 ) ; 127 | my $out3_tr = perl_transpose ("\t", "N/A", $in3 ) ; 128 | my $out3_filler_tr = perl_transpose ("\t", "xxx", $in3 ) ; 129 | my $out4_tr = perl_transpose ("\t", "N/A", $in4 ) ; 130 | my $out5_tr = perl_transpose ("\t", "N/A", $in5 ) ; 131 | 132 | my $in_hdr1=<<'EOF'; 133 | X:Y 134 | 1:a 135 | 2:b 136 | EOF 137 | 138 | 139 | # Transposing with missing value in the last line 140 | # (bug in 1.1.0 would result in 'c' being silently dropped). 141 | my $in_missing1=<<'EOF'; 142 | a b c 143 | 1 2 144 | EOF 145 | my $out_missing1=<<'EOF'; 146 | a 1 147 | b 2 148 | c N/A 149 | EOF 150 | 151 | my @Tests = 152 | ( 153 | # Simple transpose and reverse 154 | ['tr1', 'transpose', {IN_PIPE=>$in1}, {OUT=>$out1_tr}], 155 | ['rev1', 'reverse', {IN_PIPE=>$in1}, {OUT=>$out1_rev}], 156 | 157 | # non-tab delimiter 158 | ['tr2', '-t: transpose', {IN_PIPE=>$in2}, {OUT=>$out2_tr}], 159 | ['rev2', '-t: reverse', {IN_PIPE=>$in2}, {OUT=>$out2_rev}], 160 | 161 | # missing fields, strict mode 162 | ['tr3', 'transpose', {IN_PIPE=>$in3}, {EXIT=>1}, 163 | {OUT_SUBST=>'s/.*//'}, 164 | {ERR=>"$prog: transpose input error: line 2 has 1 fields ". 165 | "(previous lines had 2);\n" . 
166 | "see --help to disable strict mode\n"}], 167 | ['rev3', 'reverse', {IN_PIPE=>$in3}, {EXIT=>1}, 168 | {OUT_SUBST=>'s/.*//s'}, 169 | {ERR=>"$prog: reverse-field input error: line 2 has 1 fields ". 170 | "(previous lines had 2);\n" . 171 | "see --help to disable strict mode\n"}], 172 | 173 | # missing fields, non-strict mode 174 | ['tr4', '--no-strict transpose', {IN_PIPE=>$in3}, {OUT=>$out3_tr}], 175 | ['rev4', '--no-strict reverse', {IN_PIPE=>$in3}, {OUT=>$out3_rev}], 176 | ['tr4.1', '--no-strict --filler xxx transpose', 177 | {IN_PIPE=>$in3}, {OUT=>$out3_filler_tr}], 178 | 179 | 180 | # Single column 181 | ['tr5', 'transpose', {IN_PIPE=>$in4}, {OUT=>$out4_tr}], 182 | ['rev5', 'reverse', {IN_PIPE=>$in4}, {OUT=>$out4_rev}], 183 | 184 | # Single row 185 | ['tr6', 'transpose', {IN_PIPE=>$in5}, {OUT=>$out5_tr}], 186 | ['rev6', 'reverse', {IN_PIPE=>$in5}, {OUT=>$out5_rev}], 187 | 188 | # Single field 189 | ['tr7', 'transpose', {IN_PIPE=>$in6}, {OUT=>$in6}], 190 | ['rev7', 'reverse', {IN_PIPE=>$in6}, {OUT=>$in6}], 191 | 192 | # Empty input 193 | ['tr8', 'transpose', {IN_PIPE=>$in7}, {OUT=>""}], 194 | ['rev8', 'reverse', {IN_PIPE=>$in7}, {OUT=>""}], 195 | 196 | # Extra operands 197 | ['tr9', 'transpose aaa', {IN_PIPE=>''}, {EXIT=>1}, 198 | {ERR=>"$prog: extra operand 'aaa'\n"}], 199 | ['rev9', 'reverse aaa', {IN_PIPE=>''}, {EXIT=>1}, 200 | {ERR=>"$prog: extra operand 'aaa'\n"}], 201 | 202 | # empty input 203 | ['tr10', 'transpose', {IN_PIPE=>""}, {OUT=>""}], 204 | ['rev10', 'reverse', {IN_PIPE=>""}, {OUT=>""}], 205 | 206 | # Reverse with header combinations 207 | ['rev-hdr1','-H reverse', {IN_PIPE=>""}, {OUT=>""}], 208 | ['rev-hdr2','--header-in reverse', {IN_PIPE=>""}, {OUT=>""}], 209 | ['rev-hdr3','-t: reverse', {IN_PIPE=>$in_hdr1}, 210 | {OUT=>"Y:X\na:1\nb:2\n"}], 211 | ['rev-hdr4','-t: -H reverse', {IN_PIPE=>$in_hdr1}, 212 | {OUT=>"Y:X\na:1\nb:2\n"}], 213 | # first line is header line, discard it (there's no --header-out). 
214 | ['rev-hdr5','-t: --header-in reverse', {IN_PIPE=>$in_hdr1}, 215 | {OUT=>"a:1\nb:2\n"}], 216 | # Generate a new header, assuming the first line is NOT a header line. 217 | ['rev-hdr6','-t: --header-out reverse', {IN_PIPE=>$in_hdr1}, 218 | {OUT=>"field-2:field-1\nY:X\na:1\nb:2\n"}], 219 | 220 | # bug uncovered by report in: 221 | # http://lists.gnu.org/archive/html/bug-datamash/2016-09/msg00000.html 222 | ['msg1', '--no-strict transpose', {IN_PIPE=>$in_missing1}, 223 | {OUT=>$out_missing1}], 224 | ); 225 | 226 | my $save_temps = $ENV{SAVE_TEMPS}; 227 | my $verbose = $ENV{VERBOSE}; 228 | 229 | my $fail = run_tests ($program_name, $prog_bin, \@Tests, $save_temps, $verbose); 230 | exit $fail; 231 | -------------------------------------------------------------------------------- /tests/datamash-pair-tests.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | =pod 3 | Unit Tests for GNU Datamash - perform simple calculation on input data 4 | 5 | Copyright (C) 2013-2021 Assaf Gordon 6 | Copyright (C) 2022-2025 Timothy Rice 7 | 8 | This file is part of GNU Datamash. 9 | 10 | GNU Datamash is free software: you can redistribute it and/or modify 11 | it under the terms of the GNU General Public License as published by 12 | the Free Software Foundation, either version 3 of the License, or 13 | (at your option) any later version. 14 | 15 | GNU Datamash is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | GNU General Public License for more details. 19 | 20 | You should have received a copy of the GNU General Public License 21 | along with GNU Datamash. If not, see <https://www.gnu.org/licenses/>. 22 | 23 | Written by Assaf Gordon. 24 | =cut 25 | use strict; 26 | use warnings; 27 | 28 | # Until a better way comes along to auto-use Coreutils Perl modules 29 | # as in the coreutils' autotools system.
30 | use Coreutils; 31 | use CuSkip; 32 | use CuTmpdir qw(datamash); 33 | use MIME::Base64 ; 34 | 35 | (my $program_name = $0) =~ s|.*/||; 36 | my $prog_bin = 'datamash'; 37 | 38 | ## Cross-Compiling portability hack: 39 | ## under qemu/binfmt, argv[0] (which is used to report errors) will contain 40 | ## the full path of the binary, if the binary is on the $PATH. 41 | ## So we try to detect what is the actual returned value of the program 42 | ## in case of an error. 43 | my $prog = `$prog_bin ---print-progname`; 44 | $prog = $prog_bin unless $prog; 45 | 46 | # TODO: add localization tests with "grouping" 47 | # Turn off localization of executable's output. 48 | @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3; 49 | 50 | ## 51 | ## Portability hack: 52 | ## find the exact wording of 'nan' (not-a-number) and 'inf' (infinity). 53 | ## It's lower case in GNU/Linux,FreeBSD,OpenBSD, 54 | ## but is "NaN" on Illumos/OpenSolaris 55 | my $nan = `$prog ---print-nan`; 56 | die "test infrastructure failed: can't determine 'nan' string" unless $nan; 57 | my $inf = `$prog ---print-inf`; 58 | die "test infrastructure failed: can't determine 'inf' string" unless $inf; 59 | 60 | =pod 61 | Equivalent R code 62 | 63 | pop.sd=function(x)(sqrt(var(x)*(length(x)-1)/length(x))) 64 | smp.sd=sd 65 | 66 | # alternatively, use the built-in covariance function: 67 | # smp.cov=cov 68 | smp.cov <- function(x,y) { 69 | stopifnot(identical(length(x), length(y))) 70 | sum((x - mean(x)) * (y - mean(y))) / (length(x) - 1) 71 | } 72 | pop.cov <- function(x,y) { 73 | stopifnot(identical(length(x), length(y))) 74 | sum((x - mean(x)) * (y - mean(y))) / (length(x) ) 75 | } 76 | 77 | # alternatively, use the built-in correlation function: 78 | # smp.pearsoncor=cor 79 | smp.pearsoncor=function(x,y) { smp.cov(x,y)/ ( smp.sd(x)*smp.sd(y) ) } 80 | pop.pearsoncor=function(x,y) { pop.cov(x,y)/ ( pop.sd(x)*pop.sd(y) ) } 81 | 82 | in1.x=c(-0.49,0.14,1.62,2.76,-0.46,3.28,-0.01,2.90,2.46,1.52) 83 |
in1.y=c(-0.21,-0.16,1.86,1.81,0.39,4.17,0.38,1.90,2.69,0.78) 84 | 85 | in2.x = c(1.599,-1.011,-1.687,5.070,6.944,7.934,2.134,5.150, 86 | 10.197,11.427,10.379,14.867,11.399,13.479,18.328,16.573, 87 | 17.804,18.694,16.690,21.805) 88 | in2.y = seq(20) 89 | 90 | =cut 91 | 92 | my $in1=<<"EOF"; 93 | -0.49 -0.21 94 | 0.14 -0.16 95 | 1.62 1.86 96 | 2.76 1.81 97 | -0.46 0.39 98 | 3.28 4.17 99 | -0.01 0.38 100 | 2.90 1.90 101 | 2.46 2.69 102 | 1.52 0.78 103 | EOF 104 | 105 | 106 | my $out1_scov=<<'EOF'; 107 | 1.802 108 | EOF 109 | 110 | my $out1_pcov=<<'EOF'; 111 | 1.622 112 | EOF 113 | 114 | my $out1_pcov_hdr=<<'EOF'; 115 | pcov(field-1,field-2) 116 | 1.622 117 | EOF 118 | 119 | my $out1_dotprod_hdr=<<'EOF'; 120 | dotprod(field-1,field-2) 121 | 34.896 122 | EOF 123 | 124 | my $in2=<<'EOF'; 125 | 1.599 1 126 | -1.011 2 127 | -1.687 3 128 | 5.070 4 129 | 6.944 5 130 | 7.934 6 131 | 2.134 7 132 | 5.150 8 133 | 10.197 9 134 | 11.427 10 135 | 10.379 11 136 | 14.867 12 137 | 11.399 13 138 | 13.479 14 139 | 18.328 15 140 | 16.573 16 141 | 17.804 17 142 | 18.694 18 143 | 16.690 19 144 | 21.805 20 145 | EOF 146 | 147 | my $out2_p=<<'EOF'; 148 | 0.944 149 | EOF 150 | 151 | my $out2_s=<<'EOF'; 152 | 0.944 153 | EOF 154 | 155 | my $in3=<<'EOF'; 156 | 1 2 157 | EOF 158 | 159 | my $in4=<<'EOF'; 160 | NA NA 161 | EOF 162 | 163 | my $in5=<<'EOF'; 164 | 1 2 165 | 2 NA 166 | 3 6 167 | EOF 168 | 169 | my $in6=<<'EOF'; 170 | x y 171 | 1 0.5 172 | 2 1 173 | 3 1.5 174 | 4 2 175 | EOF 176 | 177 | my $out6_pcov_hdr=<<'EOF'; 178 | pcov(x,y) 179 | 0.625 180 | EOF 181 | 182 | my $out6_scov_hdr=<<'EOF'; 183 | scov(x,y) 184 | 0.833 185 | EOF 186 | 187 | my $out6_ppears_hdr=<<'EOF'; 188 | ppearson(x,y) 189 | 1 190 | EOF 191 | 192 | my $out6_spears_hdr=<<'EOF'; 193 | spearson(x,y) 194 | 1 195 | EOF 196 | 197 | my $out6_dotprod_hdr=<<'EOF'; 198 | dotprod(x,y) 199 | 15 200 | EOF 201 | 202 | my @Tests = 203 | ( 204 | ['c1', 'scov 1:2', {IN_PIPE=>$in1}, {OUT=>$out1_scov}], 205 | ['c2', 'pcov 1:2', 
{IN_PIPE=>$in1}, {OUT=>$out1_pcov}], 206 | ['dp1', 'dotprod 1:2', {IN_PIPE=>$in1}, {OUT=>"34.896\n"}], 207 | 208 | # Pair with output headers - only one field and header should be printed 209 | ['c3', '--header-out pcov 1:2', {IN_PIPE=>$in1}, {OUT=>$out1_pcov_hdr}], 210 | ['c3_hin_p', '-W --header-in --header-out pcov x:y', 211 | {IN_PIPE=>$in6}, {OUT=>$out6_pcov_hdr}], 212 | ['c3_hin_s', '-W --header-in --header-out scov x:y', 213 | {IN_PIPE=>$in6}, {OUT=>$out6_scov_hdr}], 214 | 215 | ['p1', 'ppearson 1:2', {IN_PIPE=>$in2}, {OUT=>$out2_p}], 216 | ['p1_hin', '-W --header-in --header-out ppearson x:y', 217 | {IN_PIPE=>$in6}, {OUT=>$out6_ppears_hdr}], 218 | ['p2', 'spearson 1:2', {IN_PIPE=>$in2}, {OUT=>$out2_s}], 219 | ['p2_hin', '-W --header-in --header-out spearson x:y', 220 | {IN_PIPE=>$in6}, {OUT=>$out6_spears_hdr}], 221 | 222 | ['dp2', '--header-out dotprod 1:2', 223 | {IN_PIPE=>$in1}, {OUT=>$out1_dotprod_hdr}], 224 | ['dp3', '-W --header-in --header-out dotprod x:y', 225 | {IN_PIPE=>$in6}, {OUT=>$out6_dotprod_hdr}], 226 | 227 | # Test operations on edge-cases of input (one item, no items, 228 | # different number of items) 229 | ['c4', 'scov 1:2', {IN_PIPE=>$in3}, {OUT=>"$nan\n"}], 230 | ['p4', 'spearson 1:2', {IN_PIPE=>$in3}, {OUT=>"$nan\n"}], 231 | 232 | ['c5', '--narm scov 1:2', {IN_PIPE=>$in4}, {OUT=>"$nan\n"}], 233 | ['p5', '--narm spearson 1:2', {IN_PIPE=>$in4}, {OUT=>"$nan\n"}], 234 | ['dp5', '--narm dotprod 1:2', {IN_PIPE=>$in4}, {OUT=>"$nan\n"}], 235 | 236 | ['c6', '--narm scov 1:2', {IN_PIPE=>$in5}, {EXIT=>1}, 237 | {ERR=>"$prog: input error for operation 'scov': " . 238 | "fields 1,2 have different number of items\n"}], 239 | ['p6', '--narm spearson 1:2', {IN_PIPE=>$in5}, {EXIT=>1}, 240 | {ERR=>"$prog: input error for operation 'spearson': " . 241 | "fields 1,2 have different number of items\n"}], 242 | ['dp6', '--narm dotprod 1:2', {IN_PIPE=>$in5}, {EXIT=>1}, 243 | {ERR=>"$prog: input error for operation 'dotprod': " .
244 | "fields 1,2 have different number of items\n"}], 245 | ); 246 | 247 | my $save_temps = $ENV{SAVE_TEMPS}; 248 | my $verbose = $ENV{VERBOSE}; 249 | 250 | ## 251 | ## For each test, trim the resulting value to maximum three digits 252 | ## after the decimal point. 253 | ## 254 | for my $t (@Tests) { 255 | push @{$t}, {OUT_SUBST=>'s/^(-?\d+\.\d{1,3})\d*/\1/'}; 256 | } 257 | 258 | 259 | my $fail = run_tests ($program_name, $prog, \@Tests, $save_temps, $verbose); 260 | exit $fail; 261 | --------------------------------------------------------------------------------