├── .codecov.yml ├── .gitattributes ├── .github └── workflows │ └── build-test.yml ├── .gitignore ├── .travis.yml ├── LICENSE.txt ├── NOTICES.txt ├── README.md ├── _config.yml ├── bash_completion └── tsv-utils ├── buildtools ├── ReleasePackageReadme.txt ├── aggregate-codecov.d ├── codecov-to-relative-paths.d ├── diff-test-result-dirs.d ├── dircat.d └── makefile ├── common ├── README.md ├── dub.json ├── makefile └── src │ └── tsv_utils │ └── common │ ├── fieldlist.d │ ├── getopt_inorder.d │ ├── numerics.d │ ├── package.d │ ├── tsvutils_version.d │ ├── unittest_utils.d │ └── utils.d ├── csv2tsv ├── README.md ├── dub.json ├── makefile ├── profile_data │ ├── collect_profile_data.sh │ ├── profile_data_1a.csv │ ├── profile_data_1b.csv │ ├── profile_data_3a.csv │ ├── profile_data_3b.csv │ └── profile_data_5.csv ├── src │ └── tsv_utils │ │ └── csv2tsv.d ├── src_v1 │ └── tsv_utils │ │ └── csv2tsv.d └── tests │ ├── gold │ ├── basic_tests_1.txt │ └── error_tests_1.txt │ ├── header1.csv │ ├── header2.csv │ ├── header3.csv │ ├── header4.csv │ ├── header5.csv │ ├── input1_format1.csv │ ├── input1_format2.csv │ ├── input1_format3.csv │ ├── input2.csv │ ├── input3.csv │ ├── input_bom.csv │ ├── input_unicode.csv │ ├── invalid1.csv │ ├── invalid2.csv │ └── tests.sh ├── docs ├── AboutTheCode.md ├── BuildCommands.md ├── BuildingWithLTO.md ├── OtherToolkits.md ├── Performance.md ├── TipsAndTricks.md ├── ToolReference.md ├── ToolReference_v1.6.md ├── comparative-benchmarks-2017.md ├── comparative-benchmarks-2018.md ├── comparing-tsv-and-csv.md ├── dconf2018.pdf ├── dlang-meetup-14dec2017.pdf ├── images │ ├── column-selection-narrow_linux_2018.jpg │ ├── column-selection-narrow_macos_2018.jpg │ ├── column-selection_linux_2018.jpg │ ├── column-selection_macos_2018.jpg │ ├── csv2tsv_linux_2018.jpg │ ├── csv2tsv_macos_2018.jpg │ ├── join-two-files_linux_2018.jpg │ ├── join-two-files_macos_2018.jpg │ ├── numeric-row-filter_linux_2018.jpg │ ├── numeric-row-filter_macos_2018.jpg │ ├── 
regex-row-filter_linux_2018.jpg │ ├── regex-row-filter_macos_2018.jpg │ ├── summary-statistics_linux_2018.jpg │ └── summary-statistics_macos_2018.jpg ├── lto-pgo-study.md └── tool_reference │ ├── common-options-and-behavior.md │ ├── csv2tsv.md │ ├── keep-header.md │ ├── number-lines.md │ ├── tsv-append.md │ ├── tsv-filter.md │ ├── tsv-join.md │ ├── tsv-pretty.md │ ├── tsv-sample.md │ ├── tsv-select.md │ ├── tsv-split.md │ ├── tsv-summarize.md │ └── tsv-uniq.md ├── dub.json ├── dub_build.d ├── extras └── scripts │ ├── tsv-sort │ └── tsv-sort-fast ├── keep-header ├── README.md ├── dub.json ├── makefile ├── src │ └── tsv_utils │ │ └── keep-header.d └── tests │ ├── emptyfile.txt │ ├── gold │ ├── basic_tests_1.txt │ └── error_tests_1.txt │ ├── input1.csv │ ├── input2.csv │ ├── input_headeronly.csv │ ├── oneblankline.txt │ └── tests.sh ├── makeapp.mk ├── makedefs.mk ├── makefile ├── number-lines ├── README.md ├── dub.json ├── makefile ├── src │ └── tsv_utils │ │ └── number-lines.d └── tests │ ├── empty-file.txt │ ├── gold │ ├── basic_tests_1.txt │ └── error_tests_1.txt │ ├── input1.txt │ ├── input2.txt │ ├── one-line-file.txt │ └── tests.sh ├── tsv-append ├── README.md ├── dub.json ├── makefile ├── src │ └── tsv_utils │ │ └── tsv-append.d └── tests │ ├── empty-file.txt │ ├── gold │ ├── basic_tests_1.txt │ └── error_tests_1.txt │ ├── input1x3.tsv │ ├── input1x4.tsv │ ├── input3x2.tsv │ ├── input3x5.tsv │ └── tests.sh ├── tsv-filter ├── README.md ├── dub.json ├── makefile ├── profile_data │ ├── collect_profile_data.sh │ ├── profile_data_1.tsv │ ├── profile_data_2.tsv │ ├── profile_data_3.tsv │ ├── profile_data_4.tsv │ └── profile_data_5.tsv ├── src │ └── tsv_utils │ │ └── tsv-filter.d └── tests │ ├── gold │ ├── basic_tests_1.txt │ ├── error_tests_1.2081.txt │ └── error_tests_1.txt │ ├── input1.dos_tsv │ ├── input1.tsv │ ├── input1_noheader.tsv │ ├── input2.tsv │ ├── input2_pipe-sep.tsv │ ├── input4.tsv │ ├── input_3x0.tsv │ ├── input_3x1.tsv │ ├── input_3x2.tsv │ ├── 
input_3x3.tsv │ ├── input_emptyfile.tsv │ ├── input_num_or_empty.tsv │ ├── input_numeric_tests.tsv │ ├── input_onefield.txt │ ├── input_unicode.tsv │ ├── test-config.json │ └── tests.sh ├── tsv-join ├── README.md ├── dub.json ├── makefile ├── src │ └── tsv_utils │ │ └── tsv-join.d └── tests │ ├── gold │ ├── basic_tests_1.txt │ └── error_tests_1.txt │ ├── input1.dos_tsv │ ├── input1.tsv │ ├── input1_noheader.tsv │ ├── input1_rotated.tsv │ ├── input2.dos_tsv │ ├── input2.tsv │ ├── input2_noheader.tsv │ ├── input_1x5.tsv │ ├── input_2x3_colon.tsv │ ├── input_5x4_colon.tsv │ ├── input_emptyfile.tsv │ └── tests.sh ├── tsv-pretty ├── README.md ├── dub.json ├── makefile ├── src │ ├── code-notes.md │ └── tsv_utils │ │ └── tsv-pretty.d └── tests │ ├── emptyfile.tsv │ ├── gold │ ├── basic_tests_1.txt │ ├── basic_tests_2.txt │ ├── basic_tests_3.txt │ ├── basic_tests_4.txt │ ├── basic_tests_5.txt │ ├── basic_tests_6.txt │ ├── basic_tests_7.txt │ ├── basic_tests_8.txt │ └── error_tests_1.txt │ ├── input_5x1.tsv │ ├── input_5x1_alltext.tsv │ ├── input_5x1_noheader.tsv │ ├── input_5x1_noheader_preamble1.tsv │ ├── input_5x1_noheader_preamble2.tsv │ ├── input_5x1_preamble1.tsv │ ├── input_5x1_preamble2.tsv │ ├── input_5x2.tsv │ ├── input_5x2_noheader.tsv │ ├── input_5x2_noheader_preamble1.tsv │ ├── input_5x2_noheader_preamble2.tsv │ ├── input_5x2_preamble1.tsv │ ├── input_5x2_preamble2.tsv │ ├── input_5x3.tsv │ ├── input_5x3_preamble1.tsv │ ├── input_5x3_preamble2.tsv │ ├── input_5x4_noheader.tsv │ ├── input_5x5.tsv │ ├── input_comma_delim.tsv │ ├── input_mixed_1.tsv │ ├── input_mixed_2.tsv │ ├── input_numbers_1.tsv │ ├── input_numbers_2.tsv │ ├── input_numbers_3.tsv │ ├── input_numbers_4.tsv │ ├── input_numbers_noheader_1.tsv │ ├── input_numbers_noheader_2.tsv │ ├── input_numbers_noheader_3.tsv │ ├── input_numbers_noheader_4.tsv │ ├── input_sample_preamble.tsv │ ├── input_text_1.tsv │ ├── input_unicode.tsv │ ├── invalid_unicode.tsv │ └── tests.sh ├── tsv-sample ├── README.md ├── 
dub.json ├── makefile ├── profile_data │ ├── collect_profile_data.sh │ ├── profile_data_1.tsv │ ├── profile_data_2.tsv │ └── profile_data_3.tsv ├── src │ └── tsv_utils │ │ └── tsv-sample.d └── tests │ ├── gold │ ├── basic_tests_1.txt │ ├── error_tests_1.txt │ ├── error_tests_2.2081.txt │ └── error_tests_2.txt │ ├── input2x10_noheader.tsv │ ├── input2x1_noheader.tsv │ ├── input2x5_noheader.dos_tsv │ ├── input2x5_noheader.tsv │ ├── input2x7_atsign.tsv │ ├── input3x0.tsv │ ├── input3x10.tsv │ ├── input3x25.dos_tsv │ ├── input3x25.tsv │ ├── input3x25_negative_wt.tsv │ ├── input3x3.tsv │ ├── input3x4.tsv │ ├── input4x15.tsv │ ├── input4x50.tsv │ ├── test-config.json │ └── tests.sh ├── tsv-select ├── README.md ├── dub.json ├── makefile ├── profile_data │ ├── collect_profile_data.sh │ ├── profile_data_1.tsv │ ├── profile_data_2.tsv │ └── profile_data_3.tsv ├── src │ └── tsv_utils │ │ └── tsv-select.d ├── src_no-template │ └── tsv-select_no-template-version.d.txt └── tests │ ├── gold │ ├── basic_tests_1.txt │ └── error_tests_1.txt │ ├── input1.dos_tsv │ ├── input1.tsv │ ├── input_1field.tsv │ ├── input_2fields.tsv │ ├── input_2plus_hat_delim.tsv │ ├── input_3plus_fields.tsv │ ├── input_3x0.tsv │ ├── input_3x1.tsv │ ├── input_3x2.tsv │ ├── input_3x3.tsv │ ├── input_8xlong.tsv │ ├── input_emptyfile.tsv │ ├── input_header1.tsv │ ├── input_header2.tsv │ ├── input_header3.tsv │ ├── input_header4.tsv │ ├── input_header_variants.tsv │ └── tests.sh ├── tsv-split ├── README.md ├── dub.json ├── makefile ├── profile_data │ ├── collect_profile_data.sh │ ├── profile_data_1.tsv │ ├── profile_data_2.tsv │ └── profile_data_3.tsv ├── src │ └── tsv_utils │ │ └── tsv-split.d └── tests │ ├── empty-file.txt │ ├── gold │ ├── error_tests_1.txt │ ├── help_and_version_tests.txt │ ├── key_assignment_tests.txt │ ├── lines_per_file_tests.txt │ └── random_assignment_tests.txt │ ├── input1x3.txt │ ├── input1x5.txt │ ├── input4x18.tsv │ ├── input4x58.tsv │ ├── input4x58_colon-delim.tsv │ └── tests.sh 
├── tsv-summarize ├── README.md ├── dub.json ├── makefile ├── profile_data │ ├── collect_profile_data.sh │ ├── profile_data_1.tsv │ ├── profile_data_2.tsv │ └── profile_data_3.tsv ├── src │ └── tsv_utils │ │ └── tsv-summarize.d └── tests │ ├── empty_file.tsv │ ├── gold │ ├── basic_tests_1.txt │ ├── error_tests_1.2081.txt │ └── error_tests_1.txt │ ├── input_1field_a.dos_tsv │ ├── input_1field_a.tsv │ ├── input_1field_b.tsv │ ├── input_2field_a.tsv │ ├── input_2field_b.tsv │ ├── input_5field_a.tsv │ ├── input_5field_b.tsv │ ├── input_5field_c.tsv │ ├── input_5field_d.tsv │ ├── input_5field_header_only.tsv │ ├── test-config.json │ └── tests.sh └── tsv-uniq ├── README.md ├── dub.json ├── makefile ├── profile_data ├── collect_profile_data.sh ├── profile_data_1.tsv ├── profile_data_2.tsv └── profile_data_3.tsv ├── src └── tsv_utils │ └── tsv-uniq.d └── tests ├── empty-file.txt ├── gold ├── basic_tests_1.txt └── error_tests_1.txt ├── input1.tsv ├── input1_noheader.tsv ├── input2.tsv ├── input3.tsv ├── input_delim_underscore.tsv └── tests.sh /.codecov.yml: -------------------------------------------------------------------------------- 1 | comment: 2 | layout: "diff, files" 3 | require_changes: yes 4 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Default behavior in this repository is Unix LF 2 | * text eol=lf 3 | 4 | # CSV files and .dos_tsv in this repository are used for testing and multiple types of line endings 5 | *.csv binary 6 | *.dos_tsv eol=crlf 7 | 8 | # Binary types should remain binary 9 | *.pdf binary 10 | *.jpg binary 11 | -------------------------------------------------------------------------------- /.github/workflows/build-test.yml: -------------------------------------------------------------------------------- 1 | # Workflow to build and test tsv-utils on Linux, MacOS, and Windows. 
2 | name: build-test 3 | 4 | on: 5 | push: 6 | branches: [ master ] 7 | pull_request: 8 | branches: [ master ] 9 | schedule: 10 | - cron: '36 10 * * 2' 11 | 12 | jobs: 13 | code-cov-build: 14 | name: Code Coverage build. Linux/DMD 15 | strategy: 16 | matrix: 17 | os: [ubuntu-latest] 18 | dc: [dmd-latest] 19 | 20 | runs-on: ${{ matrix.os }} 21 | 22 | steps: 23 | - uses: actions/checkout@v2 24 | 25 | - name: Install D compiler 26 | uses: dlang-community/setup-dlang@v1 27 | with: 28 | compiler: ${{ matrix.dc }} 29 | 30 | - name: make test-codecov 31 | shell: bash 32 | run: | 33 | make test-codecov DCOMPILER=${DC} 34 | 35 | - uses: codecov/codecov-action@v1 36 | 37 | windows-build: 38 | name: Build/test tsv-utils on Windows 39 | strategy: 40 | fail-fast: false 41 | matrix: 42 | os: [windows-latest] 43 | dc: [dmd-latest, ldc-latest] 44 | 45 | runs-on: ${{ matrix.os }} 46 | 47 | steps: 48 | - uses: actions/checkout@v2 49 | 50 | - name: Install D compiler 51 | uses: dlang-community/setup-dlang@v1 52 | with: 53 | compiler: ${{ matrix.dc }} 54 | 55 | - name: Dub Build and Run 56 | shell: bash 57 | run: | 58 | dub run 59 | dub clean 60 | make clean 61 | 62 | - name: make unittest 63 | # Currently, only run unit tests on DMD. LDC has failures in tsv-sample due 64 | # to minor discrepancies in printed random numbers. 
65 | if: ${{ startsWith(matrix.dc, 'dmd') }} 66 | shell: bash 67 | run: | 68 | make unittest DCOMPILER=${DC} DFLAGS=-m64 69 | 70 | linux-macos-build: 71 | name: Build/test tsv-utils on Linux, macOS 72 | strategy: 73 | fail-fast: false 74 | matrix: 75 | os: [macOS-latest, ubuntu-latest] 76 | dc: [ldc-latest, dmd-latest, ldc-beta] 77 | include: 78 | - os: ubuntu-latest 79 | dc: dmd-2.088.1 80 | 81 | runs-on: ${{ matrix.os }} 82 | 83 | steps: 84 | - uses: actions/checkout@v2 85 | 86 | - name: Install D compiler 87 | uses: dlang-community/setup-dlang@v1 88 | with: 89 | compiler: ${{ matrix.dc }} 90 | 91 | - name: make test 92 | shell: bash 93 | run: | 94 | make test DCOMPILER=${DC} 95 | make clean 96 | 97 | macos-release-build: 98 | name: Release build/test tsv-utils on macOS 99 | strategy: 100 | fail-fast: false 101 | matrix: 102 | os: [macOS-latest] 103 | dc: [ldc-latest] 104 | 105 | runs-on: ${{ matrix.os }} 106 | 107 | steps: 108 | - uses: actions/checkout@v2 109 | 110 | - name: Install D compiler 111 | uses: dlang-community/setup-dlang@v1 112 | with: 113 | compiler: ${{ matrix.dc }} 114 | 115 | - name: make test-release ldc-lto-pgo 116 | if: ${{ startsWith(matrix.dc, 'ldc') }} 117 | shell: bash 118 | run: | 119 | make test-release DCOMPILER=${DC} LDC_LTO_RUNTIME=1 LDC_PGO=2 DFLAGS='--lowmem' 120 | make clean 121 | 122 | linux-release-build: 123 | name: Release build/test tsv-utils on Linux 124 | strategy: 125 | fail-fast: false 126 | matrix: 127 | os: [ubuntu-latest] 128 | dc: [ldc-latest, ldc-beta] 129 | 130 | runs-on: ${{ matrix.os }} 131 | 132 | steps: 133 | - uses: actions/checkout@v2 134 | 135 | - name: Install D compiler 136 | uses: dlang-community/setup-dlang@v1 137 | with: 138 | compiler: ${{ matrix.dc }} 139 | 140 | - name: make test-release ldc-lto-pgo 141 | if: ${{ startsWith(matrix.dc, 'ldc') }} 142 | shell: bash 143 | run: | 144 | make test-release DCOMPILER=${DC} LDC_LTO_RUNTIME=1 LDC_PGO=2 DFLAGS='-static --lowmem' 145 | make clean 146 | 147 | 
dub-build: 148 | name: Dub build/test tsv-utils 149 | strategy: 150 | fail-fast: false 151 | matrix: 152 | os: [ubuntu-latest] 153 | dc: [ldc-latest, dmd-latest] 154 | 155 | runs-on: ${{ matrix.os }} 156 | 157 | steps: 158 | - uses: actions/checkout@v2 159 | 160 | - name: Install D compiler 161 | uses: dlang-community/setup-dlang@v1 162 | with: 163 | compiler: ${{ matrix.dc }} 164 | 165 | - name: Dub build/test 166 | shell: bash 167 | run: | 168 | dub run 169 | make test-nobuild DCOMPILER=$DC 170 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files 2 | *.o 3 | 4 | # Compiled Static libraries 5 | *.a 6 | 7 | # DUB build artifacts 8 | .dub/ 9 | dub.selections.json 10 | 11 | # Code coverage files 12 | *.lst 13 | 14 | # Temporary test files 15 | latest_debug/ 16 | latest_release/ 17 | 18 | # Build artifacts 19 | bin/ 20 | obj/ 21 | dub_build 22 | buildtools/aggregate-codecov 23 | buildtools/codecov-to-relative-paths 24 | tsv-utils-*.tar.gz 25 | 26 | # LDC build artifacts 27 | ldc-build-runtime.tmp/ 28 | ldc-build-runtime.thin/ 29 | ldc-build-runtime.full/ 30 | 31 | # Profile data intermediate files 32 | profile.*.raw 33 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Boost Software License - Version 1.0 - August 17th, 2003 2 | 3 | Permission is hereby granted, free of charge, to any person or organization 4 | obtaining a copy of the software and accompanying documentation covered by 5 | this license (the "Software") to use, reproduce, display, distribute, 6 | execute, and transmit the Software, and to prepare derivative works of the 7 | Software, and to permit third-parties to whom the Software is furnished to 8 | do so, all subject to the following: 9 | 10 | The copyright notices in the 
Software and this entire statement, including 11 | the above license grant, this restriction and the following disclaimer, 12 | must be included in all copies of the Software, in whole or in part, and 13 | all derivative works of the Software, unless such copies or derivative 14 | works are solely in the form of machine-executable object code generated by 15 | a source language processor. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 20 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 21 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 22 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 | DEALINGS IN THE SOFTWARE. 24 | -------------------------------------------------------------------------------- /NOTICES.txt: -------------------------------------------------------------------------------- 1 | This file contains 3rd party license notifications. 2 | 3 | * D Programming Language (http://dlang.org/). 4 | 5 | Unit test included in common/src/getopt_inorder.d were adapted from unit 6 | tests in the source code for the D standard library std.getopt module. 7 | The std.getopt module is licensed under Boost Licence 1.0 8 | (http://boost.org/LICENSE_1_0.txt). Copyright and license text are: 9 | 10 | Copyright Andrei Alexandrescu 2008 - 2015. 
11 | 12 | Boost Software License - Version 1.0 - August 17th, 2003 13 | 14 | Permission is hereby granted, free of charge, to any person or organization 15 | obtaining a copy of the software and accompanying documentation covered by 16 | this license (the "Software") to use, reproduce, display, distribute, 17 | execute, and transmit the Software, and to prepare derivative works of the 18 | Software, and to permit third-parties to whom the Software is furnished to 19 | do so, all subject to the following: 20 | 21 | The copyright notices in the Software and this entire statement, including 22 | the above license grant, this restriction and the following disclaimer, 23 | must be included in all copies of the Software, in whole or in part, and 24 | all derivative works of the Software, unless such copies or derivative 25 | works are solely in the form of machine-executable object code generated by 26 | a source language processor. 27 | 28 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 29 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 30 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 31 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 32 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 33 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 34 | DEALINGS IN THE SOFTWARE. 35 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-midnight 2 | title: "eBay's TSV Utilities" 3 | description: "Command line tools for large tabular data files." 
4 | encoding: utf-8 5 | markdown: kramdown 6 | kramdown: 7 | smart_quotes: apos,apos,quot,quot 8 | 9 | -------------------------------------------------------------------------------- /buildtools/ReleasePackageReadme.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015-2021, eBay Inc. 2 | Initially written by Jon Degenhardt 3 | 4 | This package contains binary executables from the eBay/tsv-utils 5 | open-source project (https://github.com/eBay/tsv-utils). 6 | 7 | The executables are in the 'bin' directory. Install by adding the 'bin' 8 | directory or individual tools to the PATH environment variable. Run tools 9 | with the '--help' option for help, or see the documentation in the Github 10 | repository (https://github.com/eBay/tsv-utils/blob/master/README.md). 11 | 12 | The 'bash_completion' directory contains support for enabling command 13 | option completion for the individual tools. For setup instructions, see 14 | https://github.com/eBay/tsv-utils/blob/master/docs/TipsAndTricks.md#enable-bash-completion. 15 | 16 | The 'extras/scripts' directory contains sample implementations of scripts 17 | described on the Tips and Tricks page on the Github repository: 18 | https://github.com/eBay/tsv-utils/blob/master/docs/TipsAndTricks.md. 19 | -------------------------------------------------------------------------------- /buildtools/codecov-to-relative-paths.d: -------------------------------------------------------------------------------- 1 | /** 2 | This tool converts D code coverage files from absolute to relative paths. 3 | 4 | D code coverage files are generated based on absolute path names if absolute paths are 5 | used in the build command. This is reflected in the file's actual name, which reflects all 6 | the path components. The absolute path is also listed at the end of the code coverage 7 | report. 8 | 9 | This tool checks a coverage file to see if absolute names were used. 
If so, it renames 10 | the file and updates the report to use a relative path. 11 | 12 | Copyright (c) 2017-2021, eBay Inc. 13 | Initially written by Jon Degenhardt 14 | 15 | License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt) 16 | 17 | **/ 18 | module buildtools.codecov_to_relative_paths; 19 | 20 | import std.algorithm : findSplit; 21 | import std.array : appender; 22 | import std.conv : to; 23 | import std.file : exists, isDir, isFile, remove, rename; 24 | import std.path : absolutePath, baseName, buildPath, buildNormalizedPath, dirName, extension, 25 | isAbsolute, stripExtension; 26 | import std.range : empty; 27 | import std.stdio; 28 | import std.string : tr; 29 | 30 | /** Convert a D code coverage file to use relative paths. 31 | * 32 | * Files provided on the command line are checked to see if the name represents an 33 | * absolute path. If so, the file is renamed to reflect the relative name and the 34 | * last line of the coverage report is changed to reflect this as well. Returns 0 on success; 1 if no files are given or an argument is not an existing file. 35 | */ 36 | int main(string[] cmdArgs) 37 | { 38 | auto programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; 39 | 40 | if (cmdArgs.length < 2) 41 | { 42 | writefln("Synopsis: %s coverage-file [coverage-file...]", programName); 43 | return 1; 44 | } 45 | 46 | auto coverageFiles = cmdArgs[1..$]; 47 | /* Validate every argument before any file is modified. */ 48 | foreach (cf; coverageFiles) 49 | { 50 | if (!cf.exists || !cf.isFile) 51 | { 52 | writefln("%s is not a file", cf); 53 | return 1; 54 | } 55 | } 56 | /* Rewrite each coverage file whose last line holds a matching absolute path. */ 57 | foreach (cf; coverageFiles) 58 | { 59 | auto rootDir = cf.absolutePath.buildNormalizedPath.dirName; 60 | auto fileName = cf.baseName; 61 | auto fileNameNoExt = fileName.stripExtension; 62 | auto lines = appender!(string[])(); 63 | foreach (l; cf.File.byLine) lines ~= l.to!string; /* to!string copies each line; byLine may reuse its buffer */ 64 | if (lines.data.length > 0) 65 | { 66 | /* Check that the last line matches our file name. 
The path is the text before the first space; path separators in it are compared as '-'. */ 67 | auto lastLine = lines.data[$ - 1]; 68 | auto lastLineSplit = lastLine.findSplit(" "); 69 | auto lastLinePath = lastLineSplit[1].empty ? "" : lastLineSplit[0]; 70 | auto lastLinePathNoExt = lastLinePath.stripExtension; 71 | if (lastLinePath.isAbsolute && 72 | lastLinePathNoExt.tr("\\/", "--") == fileNameNoExt && 73 | rootDir.length + 1 <= lastLine.length && 74 | rootDir.length + 1 <= fileName.length) 75 | { /* Drop the first rootDir.length + 1 characters from both; presumably the root directory plus a separator, which is not verified here. */ 76 | auto updatedLastLine = lastLine[rootDir.length + 1 .. $]; 77 | auto newFileName = fileName[rootDir.length + 1 .. $]; 78 | if (newFileName != fileName) 79 | { 80 | auto ofile = newFileName.File("w"); 81 | foreach (l; lines.data[0 .. $ - 1]) ofile.writeln(l); 82 | ofile.writeln(updatedLastLine); 83 | fileName.remove; /* NOTE(review): the new file is created and the old one removed by base name, i.e. relative to the current directory; presumably the tool is run from the directory holding the coverage files - confirm. */ 84 | } 85 | } 86 | } 87 | } 88 | 89 | return 0; 90 | } 91 | -------------------------------------------------------------------------------- /buildtools/dircat.d: -------------------------------------------------------------------------------- 1 | /** 2 | This tool concatenates all the files in a directory, with a line at the start of each 3 | new file giving the name of the file. This is used for testing tools generating 4 | multiple output files. It is similar to 'tail -n +1 dir/*'. The main difference is 5 | that it assembles files in the same order on all platforms, a characteristic 6 | necessary for testing. 7 | 8 | Copyright (c) 2020-2021, eBay Inc. 9 | Initially written by Jon Degenhardt 10 | 11 | License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt) 12 | 13 | */ 14 | module buildtools.dircat; 15 | 16 | import std.range; 17 | import std.stdio; 18 | import std.typecons : tuple; 19 | 20 | version(unittest) 21 | { 22 | // When running unit tests, use main from -main compiler switch. 23 | } 24 | else 25 | { 26 | int main(string[] cmdArgs) 27 | { 28 | /* When running in DMD code coverage mode, turn on report merging. 
*/ 29 | version(D_Coverage) version(DigitalMars) 30 | { 31 | import core.runtime : dmd_coverSetMerge; 32 | dmd_coverSetMerge(true); 33 | } 34 | /* Process the command line; on failure or a help request, exit with the code processArgs supplies. */ 35 | DirCatOptions cmdopt; 36 | auto r = cmdopt.processArgs(cmdArgs); 37 | if (!r[0]) return r[1]; 38 | 39 | try concatenateDirectoryFiles(cmdopt); 40 | catch (Exception e) 41 | { 42 | stderr.writefln("Error [%s]: %s", cmdopt.programName, e.msg); 43 | return 1; 44 | } 45 | return 0; 46 | } 47 | } 48 | /** Help text passed to defaultGetoptPrinter when help is requested. */ 49 | auto helpText = q"EOS 50 | Synopsis: dircat [options] 51 | 52 | This tool concatenates all files in a directory, writing the contents to 53 | standard output. The contents of each file is preceded with a line 54 | containing the path of the file. 55 | 56 | The current features are very simple. The directory must contain only 57 | regular files. It is an error if the directory contains subdirectories 58 | or symbolic links. 59 | 60 | Exit status is '0' on success, '1' if an error occurred. 61 | 62 | Options: 63 | EOS"; 64 | /** Command line options for dircat, plus the argument processing that fills them in. */ 65 | struct DirCatOptions 66 | { 67 | string programName; /* Base name of cmdArgs[0] without its extension; set by processArgs */ 68 | string dir; // Required argument 69 | 70 | /* Returns a tuple. First value is true if command line arguments were successfully 71 | * processed and execution should continue, or false if an error occurred or the user 72 | * asked for help. If false, the second value is the appropriate exit code (0 or 1). 73 | * 74 | * Returning true (execution continues) means args have been validated and derived 75 | * values calculated. 76 | */ 77 | auto processArgs (ref string[] cmdArgs) 78 | { 79 | import std.getopt; 80 | import std.path : baseName, stripExtension; 81 | 82 | programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; 83 | 84 | try 85 | { 86 | auto r = getopt(cmdArgs); /* No tool-specific options are declared; only the help request is examined below. */ 87 | 88 | if (r.helpWanted) 89 | { 90 | defaultGetoptPrinter(helpText, r.options); 91 | return tuple(false, 0); 92 | } 93 | 94 | /* Get the directory path. Should be the one command line arg remaining. 
*/ 95 | if (cmdArgs.length == 2) dir = cmdArgs[1]; 96 | else if (cmdArgs.length < 2) throw new Exception("A directory is required."); 97 | else throw new Exception("Unexpected arguments."); 98 | } 99 | catch (Exception exc) 100 | { 101 | stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg); 102 | return tuple(false, 1); 103 | } 104 | return tuple(true, 0); 105 | } 106 | } 107 | /** Writes every file in cmdopt.dir to standard output in sorted path order, each preceded by a '==> path <==' line. Throws (via enforce) if the directory does not exist, is not a directory, or contains a subdirectory, symbolic link, or other non-file member. */ 108 | void concatenateDirectoryFiles(DirCatOptions cmdopt) 109 | { 110 | import std.algorithm : copy, sort; 111 | import std.conv : to; 112 | import std.exception : enforce; 113 | import std.file : dirEntries, DirEntry, exists, isDir, SpanMode; 114 | import std.format : format; 115 | import std.path; 116 | 117 | string[] filepaths; 118 | 119 | enforce(cmdopt.dir.exists, format("Directory '%s' does not exist.", cmdopt.dir)); 120 | enforce(cmdopt.dir.isDir, format("File path '%s' is not a directory.", cmdopt.dir)); 121 | 122 | foreach (DirEntry de; dirEntries(cmdopt.dir, SpanMode.shallow)) 123 | { 124 | enforce(!de.isDir, format("Directory member '%s' is a directory.", de.name)); 125 | enforce(!de.isSymlink, format("Directory member '%s' is a symbolic link.", de.name)); 126 | enforce(de.isFile, format("Directory member '%s' is not a file.", de.name)); 127 | 128 | filepaths ~= de.name; 129 | } 130 | filepaths.sort; /* Sorted so files are emitted in the same order on all platforms. */ 131 | foreach (filenum, path; filepaths) 132 | { 133 | if (filenum > 0) writeln; /* Blank line between consecutive files. */ 134 | writefln("==> %s <==", path); 135 | path.File.byChunk(1024L * 128L).copy(stdout.lockingTextWriter); /* Copy the file contents in 128 KB chunks. */ 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /buildtools/makefile: -------------------------------------------------------------------------------- 1 | DCOMPILER = dmd 2 | DFLAGS = 3 | 4 | all: aggregate-codecov codecov-to-relative-paths diff-test-result-dirs dircat 5 | 6 | aggregate-codecov: aggregate-codecov.d 7 | $(DCOMPILER) -release -O aggregate-codecov.d 8 | 9 | codecov-to-relative-paths: 
codecov-to-relative-paths.d 10 | $(DCOMPILER) -release -O codecov-to-relative-paths.d 11 | 12 | diff-test-result-dirs: diff-test-result-dirs.d 13 | $(DCOMPILER) -release -O diff-test-result-dirs.d 14 | 15 | dircat: dircat.d 16 | $(DCOMPILER) -release -O dircat.d 17 | 18 | clean: 19 | -rm aggregate-codecov 20 | -rm codecov-to-relative-paths 21 | -rm diff-test-result-dirs 22 | -rm dircat 23 | -rm *.o 24 | -------------------------------------------------------------------------------- /common/README.md: -------------------------------------------------------------------------------- 1 | _Visit the eBay TSV utilities [main page](../README.md)_ 2 | 3 | # Utility functions 4 | 5 | This directory contains utility functions shared by multiple TSV utility tools. A few that may be of more general interest: 6 | * **InputFieldReordering** - A class that creates a reordered subset of fields from an input line. Used to operate on a subset of fields in the order specified on the command line. *File: utils.d*. 7 | * **BufferedOutputRange** - An OutputRange with an internal buffer used to buffer output. 8 | Intended for use with stdout, it is a significant performance benefit. *File: utils.d*. 9 | * **bufferedByLine** - An input range that reads from a File handle line by line. It is similar to the standard library method `std.stdio.File.byLine`, but quite a bit faster. This is achieved by reading in larger blocks and buffering. *File: utils.d*. 10 | * **InputSourceRange** - An input range that provides open file access to a set of files. It is used to iterate over files passed as command line arguments. *File: utils.d*. 11 | * **quantile** - Calculates a cumulative probability for values in a data set. Supports the same interpolation methods as the quantile function in R and many other statistical packages. *File: numerics.d*. 12 | * **formatNumber** - An alternate print format for numbers, especially useful when doubles are being used to represent integer and float values. *File: numerics.d*. 
13 | * **getoptInorder** - A cover for `std.getopt` that processes command line arguments in the order given on the command line. *File: getopt_inorder.d*. 14 | * **parseFieldList** - Implements the parsing for numeric and named fields entered in the command line. *File: fieldlist.d*. 15 | 16 | Code level documentation is available at: [tsv-utils.dpldocs.info/tsv_utils.common](https://tsv-utils.dpldocs.info/tsv_utils.common.html). 17 | -------------------------------------------------------------------------------- /common/dub.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "common", 3 | "description": "Routines used by applications in eBay's TSV Utilities project.", 4 | "homepage": "https://github.com/eBay/tsv-utils", 5 | "authors": ["Jon Degenhardt"], 6 | "copyright": "Copyright (c) 2015-2021, eBay Inc.", 7 | "license": "BSL-1.0", 8 | "targetType": "sourceLibrary" 9 | } 10 | -------------------------------------------------------------------------------- /common/makefile: -------------------------------------------------------------------------------- 1 | include ../makedefs.mk 2 | 3 | srcdir = $(CURDIR)/src/tsv_utils/common 4 | unittest_utils_srcs ?= $(srcdir)/unittest_utils.d 5 | imports ?= -I$(srcdir) 6 | 7 | release: ; 8 | debug: ; 9 | codecov: ; 10 | 11 | .PHONY: clean-bin-relics 12 | clean-bin-relics: ; 13 | 14 | .PHONY: clean-relics 15 | clean-relics: 16 | -rm -f ./*.lst 17 | -rm -f ./*.o 18 | 19 | .PHONY: clean 20 | clean: clean-relics 21 | 22 | test: unittest 23 | 24 | .PHONY: unittest 25 | unittest: 26 | @echo '---> Running $(notdir $(basename $(CURDIR))) unit tests' 27 | $(DCOMPILER) $(imports) $(unittest_utils_srcs) $(unittest_flags) $(srcdir)/utils.d 28 | $(DCOMPILER) $(imports) $(unittest_utils_srcs) $(unittest_flags) $(srcdir)/getopt_inorder.d 29 | $(DCOMPILER) $(imports) $(unittest_utils_srcs) $(unittest_flags) $(srcdir)/numerics.d 30 | $(DCOMPILER) $(imports) $(unittest_utils_srcs) 
$(unittest_flags) $(srcdir)/fieldlist.d 31 | $(DCOMPILER) $(imports) $(unittest_utils_srcs) $(unittest_flags) $(srcdir)/tsvutils_version.d 32 | @echo '---> Unit tests completed successfully.' 33 | 34 | test-debug: ; 35 | test-release: ; 36 | test-nobuild: ; 37 | 38 | .PHONY: test-codecov 39 | test-codecov: unittest-codecov 40 | 41 | .PHONY: unittest-codecov 42 | unittest-codecov: 43 | @echo '---> Running $(notdir $(basename $(CURDIR))) unit tests with code coverage.' 44 | -rm ./*.lst 45 | $(DCOMPILER) $(imports) $(unittest_utils_srcs) $(unittest_codecov_flags) $(srcdir)/utils.d 46 | $(DCOMPILER) $(imports) $(unittest_utils_srcs) $(unittest_codecov_flags) $(srcdir)/getopt_inorder.d 47 | $(DCOMPILER) $(imports) $(unittest_utils_srcs) $(unittest_codecov_flags) $(srcdir)/numerics.d 48 | $(DCOMPILER) $(imports) $(unittest_utils_srcs) $(unittest_codecov_flags) $(srcdir)/fieldlist.d 49 | $(DCOMPILER) $(imports) $(unittest_utils_srcs) $(unittest_codecov_flags) $(srcdir)/tsvutils_version.d 50 | -rm ./__main.lst 51 | @echo '---> Unit tests completed successfully (code coverage on).' 52 | 53 | apptest-codecov: ; 54 | -------------------------------------------------------------------------------- /common/src/tsv_utils/common/package.d: -------------------------------------------------------------------------------- 1 | /** 2 | Utility functions used by tsv-utils programs. 3 | 4 | A few of the utilities that may be of more general interest: 5 | 6 | $(LIST 7 | * [tsv_utils.common.utils.InputFieldReordering] - A class that creates a reordered 8 | subset of fields from an input line. Used to operate on a subset of fields in the 9 | order specified on the command line. 10 | * [tsv_utils.common.utils.BufferedOutputRange] - An OutputRange with an internal 11 | buffer used to buffer output. Intended for use with stdout, it is a significant 12 | performance benefit. 13 | * [tsv_utils.common.utils.bufferedByLine] - An input range that reads from a File 14 | handle line by line. 
It is similar to standard library method std.stdio.File.byLine, 15 | but quite a bit faster. This is achieved by reading in larger blocks and buffering. 16 | * [tsv_utils.common.numerics.quantile] - Calculates a cumulative probability for 17 | values in a data set. Supports the same interpolation methods as the quantile 18 | function in R and many other statistical packages. 19 | * [tsv_utils.common.numerics.rangeMedian] - Finds the median in a range. Implements 20 | via the faster of std.algorithm.topN or std.algorithm.sort depending on the 21 | Phobos version. 22 | * [tsv_utils.common.numerics.formatNumber] - An alternate print format for numbers, 23 | especially useful when doubles are being used to represent integer and float values. 24 | * [tsv_utils.common.getopt_inorder.getoptInorder] - A cover for std.getopt that 25 | processes command line arguments in the order given on the command line. 26 | * [tsv_utils.common.fieldlist] - Routines supporting entry of $(I field-lists) on the 27 | command line. 28 | ) 29 | 30 | Copyright (c) 2015-2021, eBay Inc. 31 | Initially written by Jon Degenhardt 32 | 33 | License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt) 34 | */ 35 | module tsv_utils.common; 36 | -------------------------------------------------------------------------------- /common/src/tsv_utils/common/tsvutils_version.d: -------------------------------------------------------------------------------- 1 | /** tsv-utils version file. 2 | */ 3 | 4 | module tsv_utils.common.tsvutils_version; 5 | 6 | enum string tsvutilsVersion = "v2.2.3"; 7 | 8 | string tsvutilsVersionNotice (string toolName) @safe pure nothrow 9 | { 10 | return toolName ~ " (eBay/tsv-utils) " ~ tsvutilsVersion ~ "\n" ~ q"EOS 11 | Copyright (c) 2015-2021, eBay Inc. 
12 | https://github.com/eBay/tsv-utils 13 | EOS"; 14 | } 15 | 16 | @safe unittest 17 | { 18 | string programName = "program.name"; 19 | assert(tsvutilsVersionNotice(programName).length > programName.length); 20 | } 21 | -------------------------------------------------------------------------------- /common/src/tsv_utils/common/unittest_utils.d: -------------------------------------------------------------------------------- 1 | /** 2 | Helper functions for tsv-utils unit tests. 3 | 4 | Copyright (c) 2017-2021, eBay Inc. 5 | Initially written by Jon Degenhardt 6 | 7 | License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt) 8 | */ 9 | 10 | module tsv_utils.common.unittest_utils; 11 | 12 | version(unittest) 13 | { 14 | /* Creates a temporary directory for writing unit test files. The path of the created 15 | * directory is returned. The 'toolDirName' argument will be included in the directory 16 | * name, and should consist of generic filename characters. e.g. "tsv_append". This 17 | * name will also be used in assert error messages. 18 | * 19 | * The caller should delete the temporary directory and all its contents when tests 20 | * are finished. This can be done using std.file.rmdirRecurse. For example: 21 | * 22 | * unittest 23 | * { 24 | * import std.file : rmdirRecurse; 25 | * auto testDir = makeUnittestTempDir("tsv_append"); 26 | * scope(exit) testDir.rmdirRecurse; 27 | * ... test code 28 | * } 29 | * 30 | * An assert is triggered if the directory cannot be created. There are two typical 31 | * reasons: 32 | * - Unable to find an available directory name. A number of unique names are tried 33 | * (currently 1000). If they are all taken, it will normally be because the directories 34 | * haven't been properly cleaned up from previous unit test runs. 35 | * - Directory creation failed. e.g. Permission denied. 36 | * 37 | * This routine is intended to be run in 'unittest' mode, so that an assert is triggered 38 | * on failure. 
However, if run with asserts disabled, the returned path will be empty in 39 | * event of a failure. 40 | */ 41 | string makeUnittestTempDir(string toolDirName) @safe 42 | { 43 | import std.conv : to; 44 | import std.file : exists, mkdir, tempDir; 45 | import std.format : format; 46 | import std.path : buildPath; 47 | import std.range; 48 | 49 | string dirNamePrefix = "ebay_tsv_utils__" ~ toolDirName ~ "_unittest_"; 50 | string systemTempDirPath = tempDir(); 51 | string newTempDirPath = ""; 52 | 53 | for (auto i = 0; i < 1000 && newTempDirPath.empty; i++) 54 | { 55 | string path = buildPath(systemTempDirPath, dirNamePrefix ~ i.to!string); 56 | if (!path.exists) newTempDirPath = path; 57 | } 58 | assert (!newTempDirPath.empty, 59 | format("Unable to obtain a new temp directory, paths tried already exist.\nPath prefix: %s", 60 | buildPath(systemTempDirPath, dirNamePrefix))); 61 | 62 | if (!newTempDirPath.empty) 63 | { 64 | try mkdir(newTempDirPath); 65 | catch (Exception exc) 66 | { 67 | assert(false, format("Failed to create temp directory: %s\n Error: %s", 68 | newTempDirPath, exc.msg)); 69 | } 70 | } 71 | 72 | return newTempDirPath; 73 | } 74 | 75 | /* Write a TSV file. The 'tsvData' argument is a 2-dimensional array of rows and 76 | * columns. Asserts if the file cannot be written. 77 | * 78 | * This routine is intended to be run in 'unittest' mode, so that it will assert 79 | * if the write fails. However, if run in a mode with asserts disabled, it will 80 | * return false if the write failed. 
81 | */ 82 | bool writeUnittestTsvFile(string filepath, string[][] tsvData, char delimiter = '\t') @safe 83 | { 84 | import std.algorithm : each, joiner, map; 85 | import std.conv : to; 86 | import std.format: format; 87 | import std.stdio : File; 88 | 89 | try 90 | { 91 | auto file = File(filepath, "wb"); 92 | tsvData 93 | .map!(row => row.joiner(delimiter.to!string)) 94 | .each!(str => file.writeln(str)); 95 | file.close; 96 | } 97 | catch (Exception exc) 98 | { 99 | assert(false, format("Failed to write TSV file: %s.\n Error: %s", 100 | filepath, exc.msg)); 101 | return false; 102 | } 103 | 104 | return true; 105 | } 106 | 107 | /* Convert a 2-dimensional array of values to an in-memory string. */ 108 | string tsvDataToString(string[][] tsvData, char delimiter = '\t') @safe 109 | { 110 | import std.algorithm : joiner, map; 111 | import std.conv : to; 112 | 113 | return tsvData 114 | .map!(row => row.joiner(delimiter.to!string).to!string ~ "\n") 115 | .joiner 116 | .to!string; 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /csv2tsv/README.md: -------------------------------------------------------------------------------- 1 | _Visit the eBay TSV utilities [main page](../README.md)_ 2 | 3 | # csv2tsv 4 | 5 | This tool does what you expect: convert CSV data to TSV. Example: 6 | ``` 7 | $ csv2tsv data.csv > data.tsv 8 | ``` 9 | 10 | TSV files have many advantages over CSV files for large scale data processing, but CSV is a very common exchange format. Data from spreadsheets, databases, and other tools is often exported in CSV format. 11 | 12 | The main issue when working with CSV data is the potential for CSV escapes in the data. Standard Unix tools like `cut`, `awk`, and `sort` do not work properly if the data contains CSV escapes, and neither do eBay's TSV Utilities. 
The `csv2tsv` tool eliminates issues with CSV escapes, allowing the resulting data to be processed correctly by both eBay's TSV Utilities and standard Unix tools. 13 | 14 | Many csv-to-tsv conversion tools don't remove escapes. Instead they generate CSV-style escapes, producing data in CSV format except using TAB as the field delimiter rather than comma. Such data is not correctly interpreted by traditional Unix tools. 15 | 16 | `csv2tsv` avoids escapes by replacing TAB and newline characters in the data with a single space. These characters are rare in data mining scenarios, and space is usually a good substitute in cases where they do occur. The replacement strings are customizable to enable alternate handling when needed. 17 | 18 | Another useful benefit of the `csv2tsv` converter is that it normalizes newlines. Many programs generate Windows newlines when exporting in CSV format, even on Unix systems. 19 | 20 | CSV files come in different formats. See the [csv2tsv reference](../docs/tool_reference/csv2tsv.md) for details of how this tool operates and the format variations handled. 21 | 22 | See [Comparing TSV and CSV formats](../docs/comparing-tsv-and-csv.md) for more information on CSV escapes and other differences between CSV and TSV formats. 
23 | -------------------------------------------------------------------------------- /csv2tsv/dub.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "csv2tsv", 3 | "description": "Convert comma-separated values data to tab-separated values format (CSV to TSV).", 4 | "homepage": "https://github.com/eBay/tsv-utils", 5 | "authors": ["Jon Degenhardt"], 6 | "copyright": "Copyright (c) 2015-2021, eBay Inc.", 7 | "license": "BSL-1.0", 8 | "targetType": "executable", 9 | "configurations": [ 10 | { 11 | "name" : "executable", 12 | "targetName": "csv2tsv", 13 | "targetPath": "../bin/", 14 | "mainSourceFile": "src/tsv_utils/csv2tsv.d", 15 | "dependencies": { 16 | "tsv-utils:common": { "path": ".." } 17 | } 18 | 19 | }, 20 | { 21 | "name" : "unittest", 22 | "targetType" : "none" 23 | } 24 | ], 25 | "buildTypes": { 26 | "debug": { "buildOptions": ["debugMode", "optimize"] }, 27 | "release": { "buildOptions": ["releaseMode", "optimize", "inline"], 28 | "dflags": ["-boundscheck=off"], 29 | "dflags-osx-ldc": ["-flto=thin"] }, 30 | "unittest" : { "buildOptions": ["unittests"] } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /csv2tsv/makefile: -------------------------------------------------------------------------------- 1 | APP_USES_LDC_PGO=1 2 | include ../makedefs.mk 3 | include ../makeapp.mk 4 | -------------------------------------------------------------------------------- /csv2tsv/profile_data/collect_profile_data.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | 3 | if [ $# -eq 0 ]; then 4 | echo "Insufficient arguments. The path of the instrumented program is required." 
5 | exit 1 6 | fi 7 | 8 | prog=$1 9 | shift 10 | 11 | ldc_profdata_tool_name=ldc-profdata 12 | ldc_profdata_tool=${ldc_profdata_tool_name} 13 | 14 | if [ $# -ne 0 ]; then 15 | ldc_profdata_tool=${1}/bin/${ldc_profdata_tool_name} 16 | fi 17 | 18 | for f in profile.*.raw; do 19 | if [ -e $f ]; then 20 | rm $f 21 | fi 22 | done 23 | 24 | if [ -e app.profdata ]; then 25 | rm -f app.profdata 26 | fi 27 | 28 | $prog profile_data_1a.csv > /dev/null 29 | $prog profile_data_1b.csv > /dev/null 30 | $prog profile_data_3a.csv > /dev/null 31 | $prog profile_data_3b.csv > /dev/null 32 | $prog profile_data_5.csv > /dev/null 33 | 34 | ${ldc_profdata_tool} merge -o app.profdata profile.*.raw 35 | -------------------------------------------------------------------------------- /csv2tsv/tests/gold/error_tests_1.txt: -------------------------------------------------------------------------------- 1 | Error test set 1 2 | ---------------- 3 | 4 | ====[csv2tsv nosuchfile.txt]==== 5 | 6 | Error [csv2tsv]: Cannot open file `nosuchfile.txt' in mode `rb' (No such file or directory) 7 | 8 | ====[csv2tsv --nosuchparam input1.txt]==== 9 | [csv2tsv] Error processing command line arguments: Unrecognized option --nosuchparam 10 | 11 | ====[csv2tsv --quote $'\n' input2.csv]==== 12 | [csv2tsv] Error processing command line arguments: CSV quote character cannot be newline (--q|quote). 13 | 14 | ====[csv2tsv --quote $'\r' input2.csv]==== 15 | [csv2tsv] Error processing command line arguments: CSV quote character cannot be newline (--q|quote). 16 | 17 | ====[csv2tsv --csv-delim $'\n' input2.csv]==== 18 | [csv2tsv] Error processing command line arguments: CSV field delimiter cannot be newline (--c|csv-delim). 19 | 20 | ====[csv2tsv --csv-delim $'\r' input2.csv]==== 21 | [csv2tsv] Error processing command line arguments: CSV field delimiter cannot be newline (--c|csv-delim). 
22 | 23 | ====[csv2tsv --tsv-delim $'\n' input2.csv]==== 24 | [csv2tsv] Error processing command line arguments: TSV field delimiter cannot be newline (--t|tsv-delim). 25 | 26 | ====[csv2tsv --tsv-delim $'\r' input2.csv]==== 27 | [csv2tsv] Error processing command line arguments: TSV field delimiter cannot be newline (--t|tsv-delim). 28 | 29 | ====[csv2tsv --tab-replacement $'\n' input2.csv]==== 30 | [csv2tsv] Error processing command line arguments: Replacement character cannot contain newlines or TSV field delimiters (--r|tab-replacement). 31 | 32 | ====[csv2tsv --tab-replacement $'\r' input2.csv]==== 33 | [csv2tsv] Error processing command line arguments: Replacement character cannot contain newlines or TSV field delimiters (--r|tab-replacement). 34 | 35 | ====[csv2tsv -r $'__\n__' input2.csv]==== 36 | [csv2tsv] Error processing command line arguments: Replacement character cannot contain newlines or TSV field delimiters (--r|tab-replacement). 37 | 38 | ====[csv2tsv -r $'__\r__' input2.csv]==== 39 | [csv2tsv] Error processing command line arguments: Replacement character cannot contain newlines or TSV field delimiters (--r|tab-replacement). 40 | 41 | ====[csv2tsv --newline-replacement $'\n' input2.csv]==== 42 | [csv2tsv] Error processing command line arguments: Replacement character cannot contain newlines or TSV field delimiters (--n|newline-replacement). 43 | 44 | ====[csv2tsv --newline-replacement $'\r' input2.csv]==== 45 | [csv2tsv] Error processing command line arguments: Replacement character cannot contain newlines or TSV field delimiters (--n|newline-replacement). 46 | 47 | ====[csv2tsv -n $'__\n__' input2.csv]==== 48 | [csv2tsv] Error processing command line arguments: Replacement character cannot contain newlines or TSV field delimiters (--n|newline-replacement). 
49 | 50 | ====[csv2tsv -n $'__\r__' input2.csv]==== 51 | [csv2tsv] Error processing command line arguments: Replacement character cannot contain newlines or TSV field delimiters (--n|newline-replacement). 52 | 53 | ====[csv2tsv -q x -c x input2.csv]==== 54 | [csv2tsv] Error processing command line arguments: CSV quote and CSV field delimiter characters must be different (--q|quote, --c|csv-delim). 55 | 56 | ====[csv2tsv -q x -t x input2.csv]==== 57 | [csv2tsv] Error processing command line arguments: CSV quote and TSV field delimiter characters must be different (--q|quote, --t|tsv-delim). 58 | 59 | ====[csv2tsv -t x -r wxyz input2.csv]==== 60 | [csv2tsv] Error processing command line arguments: Replacement character cannot contain newlines or TSV field delimiters (--r|tab-replacement). 61 | 62 | ====[csv2tsv invalid1.csv]==== 63 | field1 field2 field3 64 | 100 ab c de f 65 | 200 gh i, 66 | Error [csv2tsv]: Invalid CSV. Improperly terminated quoted field. File: invalid1.csv, Line: 3 67 | 68 | ====[csv2tsv invalid2.csv]==== 69 | field1 field2 field3 70 | 100 ab c de f 71 | 200 gh i jk l 72 | 300 mn o pq r 73 | Error [csv2tsv]: Invalid CSV. Improperly terminated quoted field. 
File: invalid2.csv, Line: 4 74 | -------------------------------------------------------------------------------- /csv2tsv/tests/header1.csv: -------------------------------------------------------------------------------- 1 | field1,field2,field3 2 | 123,456,789 3 | 234,567,890 4 | 345,678,901 5 | -------------------------------------------------------------------------------- /csv2tsv/tests/header2.csv: -------------------------------------------------------------------------------- 1 | field1,field2,field3 2 | abc,def,ghi 3 | jkl,mno,pqr 4 | -------------------------------------------------------------------------------- /csv2tsv/tests/header3.csv: -------------------------------------------------------------------------------- 1 | field1,field2,field3 2 | -------------------------------------------------------------------------------- /csv2tsv/tests/header4.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eBay/tsv-utils/38ed0a1c31742bd8b59196517e89ff0b51e8fb80/csv2tsv/tests/header4.csv -------------------------------------------------------------------------------- /csv2tsv/tests/header5.csv: -------------------------------------------------------------------------------- 1 | field1,field2,field3 2 | ABC,DEF,GHI 3 | JKL,MNO,PQR 4 | -------------------------------------------------------------------------------- /csv2tsv/tests/input1_format1.csv: -------------------------------------------------------------------------------- 1 | "Previous line 2 | specifies next",Legend,A - Char,_ - Space 3 | T - Tab,N - Newline,Q - Quote,C - Comma 4 | AAA,AAA_AAA,_A_,AAA 5 | abc,abc def, a ,abc 6 | ATAT,T,TT,AAA 7 | a b , , ,abc 8 | T,T,T,T 9 | , , , 10 | ,,, 11 | AAA,AAA,AAA,AAA 12 | abc,abc,abc,abc 13 | ANA,AANNAA,AA_AANAA_AA,AAA 14 | "a 15 | b","ab 16 | 17 | cd","ab cd 18 | ef gh",abc 19 | Q,QQ,AQA,AQAAQA 20 | """","""""","a""b","a""bc""d" 21 | QQQ,AQQA,QAQAQ,QQAQQAQQ 22 | 
"""""""","a""""b","""a""a""","""""a""""a""""" 23 | C,CC,ACA,ACAACA 24 | ",",",,","a,b","a,bc,d" 25 | CCC,ACCA,CACAC,CCACCACC 26 | ",,,","a,,b",",a,a,",",,a,,a,," 27 | QCQ,QNQ,CNQACAQ,_Q_NCCQCQ 28 | """a""",""" 29 | """,", 30 | ""a,b,""",""" 31 | ,,"",""" 32 | A,AA,AAA,AAAA 33 | a,ab,abc,abcd 34 | -------------------------------------------------------------------------------- /csv2tsv/tests/input1_format2.csv: -------------------------------------------------------------------------------- 1 | "Previous line specifies next",Legend,A - Char,_ - Space 2 | T - Tab,N - Newline,Q - Quote,C - Comma 3 | AAA,AAA_AAA,_A_,AAA 4 | abc,abc def, a ,abc 5 | ATAT,T,TT,AAA 6 | a b , , ,abc 7 | T,T,T,T 8 | , , , 9 | ,,, 10 | AAA,AAA,AAA,AAA 11 | abc,abc,abc,abc 12 | ANA,AANNAA,AA_AANAA_AA,AAA 13 | "a b","ab cd","ab cd ef gh",abc 14 | Q,QQ,AQA,AQAAQA 15 | """","""""","a""b","a""bc""d" 16 | QQQ,AQQA,QAQAQ,QQAQQAQQ 17 | """""""","a""""b","""a""a""","""""a""""a""""" 18 | C,CC,ACA,ACAACA 19 | ",",",,","a,b","a,bc,d" 20 | CCC,ACCA,CACAC,CCACCACC 21 | ",,,","a,,b",",a,a,",",,a,,a,," 22 | QCQ,QNQ,CNQACAQ,_Q_NCCQCQ 23 | """a""",""" """,", ""a,b,""",""" ,,"",""" 24 | A,AA,AAA,AAAA 25 | a,ab,abc,abcd -------------------------------------------------------------------------------- /csv2tsv/tests/input1_format3.csv: -------------------------------------------------------------------------------- 1 | "Previous line specifies next",Legend,A - Char,_ - Space 2 | T - Tab,N - Newline,Q - Quote,C - Comma 3 | AAA,AAA_AAA,_A_,AAA 4 | abc,abc def, a ,abc 5 | ATAT,T,TT,AAA 6 | a b , , ,abc 7 | T,T,T,T 8 | , , , 9 | ,,, 10 | AAA,AAA,AAA,AAA 11 | abc,abc,abc,abc 12 | ANA,AANNAA,AA_AANAA_AA,AAA 13 | "a b","ab cd","ab cd ef gh",abc 14 | Q,QQ,AQA,AQAAQA 15 | """","""""","a""b","a""bc""d" 16 | QQQ,AQQA,QAQAQ,QQAQQAQQ 17 | """""""","a""""b","""a""a""","""""a""""a""""" 18 | C,CC,ACA,ACAACA 19 | ",",",,","a,b","a,bc,d" 20 | CCC,ACCA,CACAC,CCACCACC 21 | ",,,","a,,b",",a,a,",",,a,,a,," 22 | 
QCQ,QNQ,CNQACAQ,_Q_NCCQCQ 23 | """a""",""" """,", ""a,b,""",""" ,,"",""" 24 | A,AA,AAA,AAAA 25 | a,ab,abc,abcd -------------------------------------------------------------------------------- /csv2tsv/tests/input2.csv: -------------------------------------------------------------------------------- 1 | field1|field2|field3 2 | 123|456|789 3 | #234#|567|#890# 4 | #|abc#|###def###|#gh> 5 | ijk> 6 | lmn<# 7 | ABC|DEF|GHI 8 | -------------------------------------------------------------------------------- /csv2tsv/tests/input3.csv: -------------------------------------------------------------------------------- 1 | Type,Value1,Value2 2 | Vanilla,ABC,123 3 | Quoted,"ABC","123" 4 | With Comma,"abc,def","123,4" 5 | With Quotes,"Say ""Hello World!""","10"" high" 6 | With Newline,"Value 1 Line 1 7 | Value 1 Line 2","Value 2 Line 1 8 | Value 2 Line 2" 9 | With TAB,"ABC DEF","123 456" 10 | -------------------------------------------------------------------------------- /csv2tsv/tests/input_bom.csv: -------------------------------------------------------------------------------- 1 | abc,def,ghi 2 | "ABC","DEF","GHI" 3 | 12.3,45.6,78.9 4 | -------------------------------------------------------------------------------- /csv2tsv/tests/input_unicode.csv: -------------------------------------------------------------------------------- 1 | english,color green yellow blue white black,green,blue 2 | 日本語,カラーグリーンイエローブルーホワイトブラック,緑,青 3 | deutsche,Farbe grün gelb blau weiß schwarz,grün,blau 4 | suomalainen,väri vihreä keltainen sininen valkoinen musta,vihreä,sininen 5 | 中文,颜色绿色黄色蓝色白色黑色,绿色,蓝色 6 | -------------------------------------------------------------------------------- /csv2tsv/tests/invalid1.csv: -------------------------------------------------------------------------------- 1 | field1,field2,field3 2 | 100,"ab c","de f" 3 | 200,"gh i,"jk l" 4 | 300,"mn o","pq r" 5 | -------------------------------------------------------------------------------- /csv2tsv/tests/invalid2.csv: 
-------------------------------------------------------------------------------- 1 | field1,field2,field3 2 | 100,"ab c","de f" 3 | 200,"gh i","jk l" 4 | 300,"mn o","pq r 5 | -------------------------------------------------------------------------------- /docs/BuildCommands.md: -------------------------------------------------------------------------------- 1 | _Visit the [main page](../README.md)_ 2 | 3 | # Build commands 4 | 5 | *Note: This file is no longer being updated. However, should it be necessary to run build commands manually, the information here should be a good starting point.* 6 | 7 | Use the make system if make runs on your system. Simply running `make` from the top-level will build the release executables. DUB is also a good way to build, see the install section of the readme file. However, if these are not options, the individual build commands are easy enough to run manually. The commands below are the same issued by the make system. Replace ${DCOMPILER} with the compiler being used, e.g. `dmd` or `ldc2`. If using `dmd`, performance can be improved further by adding the `-inline` switch to the compiler line. 
8 | 9 | ## tsv-filter 10 | 11 | ``` 12 | $ cd tsv-filter 13 | $ ${DCOMPILER} -release -O -boundscheck=off -odobj -of../bin/tsv-filter -I../common/src src/tsv-filter.d ../common/src/tsvutil.d ../common/src/getopt_inorder.d ../common/src/unittest_utils.d 14 | ``` 15 | 16 | ## tsv-select 17 | 18 | ``` 19 | $ cd tsv-select 20 | $ ${DCOMPILER} -release -O -boundscheck=off -odobj -of../bin/tsv-select -I../common/src src/tsv-select.d ../common/src/tsvutil.d ../common/src/getopt_inorder.d ../common/src/unittest_utils.d 21 | ``` 22 | 23 | ## tsv-summarize 24 | 25 | ``` 26 | $ cd tsv-summarize 27 | $ ${DCOMPILER} -release -O -boundscheck=off -odobj -of../bin/tsv-summarize -I../common/src src/tsv-summarize.d ../common/src/tsvutil.d ../common/src/getopt_inorder.d ../common/src/unittest_utils.d 28 | ``` 29 | 30 | ## tsv-join 31 | 32 | ``` 33 | $ cd tsv-join 34 | $ ${DCOMPILER} -release -O -boundscheck=off -odobj -of../bin/tsv-join -I../common/src src/tsv-join.d ../common/src/tsvutil.d ../common/src/getopt_inorder.d ../common/src/unittest_utils.d 35 | ``` 36 | 37 | ## tsv-sample 38 | 39 | ``` 40 | $ cd tsv-sample 41 | $ ${DCOMPILER} -release -O -boundscheck=off -odobj -of../bin/tsv-sample -I../common/src src/tsv-sample.d ../common/src/tsvutil.d ../common/src/getopt_inorder.d ../common/src/unittest_utils.d 42 | ``` 43 | 44 | ## tsv-append 45 | 46 | ``` 47 | $ cd tsv-append 48 | $ ${DCOMPILER} -release -O -boundscheck=off -odobj -of../bin/tsv-append -I../common/src src/tsv-append.d ../common/src/tsvutil.d ../common/src/getopt_inorder.d ../common/src/unittest_utils.d 49 | ``` 50 | 51 | ## tsv-uniq 52 | 53 | ``` 54 | $ cd tsv-uniq 55 | $ ${DCOMPILER} -release -O -boundscheck=off -odobj -of../bin/tsv-uniq -I../common/src src/tsv-uniq.d ../common/src/tsvutil.d ../common/src/getopt_inorder.d ../common/src/unittest_utils.d 56 | ``` 57 | 58 | ## csv2tsv 59 | 60 | ``` 61 | $ cd csv2tsv 62 | $ ${DCOMPILER} -release -O -boundscheck=off -odobj -of../bin/csv2tsv -I../common/src src/csv2tsv.d 
../common/src/tsvutil.d ../common/src/getopt_inorder.d ../common/src/unittest_utils.d 63 | ``` 64 | 65 | ## number-lines 66 | 67 | ``` 68 | $ cd number-lines 69 | $ ${DCOMPILER} -release -O -boundscheck=off -odobj -of../bin/number-lines -I../common/src src/number-lines.d ../common/src/tsvutil.d ../common/src/getopt_inorder.d ../common/src/unittest_utils.d 70 | ``` 71 | -------------------------------------------------------------------------------- /docs/OtherToolkits.md: -------------------------------------------------------------------------------- 1 | _Visit the [main page](../README.md)_ 2 | 3 | # Other open-source tools 4 | 5 | There are a number of open-source toolkits with functionality similar to the TSV Utilities. Several are listed below: 6 | 7 | * [clarkgrubb/data-tools](https://github.com/clarkgrubb/data-tools) - A variety of tools, especially rich in format converters. Written in Python, Ruby, and C. 8 | * [csvkit](https://github.com/wireservice/csvkit) - CSV tools, written in Python. 9 | * [csvtk](https://github.com/shenwei356/csvtk) - CSV tools, written in Go. 10 | * [GNU Datamash](https://www.gnu.org/software/datamash/) - Numeric, textual and statistical operations on TSV files. This tool has many similarities to [tsv-summarize](tool_reference/tsv-summarize.md). Written in C. 11 | * [dplyr](https://github.com/hadley/dplyr) - Tools for tabular data in R storage formats. Runs in an R environment, code is in C++. 12 | * [miller](https://github.com/johnkerl/miller) - Tools for CSV, JSON, and other formats. Written in C. 13 | * [GNU shuf](https://www.gnu.org/software/coreutils/manual/html_node/shuf-invocation.html), part of [GNU Core Utils](https://www.gnu.org/software/coreutils/coreutils.html) - Generates permutations of input lines. Sampling with and without replacement is supported. This tool has many of the same features as [tsv-sample](tool_reference/tsv-sample.md). Written in C. 
14 | * [brendano/tsvutils](https://github.com/brendano/tsvutils) - TSV tools, especially rich in format converters. Written in Python. 15 | * [xsv](https://github.com/BurntSushi/xsv) - CSV tools, written in Rust. 16 | 17 | A much more comprehensive list of tools can be found here: [Structured text tools](https://github.com/dbohdan/structured-text-tools). 18 | 19 | The different toolkits are certainly worth investigating if you work with tabular data files. Several have quite extensive feature sets. Each toolkit has its own strengths, your workflow and preferences are likely to fit some toolkits better than others. 20 | 21 | File format is perhaps the most important dimension. CSV files are very common. However, CSV files cannot be processed reliably by standard Unix tools. For this reason, CSV toolkit functionality typically extends into the space of traditional Unix tools. For example, CSV toolkits often have their own "sort" operation, as Unix `sort` does not operate reliably on CSV files. This is unfortunate, as creating a program with the speed and quality of a program like GNU `sort` is a meaningful undertaking. 22 | 23 | Many CSV toolkits also support TSV files, certainly appealing. Unfortunately, usage can be complicated and error prone due to the need to specify record delimiters and CSV style escape rules. Another issue is that not all CSV toolkits support fully turning off CSV escape syntax. This is usually not obvious and can lead to subtle errors when processing TSV files containing quotes. 24 | 25 | Tradeoffs between file formats is its own topic. Appropriate choice of format is often dependent on the specifics of the environment and tasks being performed. See [Comparing TSV and CSV formats](comparing-tsv-and-csv.md) for a discussion of TSV and CSV formats. The [brendano/tsvutils README](https://github.com/brendano/tsvutils#the-philosophy-of-tsvutils) (Brendan O'Connor) has a nice discussion of the rationale for using TSV files. 
26 | -------------------------------------------------------------------------------- /docs/ToolReference.md: -------------------------------------------------------------------------------- 1 | _Visit the [TSV Utilities main page](../README.md)_ 2 | 3 | # Tools Reference 4 | 5 | The TSV Utilities Tools Reference provides detailed documentation about each tool. Each tool has its own page, available through the links below. The [Common options and behavior](tool_reference/common-options-and-behavior.md) page provides information about features and options common to all the tools. 6 | 7 | Documentation for individual tools is also available via the `--help` option available on every tool. Most tools provide a `--help-verbose` option offering more extensive documentation similar to what is available in the Tool Reference. 8 | 9 | * [Common options and behavior](tool_reference/common-options-and-behavior.md) 10 | * [csv2tsv](tool_reference/csv2tsv.md) 11 | * [keep-header](tool_reference/keep-header.md) 12 | * [number-lines](tool_reference/number-lines.md) 13 | * [tsv-append](tool_reference/tsv-append.md) 14 | * [tsv-filter](tool_reference/tsv-filter.md) 15 | * [tsv-join](tool_reference/tsv-join.md) 16 | * [tsv-pretty](tool_reference/tsv-pretty.md) 17 | * [tsv-sample](tool_reference/tsv-sample.md) 18 | * [tsv-select](tool_reference/tsv-select.md) 19 | * [tsv-split](tool_reference/tsv-split.md) 20 | * [tsv-summarize](tool_reference/tsv-summarize.md) 21 | * [tsv-uniq](tool_reference/tsv-uniq.md) 22 | 23 | Documentation in the above files is for the current toolkit version. There were significant changes to the documents in release 2.0.0 due to the addition of named fields. Documentation for earlier versions is available in [Tools Reference v1.6](ToolReference_v1.6.md). 
24 | -------------------------------------------------------------------------------- /docs/dconf2018.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eBay/tsv-utils/38ed0a1c31742bd8b59196517e89ff0b51e8fb80/docs/dconf2018.pdf -------------------------------------------------------------------------------- /docs/dlang-meetup-14dec2017.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eBay/tsv-utils/38ed0a1c31742bd8b59196517e89ff0b51e8fb80/docs/dlang-meetup-14dec2017.pdf -------------------------------------------------------------------------------- /docs/images/column-selection-narrow_linux_2018.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eBay/tsv-utils/38ed0a1c31742bd8b59196517e89ff0b51e8fb80/docs/images/column-selection-narrow_linux_2018.jpg -------------------------------------------------------------------------------- /docs/images/column-selection-narrow_macos_2018.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eBay/tsv-utils/38ed0a1c31742bd8b59196517e89ff0b51e8fb80/docs/images/column-selection-narrow_macos_2018.jpg -------------------------------------------------------------------------------- /docs/images/column-selection_linux_2018.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eBay/tsv-utils/38ed0a1c31742bd8b59196517e89ff0b51e8fb80/docs/images/column-selection_linux_2018.jpg -------------------------------------------------------------------------------- /docs/images/column-selection_macos_2018.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/eBay/tsv-utils/38ed0a1c31742bd8b59196517e89ff0b51e8fb80/docs/images/column-selection_macos_2018.jpg -------------------------------------------------------------------------------- /docs/images/csv2tsv_linux_2018.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eBay/tsv-utils/38ed0a1c31742bd8b59196517e89ff0b51e8fb80/docs/images/csv2tsv_linux_2018.jpg -------------------------------------------------------------------------------- /docs/images/csv2tsv_macos_2018.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eBay/tsv-utils/38ed0a1c31742bd8b59196517e89ff0b51e8fb80/docs/images/csv2tsv_macos_2018.jpg -------------------------------------------------------------------------------- /docs/images/join-two-files_linux_2018.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eBay/tsv-utils/38ed0a1c31742bd8b59196517e89ff0b51e8fb80/docs/images/join-two-files_linux_2018.jpg -------------------------------------------------------------------------------- /docs/images/join-two-files_macos_2018.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eBay/tsv-utils/38ed0a1c31742bd8b59196517e89ff0b51e8fb80/docs/images/join-two-files_macos_2018.jpg -------------------------------------------------------------------------------- /docs/images/numeric-row-filter_linux_2018.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eBay/tsv-utils/38ed0a1c31742bd8b59196517e89ff0b51e8fb80/docs/images/numeric-row-filter_linux_2018.jpg -------------------------------------------------------------------------------- /docs/images/numeric-row-filter_macos_2018.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/eBay/tsv-utils/38ed0a1c31742bd8b59196517e89ff0b51e8fb80/docs/images/numeric-row-filter_macos_2018.jpg -------------------------------------------------------------------------------- /docs/images/regex-row-filter_linux_2018.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eBay/tsv-utils/38ed0a1c31742bd8b59196517e89ff0b51e8fb80/docs/images/regex-row-filter_linux_2018.jpg -------------------------------------------------------------------------------- /docs/images/regex-row-filter_macos_2018.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eBay/tsv-utils/38ed0a1c31742bd8b59196517e89ff0b51e8fb80/docs/images/regex-row-filter_macos_2018.jpg -------------------------------------------------------------------------------- /docs/images/summary-statistics_linux_2018.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eBay/tsv-utils/38ed0a1c31742bd8b59196517e89ff0b51e8fb80/docs/images/summary-statistics_linux_2018.jpg -------------------------------------------------------------------------------- /docs/images/summary-statistics_macos_2018.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eBay/tsv-utils/38ed0a1c31742bd8b59196517e89ff0b51e8fb80/docs/images/summary-statistics_macos_2018.jpg -------------------------------------------------------------------------------- /docs/tool_reference/csv2tsv.md: -------------------------------------------------------------------------------- 1 | _Visit the [Tools Reference main page](../ToolReference.md)_
2 | _Visit the [TSV Utilities main page](../../README.md)_ 3 | 4 | # csv2tsv reference 5 | 6 | **Synopsis:** csv2tsv [options] [file...] 7 | 8 | csv2tsv converts CSV (comma-separated) text to TSV (tab-separated) format. Records are read from files or standard input, converted records are written to standard output. 9 | 10 | Both formats represent tabular data, each record on its own line, fields separated by a delimiter character. The key difference is that CSV uses escape sequences to represent newlines and field separators in the data, whereas TSV disallows these characters in the data. The most common field delimiters are comma for CSV and TAB for TSV, but any character can be used. See [Comparing TSV and CSV formats](../comparing-tsv-and-csv.md) for additional discussion of the formats. 11 | 12 | Conversion to TSV is done by removing CSV escape syntax, changing field delimiters, and replacing newlines and TABs in the data. By default, newlines and TABs in the data are replaced by spaces. Most details are customizable. 13 | 14 | There is no single spec for CSV, any number of variants can be found. The escape syntax is common enough: fields containing newlines or field delimiters are placed in double quotes. Inside a quoted field, a double quote is represented by a pair of double quotes. As with field separators, the quoting character is customizable. 15 | 16 | Behaviors of this program that often vary between CSV implementations: 17 | * Newlines are supported in quoted fields. 18 | * Double quotes are permitted in a non-quoted field. However, a field starting with a quote must follow quoting rules. 19 | * Each record can have a different number of fields. 20 | * The three common forms of newlines are supported: CR, CRLF, LF. Output is written using Unix newlines (LF). 21 | * A newline will be added if the file does not end with one. 22 | * A UTF-8 Byte Order Mark (BOM) at the start of an input file will be removed. 23 | * No whitespace trimming is done. 
24 | 25 | This program does not validate CSV correctness, but will terminate with an error upon reaching an inconsistent state. Improperly terminated quoted fields are the primary cause. 26 | 27 | UTF-8 input is assumed. Convert other encodings prior to invoking this tool. 28 | 29 | **Options:** 30 | * `--h|help` - Print help. 31 | * `--help-verbose` - Print detailed help. 32 | * `--V|version` - Print version information and exit. 33 | * `--H|header` - Treat the first line of each file as a header. Only the header of the first file is output. 34 | * `--q|quote CHR` - Quoting character in CSV data. Default: double-quote (") 35 | * `--c|csv-delim CHR` - Field delimiter in CSV data. Default: comma (,). 36 | * `--t|tsv-delim CHR` - Field delimiter in TSV data. Default: TAB 37 | * `--r|tab-replacement STR` - Replacement for TSV field delimiters (typically TABs) found in CSV input. Default: Space. 38 | * `--n|newline-replacement STR` - Replacement for newlines found in CSV input. Default: Space. 39 | -------------------------------------------------------------------------------- /docs/tool_reference/keep-header.md: -------------------------------------------------------------------------------- 1 | _Visit the [Tools Reference main page](../ToolReference.md)_
2 | _Visit the [TSV Utilities main page](../../README.md)_ 3 | 4 | ## keep-header reference 5 | 6 | **Synopsis:** keep-header [file...] \-- program [args] 7 | 8 | Execute a command against one or more files in a header-aware fashion. The first line of each file is assumed to be a header. The first header is output unchanged. Remaining lines are sent to the given command via standard input, excluding the header lines of subsequent files. Output from the command is appended to the initial header line. A double dash (\--) delimits the command, similar to how the pipe operator (\|) delimits commands. 9 | 10 | The following commands sort files in the usual way, except for retaining a single header line: 11 | ``` 12 | $ keep-header file1.txt -- sort 13 | $ keep-header file1.txt file2.txt -- sort -k1,1nr 14 | ``` 15 | 16 | Data can also be read from standard input. For example: 17 | ``` 18 | $ cat file1.txt | keep-header -- sort 19 | $ keep-header file1.txt -- sort -r | keep-header -- grep red 20 | ``` 21 | 22 | The last example can be simplified using a shell command: 23 | ``` 24 | $ keep-header file1.txt -- /bin/sh -c '(sort -r | grep red)' 25 | ``` 26 | 27 | `keep-header` is especially useful for commands like `sort` and `shuf` that reorder input lines. It is also useful with filtering commands like `grep`, many `awk` uses, and even `tail`, where the header should be retained without filtering or evaluation. 28 | 29 | `keep-header` works on any file where the first line is delimited by a newline character. This includes all TSV files and the majority of CSV files. It won't work on CSV files having embedded newlines in the header. 30 | 31 | **Options:** 32 | * `--h|help` - Print help. 33 | * `--V|version` - Print version information and exit. 
34 | -------------------------------------------------------------------------------- /docs/tool_reference/number-lines.md: -------------------------------------------------------------------------------- 1 | _Visit the [Tools Reference main page](../ToolReference.md)_
2 | _Visit the [TSV Utilities main page](../../README.md)_ 3 | 4 | # number-lines reference 5 | 6 | **Synopsis:** number-lines [options] [file...] 7 | 8 | number-lines reads from files or standard input and writes each line to standard output preceded by a line number. It is a simplified version of the Unix `nl` program. It supports one feature `nl` does not: the ability to treat the first line of files as a header. This is useful when working with tab-separated-value files. If header processing is used, a header line is written for the first file, and the header lines are dropped from any subsequent files. 9 | 10 | **Options:** 11 | * `--h|help` - Print help. 12 | * `--V|version` - Print version information and exit. 13 | * `--H|header` - Treat the first line of each file as a header. The first input file's header is output, subsequent file headers are discarded. 14 | * `--s|header-string STR` - String to use as the header for the line number field. Implies `--header`. Default: 'line'. 15 | * `--n|start-number NUM` - Number to use for the first line. Default: 1. 16 | * `--d|delimiter CHR` - Character appended to line number, preceding the rest of the line. Default: TAB (Single byte UTF-8 characters only.) 17 | * `--line-buffered` - Immediately output every line. 18 | 19 | **Examples:** 20 | ``` 21 | $ # Number lines in a file 22 | $ number-lines file.tsv 23 | 24 | $ # Number lines from multiple files. Treat the first line of each file 25 | $ # as a header. 26 | $ number-lines --header data*.tsv 27 | ``` 28 | 29 | **See Also:** 30 | 31 | * [tsv-uniq](tsv-uniq.md) supports numbering lines grouped by key. 32 | -------------------------------------------------------------------------------- /docs/tool_reference/tsv-append.md: -------------------------------------------------------------------------------- 1 | _Visit the [Tools Reference main page](../ToolReference.md)_
2 | _Visit the [TSV Utilities main page](../../README.md)_ 3 | 4 | # tsv-append reference 5 | 6 | **Synopsis:** tsv-append [options] [file...] 7 | 8 | tsv-append concatenates multiple TSV files, similar to the Unix `cat` utility. Unlike `cat`, it is header-aware (`--H|header`), writing the header from only the first file. It also supports source tracking, adding a column indicating the original file to each row. Results are written to standard output. 9 | 10 | Concatenation with header support is useful when preparing data for traditional Unix utilities like `sort` and `sed` or applications that read a single file. 11 | 12 | Source tracking is useful when creating long/narrow form tabular data, a format used by many statistics and data mining packages. In this scenario, files have been used to capture related data sets, the difference between data sets being a condition represented by the file. For example, results from different variants of an experiment might each be recorded in their own files. Retaining the source file as an output column preserves the condition represented by the file. 13 | 14 | The file-name (without extension) is used as the source value. This can be customized using the `--f|file` option. 15 | 16 | Example: Header processing: 17 | ``` 18 | $ tsv-append -H file1.tsv file2.tsv file3.tsv 19 | ``` 20 | 21 | Example: Header processing and source tracking: 22 | ``` 23 | $ tsv-append -H -t file1.tsv file2.tsv file3.tsv 24 | ``` 25 | 26 | Example: Source tracking with custom source values: 27 | ``` 28 | $ tsv-append -H -s test_id -f test1=file1.tsv -f test2=file2.tsv 29 | ``` 30 | 31 | **Options:** 32 | * `--h|help` - Print help. 33 | * `--help-verbose` - Print detailed help. 34 | * `--V|version` - Print version information and exit. 35 | * `--H|header` - Treat the first line of each file as a header. 36 | * `--t|track-source` - Track the source file. Adds a column with the source name.
37 | * `--s|source-header STR` - Use STR as the header for the source column. Implies `--H|header` and `--t|track-source`. Default: 'file' 38 | * `--f|file STR=FILE` - Read file FILE, using STR as the 'source' value. Implies `--t|track-source`. 39 | * `--d|delimiter CHR` - Field delimiter. Default: TAB. (Single byte UTF-8 characters only.) 40 | * `--line-buffered` - Immediately output every line. 41 | -------------------------------------------------------------------------------- /docs/tool_reference/tsv-join.md: -------------------------------------------------------------------------------- 1 | _Visit the [Tools Reference main page](../ToolReference.md)_
2 | _Visit the [TSV Utilities main page](../../README.md)_ 3 | 4 | # tsv-join reference 5 | 6 | **Synopsis:** tsv-join --filter-file file [options] [file...] 7 | 8 | tsv-join matches input lines (the 'data stream') against lines from a 'filter' file. The match is based on exact match comparison of one or more 'key' fields. The data stream is read from files or standard input. Matching lines are written to standard output, along with any additional fields from the filter file that have been specified. 9 | 10 | This is similar to the "stream-static" joins available in Spark Structured Streaming and "KStream-KTable" joins in Kafka. The filter file plays the same role as the Spark static dataset or Kafka KTable. 11 | 12 | The filter file needs to fit into available memory (the join key and append fields). The data stream is processed one line at a time and can be arbitrarily large. 13 | 14 | **Options:** 15 | * `--h|help` - Print help. 16 | * `--help-verbose` - Print detailed help. 17 | * `--help-fields ` - Print help on specifying fields. 18 | * `--V|version` - Print version information and exit. 19 | * `--f|filter-file FILE` - (Required) File with records to use as a filter. 20 | * `--k|key-fields ` - Fields to use as the join key. Default: 0 (entire line). 21 | * `--d|data-fields ` - Data stream fields to use as the join key, if different than `--key-fields`. 22 | * `--a|append-fields ` - Filter file fields to append to matched records. 23 | * `--H|header` - Treat the first line of each file as a header. 24 | * `--p|prefix STR` - String to use as a prefix for `--append-fields` when writing a header line. 25 | * `--w|write-all STR` - Output all data stream records. STR is the `--append-fields` value when writing unmatched records. This is a left outer join. (The data stream is the 'left'.) 26 | * `--e|exclude` - Exclude matching records. This is an anti-join. 27 | * `--delimiter CHR` - Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)
28 | * `--z|allow-duplicate-keys` - Allow duplicate keys with different append values (last entry wins). Default behavior is that this is an error. 29 | * `--line-buffered` - Immediately output every line. 30 | 31 | **Examples:** 32 | 33 | Join using the `Name` field as the key. The `Name` field may be in different columns in the filter file and data stream files. All matching rows from `data.tsv` are written to standard output. The output order is the same as in `data.tsv`. 34 | ``` 35 | $ tsv-join -H --filter-file filter.tsv --key-fields Name data.tsv 36 | ``` 37 | 38 | Join using the `Name` field as key, but also append the `RefID` field from the filter file. 39 | ``` 40 | $ tsv-join -H -f filter.tsv -k Name --append-fields RefID data.tsv 41 | ``` 42 | 43 | Exclude lines from the data stream having the same `RecordNum` as a line in the filter file. 44 | ``` 45 | $ tsv-join -H -f filter.tsv -k RecordNum --exclude data.tsv 46 | ``` 47 | 48 | Filter multiple files, using field numbers 2 & 3 as the join key. 49 | ``` 50 | $ tsv-join -f filter.tsv -k 2,3 data1.tsv data2.tsv data3.tsv 51 | ``` 52 | 53 | Same as previous, except use fields 4 & 5 from the data files as the key. 54 | ``` 55 | $ tsv-join -f filter.tsv -k 2,3 -d 4,5 data1.tsv data2.tsv data3.tsv 56 | ``` 57 | 58 | Same as the previous command, but reading the data stream from standard input. 59 | ``` 60 | $ cat data*.tsv | tsv-join -f filter.tsv -k 2,3 -d 4,5 61 | ``` 62 | 63 | Add population data from `cities.tsv` to a data stream. 64 | ``` 65 | $ tsv-join -H -f cities.tsv -k CityID --append-fields Population data.tsv 66 | ``` 67 | 68 | As in the previous example, add population data, but this time write all records. Use the value '-1' if the city does not appear in the `cities.tsv` file. This is a left outer join, with the data stream as 'left'. 
69 | ``` 70 | $ tsv-join -H -f cities.tsv -k CityID -a Population --write-all -1 data.tsv 71 | ``` 72 | 73 | Filter one file based on another, using the full line as the key. 74 | ``` 75 | $ tsv-join -f filter.txt data.txt 76 | ``` 77 | 78 | Modifying output headers: Often it's useful to append a field that has a name identical to a field already in the data stream files. The '--p|prefix' option can be used to rename the appended field and avoid name duplication. The following command joins on the `test_id` field, appending the `time` field to matched records. The header for the appended field is `run1_time`, differentiating it from an existing `time` field in the data file (run2.tsv). 79 | ``` 80 | $ tsv-join -f run1.tsv run2.tsv -H -k test_id --append-fields time --prefix run1_ 81 | ``` 82 | 83 | The prefix will be applied to all appended fields. The next example is similar to the previous one, except that it appends all fields ending in `_time`, prefixing `run1_` to all the appended field names: 84 | ``` 85 | $ tsv-join -f run1.tsv run2.tsv -H -k test_id -a '*_time' --prefix run1_ 86 | ``` 87 | 88 | See [Field syntax](common-options-and-behavior.md#field-syntax) for more information about specifying fields. 89 | -------------------------------------------------------------------------------- /docs/tool_reference/tsv-pretty.md: -------------------------------------------------------------------------------- 1 | _Visit the [Tools Reference main page](../ToolReference.md)_
2 | _Visit the [TSV Utilities main page](../../README.md)_ 3 | 4 | # tsv-pretty reference 5 | 6 | **Synopsis:** tsv-pretty [options] [file...] 7 | 8 | `tsv-pretty` outputs TSV data in a format intended to be more human readable when working on the command line. This is done primarily by lining up data into fixed-width columns. Text is left aligned, numbers are right aligned. Floating point numbers are aligned on the decimal point when feasible. 9 | 10 | Processing begins by reading the initial set of lines into memory to determine the field widths and data types of each column. This look-ahead buffer is used for header detection as well. Output begins after this processing is complete. 11 | 12 | By default, only the alignment is changed, the actual values are not modified. Several of the formatting options do modify the values. 13 | 14 | Features: 15 | 16 | * Floating point numbers: Floats can be printed in fixed-width precision, using the same precision for all floats in a column. This makes them line up nicely. Precision is determined by values seen during look-ahead processing. The max precision defaults to 9, this can be changed when smaller or larger values are desired. See the `--f|format-floats` and `--p|precision` options. 17 | 18 | * Header lines: Headers are detected automatically when possible. This can be overridden when automatic detection doesn't work as desired. Headers can be underlined and repeated at regular intervals. 19 | 20 | * Missing values: A substitute value can be used for empty fields. This is often less confusing than spaces. See `--e|replace-empty` and `--E|empty-replacement`. 21 | 22 | * Exponential notation: As part of float formatting, `--f|format-floats` re-formats columns where exponential notation is found so all the values in the column are displayed using exponential notation and the same precision. 23 | 24 | * Preamble: A number of initial lines can be designated as a preamble and output unchanged.
The preamble is before the header, if a header is present. Preamble lines can be auto-detected via the heuristic that they lack field delimiters. This works well when the field delimiter is a TAB. 25 | 26 | * Fonts: Fixed-width fonts are assumed. CJK characters are assumed to be double width. This is not always correct, but works well in most cases. 27 | 28 | **Options:** 29 | 30 | * `--help-verbose` - Print full help. 31 | * `--H|header` - Treat the first line of each file as a header. 32 | * `--x|no-header` - Assume no header. Turns off automatic header detection. 33 | * `--l|lookahead NUM` - Lines to read to interpret data before generating output. Default: 1000 34 | * `--r|repeat-header NUM` - Lines to print before repeating the header. Default: No repeating header 35 | * `--u|underline-header` - Underline the header. 36 | * `--f|format-floats` - Format floats for better readability. Default: No 37 | * `--p|precision NUM` - Max floating point precision. Implies --format-floats. Default: 9 38 | * `--e|replace-empty` - Replace empty fields with `--`. 39 | * `--E|empty-replacement STR` - Replace empty fields with a string. 40 | * `--d|delimiter CHR` - Field delimiter. Default: TAB. (Single byte UTF-8 characters only.) 41 | * `--s|space-between-fields NUM` - Spaces between each field (Default: 2) 42 | * `--m|max-text-width NUM` - Max reserved field width for variable width text fields. Default: 40 43 | * `--a|auto-preamble` - Treat initial lines in a file as a preamble if the line contains no field delimiters. The preamble is output unchanged. 44 | * `--b|preamble NUM` - Treat the first NUM lines as a preamble and output them unchanged. 45 | * `--V|version` - Print version information and exit. 46 | * `--h|help` - This help information. 
47 | 48 | **Examples:** 49 | 50 | A tab-delimited file printed without any formatting: 51 | ``` 52 | $ cat sample.tsv 53 | Color Count Ht Wt 54 | Brown 106 202.2 1.5 55 | Canary Yellow 7 106 0.761 56 | Chartreuse 1139 77.02 6.22 57 | Fluorescent Orange 422 1141.7 7.921 58 | Grey 19 140.3 1.03 59 | ``` 60 | The same file printed with `tsv-pretty`: 61 | ``` 62 | $ tsv-pretty sample.tsv 63 | Color Count Ht Wt 64 | Brown 106 202.2 1.5 65 | Canary Yellow 7 106 0.761 66 | Chartreuse 1139 77.02 6.22 67 | Fluorescent Orange 422 1141.7 7.921 68 | Grey 19 140.3 1.03 69 | ``` 70 | Printed with float formatting and header underlining: 71 | ``` 72 | $ tsv-pretty -f -u sample.tsv 73 | Color Count Ht Wt 74 | ----- ----- -- -- 75 | Brown 106 202.20 1.500 76 | Canary Yellow 7 106.00 0.761 77 | Chartreuse 1139 77.02 6.220 78 | Fluorescent Orange 422 1141.70 7.921 79 | Grey 19 140.30 1.030 80 | ``` 81 | Printed with setting the precision to one: 82 | ``` 83 | $ tsv-pretty -u -p 1 sample.tsv 84 | Color Count Ht Wt 85 | ----- ----- -- -- 86 | Brown 106 202.2 1.5 87 | Canary Yellow 7 106.0 0.8 88 | Chartreuse 1139 77.0 6.2 89 | Fluorescent Orange 422 1141.7 7.9 90 | Grey 19 140.3 1.0 91 | ``` 92 | -------------------------------------------------------------------------------- /docs/tool_reference/tsv-select.md: -------------------------------------------------------------------------------- 1 | _Visit the [Tools Reference main page](../ToolReference.md)_
2 | _Visit the [TSV Utilities main page](../../README.md)_ 3 | 4 | # tsv-select reference 5 | 6 | **Synopsis:** tsv-select [options] [file...] 7 | 8 | tsv-select reads files or standard input and writes selected fields to standard output. Fields are written in the order listed. This is similar to Unix `cut`, but with the ability to select fields by name, reorder fields, and drop fields. 9 | 10 | Fields can be specified by field number or, for files with header lines, by field name. Field numbers start with one. They are comma separated, and ranges can be used. The `--H|header` option enables selection by field name. This also manages header lines from multiple files, retaining only the first header. 11 | 12 | Fields can be listed more than once, and fields not listed can be selected as a group using the `--rest` option. Fields can be dropped using `--e|exclude`. All fields not excluded are output. `--f|fields` and `--r|rest` can be used with `--e|exclude` to change the order of non-excluded fields. 13 | 14 | **Options:** 15 | * `--h|help` - Print help. 16 | * `--help-verbose` - Print more detailed help. 17 | * `--help-fields ` - Print help on specifying fields. 18 | * `--V|version` - Print version information and exit. 19 | * `--H|header` - Treat the first line of each file as a header. 20 | * `--f|fields ` - Fields to retain. Fields are output in the order listed. 21 | * `--e|exclude ` - Fields to exclude. 22 | * `--r|rest first|last` - Output location for fields not included in the `--f|fields` field-list. 23 | * `--d|delimiter CHR` - Character to use as field delimiter. Default: TAB. (Single byte UTF-8 characters only.) 24 | * `--line-buffered` - Immediately output every line. 25 | 26 | **Notes:** 27 | * See [Field syntax](common-options-and-behavior.md#field-syntax) for information about specifying fields. 28 | * One of `--f|fields` or `--e|exclude` is required. 29 | * Fields specified by `--f|fields` and `--e|exclude` cannot overlap.
30 | * When `--f|fields` and `--e|exclude` are used together, the effect is to specify `--rest last`. This can be overridden by specifying `--rest first`. 31 | * Each input line must be long enough to contain all fields specified with `--f|fields`. This is not necessary for `--e|exclude` fields. 32 | * Specifying field names containing special characters may require escaping the special characters. See [Field syntax](common-options-and-behavior.md#field-syntax) for details. 33 | * Input and output are buffered by default to improve performance. Use `--line-buffered` to have each line read and written as soon as available. 34 | 35 | **Examples:** 36 | ``` 37 | $ # Keep the first field from two files 38 | $ tsv-select -f 1 file1.tsv file2.tsv 39 | 40 | $ # Keep fields 1 and 2, retain the header from the first file 41 | $ tsv-select -H -f 1,2 file1.tsv file2.tsv 42 | 43 | $ # Keep the 'time' field 44 | $ tsv-select -H -f time file1.tsv 45 | 46 | $ # Keep all fields ending '_date' or '_time' 47 | $ tsv-select -H -f '*_date,*_time' file.tsv 48 | 49 | $ # Drop all the '*_time' fields 50 | $ tsv-select -H --exclude '*_time' file.tsv 51 | 52 | $ # Output fields 2 and 1, in that order 53 | $ tsv-select -f 2,1 file.tsv 54 | 55 | $ # Output a range of fields 56 | $ tsv-select -f 3-30 file.tsv 57 | 58 | $ # Output a range of fields in reverse order 59 | $ tsv-select -f 30-3 file.tsv 60 | 61 | $ # Drop the first field, keep everything else 62 | $ # Equivalent to 'cut -f 2- file.tsv' 63 | $ tsv-select --exclude 1 file.tsv 64 | $ tsv-select -e 1 file.tsv 65 | 66 | $ # Move field 1 to the end of the line 67 | $ tsv-select -f 1 --rest first file.tsv 68 | 69 | $ # Move the 'Date' and 'Time' fields to the start of the line 70 | $ tsv-select -H -f Date,Time --rest last file.tsv 71 | 72 | # Output with repeating fields 73 | $ tsv-select -f 1,2,1 file.tsv 74 | $ tsv-select -f 1-3,3-1 file.tsv 75 | 76 | $ # Read from standard input 77 | $ cat file*.tsv | tsv-select -f 1,4-7,11 78 | 79 | 
$ # Read from a file and standard input. The '--' terminates command 80 | $ # option processing, '-' represents standard input. 81 | $ cat file1.tsv | tsv-select -f 1-3 -- - file2.tsv 82 | 83 | $ # Files using comma as the separator ('simple csv') 84 | $ # (Note: Does not handle CSV escapes.) 85 | $ tsv-select -d , --fields 5,1,2 file.csv 86 | 87 | $ # Move field 2 to the front and drop fields 10-15 88 | $ tsv-select -f 2 -e 10-15 file.tsv 89 | 90 | $ # Move field 2 to the end, dropping fields 10-15 91 | $ tsv-select -f 2 --rest first -e 10-15 file.tsv 92 | ``` 93 | -------------------------------------------------------------------------------- /dub.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tsv-utils", 3 | "description": "eBay's TSV utilities. Command line tools for large, tabular data files. Filtering, statistics, sampling, joins and more.", 4 | "homepage": "https://github.com/eBay/tsv-utils", 5 | "authors": ["Jon Degenhardt"], 6 | "copyright": "Copyright (c) 2015-2021, eBay Inc.", 7 | "license": "BSL-1.0", 8 | "targetType": "executable", 9 | "#": "NOTE: Add new apps to dub_build.d also.", 10 | "subPackages": [ 11 | "./common/", 12 | "./csv2tsv/", 13 | "./keep-header/", 14 | "./number-lines/", 15 | "./tsv-append/", 16 | "./tsv-filter/", 17 | "./tsv-join/", 18 | "./tsv-pretty/", 19 | "./tsv-sample/", 20 | "./tsv-select/", 21 | "./tsv-split/", 22 | "./tsv-summarize/", 23 | "./tsv-uniq/" 24 | ], 25 | "configurations": [ 26 | { 27 | "name" : "executable", 28 | "targetName": "dub_build", 29 | "mainSourceFile": "dub_build.d" 30 | }, 31 | { 32 | "name": "unittest", 33 | "targetType": "none" 34 | } 35 | ], 36 | "buildTypes": { 37 | "debug": { "buildOptions": ["debugMode", "optimize"] }, 38 | "release": { "buildOptions": ["releaseMode", "optimize", "inline"], 39 | "dflags": ["-boundscheck=off"] } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /dub_build.d:
-------------------------------------------------------------------------------- 1 | /** 2 | This is a simple dub build launcher for tsv-utils to use with Dub installs. 3 | 4 | The tsv-utils package contains multiple executable programs in sub-directories. 5 | Vanilla Dub does not support building multiple executables, a separate invocation is 6 | required for each app. However, experienced Dub users may try to install with a 7 | standard Dub sequence, for example: 8 | 9 | dub fetch tsv-utils 10 | dub run tsv-utils 11 | 12 | Another use-case: 13 | 14 | dub fetch --local 15 | cd 16 | dub run 17 | 18 | This executable is intended to handle these cases. It also has one additional function: 19 | inform the user where the binaries are stored so they can be added to the path. 20 | 21 | This build launcher does not provide general build services. For example, it does not 22 | support 'test'. This can still be done via dub, but on the individual sub-packages, not 23 | the full package. 24 | 25 | Copyright (c) 2015-2021, eBay Inc. 26 | Initially written by Jon Degenhardt 27 | 28 | License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt) 29 | */ 30 | 31 | auto helpText = q"EOS 32 | Build the apps in the tsv-utils package. Options: 33 | EOS"; 34 | 35 | int main(string[] args) { 36 | import std.array : join; 37 | import std.format; 38 | import std.getopt; 39 | import std.path; 40 | import std.process : escapeShellCommand, executeShell; 41 | import std.stdio; 42 | 43 | bool debugBuild = 0; 44 | string compiler = ""; 45 | 46 | auto r = getopt( 47 | args, 48 | "debug", "Debug build. Release builds are the default.", &debugBuild, 49 | "compiler", "COMPILER Compiler to use. Typically dmd, ldc2, gdc. Can be a path.", &compiler 50 | ); 51 | 52 | if (r.helpWanted) { 53 | defaultGetoptPrinter(helpText, r.options); 54 | return 0; 55 | } 56 | 57 | // Note: At present 'common' is a source library and does not need a standalone compilation step.
58 | auto packageName = "tsv-utils"; 59 | auto subPackages = ["csv2tsv", "keep-header", "number-lines", "tsv-append", "tsv-filter", "tsv-join", "tsv-pretty", "tsv-sample", "tsv-select", "tsv-split", "tsv-summarize", "tsv-uniq"]; 60 | auto buildCmdArgs = ["dub", "build", "", "--force", "-b"]; 61 | buildCmdArgs ~= debugBuild ? "debug" : "release"; 62 | if (compiler.length > 0) { 63 | buildCmdArgs ~= format("--compiler=%s", compiler); 64 | } 65 | 66 | assert(args.length > 0); 67 | auto exePath = args[0].absolutePath; 68 | auto exeDir = exePath.dirName; 69 | auto binDir = buildNormalizedPath(exeDir, "bin"); 70 | writeln(); 71 | writeln("=== Building tsv-utils executables ==="); 72 | writeln(); 73 | foreach (subPkg; subPackages) { 74 | auto subPkgBuildName = packageName ~ ":" ~ subPkg; 75 | buildCmdArgs[2] = subPkgBuildName; 76 | writeln("Building ", subPkg); 77 | writeln(); 78 | writeln(buildCmdArgs.join(' ')); 79 | auto buildResult = executeShell(escapeShellCommand(buildCmdArgs)); 80 | writeln(buildResult.output); 81 | if (buildResult.status != 0) { 82 | stderr.writeln("\n===> Build failure.\n"); 83 | return buildResult.status; 84 | } 85 | } 86 | 87 | writeln("========================================================"); 88 | writeln("Executables are in: ", binDir); 89 | writeln("Add this directory or the excecutables to the PATH."); 90 | writeln(); 91 | writeln("To build with a different compiler:"); 92 | writefln(" dub run %s -- --compiler=", packageName); 93 | writeln("========================================================"); 94 | 95 | return 0; 96 | } 97 | -------------------------------------------------------------------------------- /extras/scripts/tsv-sort: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Copyright (c) 2020-2021, eBay Inc. 
# Initially written by Jon Degenhardt
#
# License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
#
# -------------------------------------------------------------
#
# This file contains a sample implementation of the 'tsv-sort' script
# described on the Tips and Tricks page in the eBay tsv-utils repository:
# https://github.com/eBay/tsv-utils/blob/master/docs/TipsAndTricks.md
#

# Use the system 'sort' by default. On macOS, switch to 'gsort' when that
# command is available.
command_name="sort"
mac_gnu_name="gsort"

if [ "$(uname)" == "Darwin" ] && type "$mac_gnu_name" &>/dev/null ; then
    command_name=$mac_gnu_name
fi

# '--help' as the first argument prints a usage summary instead of sorting.
if [ "$1" == "--help" ]; then
    script_name=$(basename "$0")
    echo ""
    echo "$script_name runs '$command_name' using TAB as the field delimiter and the buffer"
    echo "size set to 2G. All arguments are forwarded to '$command_name'. Example:"
    echo ""
    echo " tsv-sort data.tsv -k1,1 -k3,3"
    echo ""
    echo "This sorts data.tsv using the 1st and 3rd fields as keys."
    echo "Run '$command_name --help' for more information."
    echo ""
    exit 0
fi

# Forward all arguments unchanged; only the delimiter and buffer size are added.
"$command_name" -t $'\t' --buffer-size=2G "$@"
-------------------------------------------------------------------------------- /extras/scripts/tsv-sort-fast: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Copyright (c) 2020-2021, eBay Inc.
4 | # Initially written by Jon Degenhardt 5 | # 6 | # License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt) 7 | # 8 | # ------------------------------------------------------------- 9 | # 10 | # This file contains a sample implementation of the 'tsv-sort-fast' script 11 | # described on the Tips and Tricks page in the eBay tsv-utils repository: 12 | # https://github.com/eBay/tsv-utils/blob/master/docs/TipsAndTricks.md 13 | # 14 | 15 | command_name="sort" 16 | mac_gnu_name="gsort" 17 | 18 | if [ "$(uname)" == "Darwin" ] && type $mac_gnu_name &>/dev/null ; then 19 | command_name=$mac_gnu_name 20 | fi 21 | if [ "$1" == "--help" ]; then 22 | script_name=$(basename "$0") 23 | echo "" 24 | echo "$script_name runs '$command_name' with:" 25 | echo " * TAB as the field delimiter." 26 | echo " * The buffer size set to 2G." 27 | echo " * Locale sensitive sorting turned off." 28 | echo "" 29 | echo "All arguments are forwarded to '$command_name'. Example:" 30 | echo "" 31 | echo " tsv-sort-fast data.tsv -k1,1 -k3,3" 32 | echo "" 33 | echo "This sorts data.tsv using the 1st and 3rd fields as keys." 34 | echo "Run '$command_name --help' for more information." 35 | echo "" 36 | echo "Turning off locale specific sorting is a meaningful change." 37 | echo "Alphabetic entries are no longer sorted in a locale and Unicode" 38 | echo "aware fashion. Instead sorting is based on the underlying byte" 39 | echo "sequences. The advantage is that it is dramatically faster. The" 40 | echo "typical speedup is between 2 and 10 times. Use with care." 
41 | exit 0 42 | fi 43 | (LC_ALL=C $command_name -t $'\t' --buffer-size=2G "$@") 44 | -------------------------------------------------------------------------------- /keep-header/README.md: -------------------------------------------------------------------------------- 1 | _Visit the eBay TSV utilities [main page](../README.md)_ 2 | 3 | # keep-header 4 | 5 | `keep-header` is a convenience utility that runs Unix commands in a header-aware fashion. It is especially useful with `sort`. `sort` does not know about headers, so the header line ends up wherever it falls in the sort order. Using `keep-header`, the header line is output first and the rest of the sorted file follows. For example: 6 | ``` 7 | $ # Sort a file, keeping the header line at the top. 8 | $ keep-header myfile.txt -- sort 9 | ``` 10 | 11 | The command to run is placed after the double dash (`--`). Everything after the initial double dash is part of the command. For example, `sort --ignore-case` is run as follows: 12 | ``` 13 | $ # Case-insensitive sort, keeping the header line at the top. 14 | $ keep-header myfile.txt -- sort --ignore-case 15 | ``` 16 | 17 | Multiple files can be provided, only the header from the first is retained. For example: 18 | 19 | ``` 20 | $ # Sort a set of files in reverse order, keeping only one header line. 21 | $ keep-header *.txt -- sort -r 22 | ``` 23 | 24 | `keep-header` is especially useful for commands like `sort` and `shuf` that reorder input lines. It is also useful with filtering commands like `grep`, many `awk` uses, and even `tail`, where the header should be retained without filtering or evaluation. 25 | 26 | Examples: 27 | ``` 28 | $ # 'grep' a file, keeping the header line without needing to match it. 
29 | $ keep-header file.txt -- grep 'some text' 30 | 31 | $ # Print the last 10 lines of a file, but keep the header line 32 | $ keep-header file.txt -- tail 33 | 34 | $ # Print lines 100-149 of a file, plus the header 35 | $ keep-header file.txt -- tail -n +100 | head -n 51 36 | 37 | $ # Sort a set of TSV files numerically on field 2, keeping one header. 38 | $ keep-header *.tsv -- sort -t $'\t' -k2,2n 39 | 40 | $ # Same as the previous example, but using the 'tsv-sort-fast' bash 41 | $ # script described on the "Tips and Tricks" page. 42 | $ keep-header *.tsv -- tsv-sort-fast -k2,2n 43 | ``` 44 | 45 | See the [keep-header reference](../docs/tool_reference/keep-header.md) for more information. 46 | -------------------------------------------------------------------------------- /keep-header/dub.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "keep-header", 3 | "description": "Execute a unix command in a header aware fashion.", 4 | "homepage": "https://github.com/eBay/tsv-utils", 5 | "authors": ["Jon Degenhardt"], 6 | "copyright": "Copyright (c) 2017-2021, eBay Inc.", 7 | "license": "BSL-1.0", 8 | "targetType": "executable", 9 | "configurations": [ 10 | { 11 | "name" : "executable", 12 | "targetName": "keep-header", 13 | "targetPath": "../bin/", 14 | "mainSourceFile": "src/tsv_utils/keep-header.d", 15 | "dependencies": { 16 | "tsv-utils:common": { "path": ".." 
} 17 | } 18 | }, 19 | { 20 | "name": "unittest", 21 | "targetType": "none" 22 | } 23 | ], 24 | "buildTypes": { 25 | "debug": { "buildOptions": ["debugMode", "optimize"] }, 26 | "release": { "buildOptions": ["releaseMode", "optimize", "inline"], 27 | "dflags": ["-boundscheck=off"], 28 | "dflags-osx-ldc": ["-flto=thin"] } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /keep-header/makefile: -------------------------------------------------------------------------------- 1 | include ../makedefs.mk 2 | include ../makeapp.mk 3 | 4 | # No built-in unit tests 5 | unittest: ; 6 | unittest-codecov: ; 7 | -------------------------------------------------------------------------------- /keep-header/src/tsv_utils/keep-header.d: -------------------------------------------------------------------------------- 1 | /** 2 | Command line tool that executes a command while preserving header lines. 3 | 4 | Copyright (c) 2018-2021, eBay Inc. 5 | Initially written by Jon Degenhardt 6 | 7 | License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt) 8 | */ 9 | module tsv_utils.keep_header; 10 | 11 | auto helpText = q"EOS 12 | Execute a command against one or more files in a header aware fashion. 13 | The first line of each file is assumed to be a header. The first header 14 | is output unchanged. Remaining lines are sent to the given command via 15 | standard input, excluding the header lines of subsequent files. Output 16 | from the command is appended to the initial header line. 17 | 18 | A double dash (--) delimits the command, similar to how the pipe 19 | operator (|) delimits commands. Examples: 20 | 21 | $ keep-header file1.txt -- sort 22 | $ keep-header file1.txt file2.txt -- sort -k1,1nr 23 | 24 | These sort the files as usual, but preserve the header as the first line 25 | output. Data can also be read from standard input. 
Example: 26 | 27 | $ cat file1.txt | keep-header -- grep red 28 | 29 | Options: 30 | 31 | -V --version Print version information and exit. 32 | -h --help This help information. 33 | EOS"; 34 | 35 | static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ]; 36 |

/** keep-header is a simple program, it is implemented entirely in main.
 *
 * The argument list is split at the first "--". Arguments before it (after the
 * program name) are input files, with "-" meaning standard input; arguments
 * after it form the command to run. When no files are given, standard input is
 * read.
 *
 * The first line of the first non-empty input is written to standard output.
 * The first line of every later input is read and discarded. All remaining
 * data is written to the standard input of the command.
 *
 * Returns:
 *   0 after printing the synopsis (missing "--" or missing command), including
 *     the help and version cases.
 *   1 if the command cannot be started.
 *   Otherwise the command's exit status when it is non-zero; else 1 if an input
 *     file could not be opened, else 0.
 */
int main(string[] args)
{
    import std.algorithm : findSplit, joiner;
    import std.path : baseName, stripExtension;
    import std.process : pipeProcess, ProcessPipes, Redirect, wait;
    import std.range;
    import std.stdio;
    import std.typecons : tuple;

    /* When running in DMD code coverage mode, turn on report merging. */
    version(D_Coverage) version(DigitalMars)
    {
        import core.runtime : dmd_coverSetMerge;
        dmd_coverSetMerge(true);
    }

    auto programName = (args.length > 0) ? args[0].stripExtension.baseName : "Unknown_program_name";

    /* splitArgs[0]: program name and files, splitArgs[1]: the "--" match (empty
     * if absent), splitArgs[2]: the command and its arguments.
     */
    auto splitArgs = findSplit(args, ["--"]);

    /* No "--" or nothing after it: print the synopsis. Help or version text is
     * added only when the first argument asks for it. All output goes to stderr.
     */
    if (splitArgs[1].length == 0 || splitArgs[2].length == 0)
    {
        auto cmdArgs = splitArgs[0][1 .. $];
        stderr.writefln("Synopsis: %s [file...] -- program [args]", programName);
        if (cmdArgs.length > 0 &&
            (cmdArgs[0] == "-h" || cmdArgs[0] == "--help" || cmdArgs[0] == "--help-verbose"))
        {
            stderr.writeln();
            stderr.writeln(helpText);
        }
        else if (cmdArgs.length > 0 &&
                 (cmdArgs[0] == "-V" || cmdArgs[0] == "--V" || cmdArgs[0] == "--version"))
        {
            import tsv_utils.common.tsvutils_version;
            stderr.writeln();
            stderr.writeln(tsvutilsVersionNotice("keep-header"));
        }
        return 0;
    }

    /* Start the command. Only its stdin is redirected (Redirect.stdin). */
    ProcessPipes pipe;
    try pipe = pipeProcess(splitArgs[2], Redirect.stdin);
    catch (Exception exc)
    {
        stderr.writefln("[%s] Command failed: '%s'", programName, splitArgs[2].joiner(" "));
        stderr.writeln(exc.msg);
        return 1;
    }

    int status = 0;
    {
        /* Runs when this block is left, which on the normal path is after
         * pipe.stdin has been closed. A non-zero exit status from the command
         * replaces any status set while reading the inputs.
         */
        scope(exit)
        {
            auto pipeStatus = wait(pipe.pid);
            if (pipeStatus != 0) status = pipeStatus;
        }

        bool headerWritten = false;
        foreach (filename; splitArgs[0].length > 1 ? splitArgs[0][1..$] : ["-"])
        {
            bool isStdin = (filename == "-");
            File inputStream;

            if (isStdin) inputStream = stdin;
            else
            {
                try inputStream = filename.File();
                catch (Exception exc)
                {
                    /* Stop processing inputs; stdin of the command is still
                     * closed below and the command is still waited on.
                     */
                    stderr.writefln("[%s] Unable to open file: '%s'", programName, filename);
                    stderr.writeln(exc.msg);
                    status = 1;
                    break;
                }
            }

            auto firstLine = inputStream.readln();

            /* An input with no data at all contributes no header; skip it. */
            if (inputStream.eof && firstLine.length == 0) continue;

            if (!headerWritten)
            {
                /* Flushed right away. The command's stdout is not redirected,
                 * so presumably this keeps the header ahead of the command's
                 * own output - NOTE(review): confirm.
                 */
                write(firstLine);
                stdout.flush;
                headerWritten = true;
            }

            /* NOTE(review): standard input is copied line by line while files
             * are copied in 128 KB chunks; the reason is not stated here.
             */
            if (isStdin)
            {
                foreach (line; inputStream.byLine(KeepTerminator.yes))
                {
                    pipe.stdin.write(line);
                }
            }
            else
            {
                ubyte[1024 * 128] readBuffer;
                foreach (ubyte[] chunk; inputStream.byChunk(readBuffer))
                {
                    pipe.stdin.write(cast(char[])chunk);
                }
            }
            pipe.stdin.flush;
        }
        pipe.stdin.close;
    }
    return status;
}
-------------------------------------------------------------------------------- /keep-header/tests/emptyfile.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eBay/tsv-utils/38ed0a1c31742bd8b59196517e89ff0b51e8fb80/keep-header/tests/emptyfile.txt -------------------------------------------------------------------------------- /keep-header/tests/gold/error_tests_1.txt: -------------------------------------------------------------------------------- 1 | Error test set 1 2 | ---------------- 3 | 4 | ====[keep-header nosuchfile.txt -- sort]==== 5 | [keep-header] Unable to open file: 'nosuchfile.txt' 6 | Cannot open file `nosuchfile.txt' in mode `rb' (No such file or directory) 7 | 8 | ====[keep-header input1.csv -- nosuchprogram]==== 9 | [keep-header] Command failed: 'nosuchprogram' 10 | Executable file not found: nosuchprogram 11 | 12 |
====[keep-header ]==== 13 | Synopsis: keep-header [file...] -- program [args] 14 | 15 | ====[keep-header input1.csv]==== 16 | Synopsis: keep-header [file...] -- program [args] 17 | 18 | ====[keep-header input1.csv --]==== 19 | Synopsis: keep-header [file...] -- program [args] 20 | 21 | ====[keep-header --]==== 22 | Synopsis: keep-header [file...] -- program [args] 23 | -------------------------------------------------------------------------------- /keep-header/tests/input1.csv: -------------------------------------------------------------------------------- 1 | file.row,field1,field2,field3 2 | input1.txt.1,30,green|blue 3 | input1.txt.2,20,緑|青 4 | input1.txt.3,10,绿色|蓝色 5 | -------------------------------------------------------------------------------- /keep-header/tests/input2.csv: -------------------------------------------------------------------------------- 1 | file.row,field1,field2,field3 2 | input2.txt.1,15,green|blue 3 | input2.txt.2,25,grün|blau 4 | input2.txt.3,35,vihreä|sininen 5 | -------------------------------------------------------------------------------- /keep-header/tests/input_headeronly.csv: -------------------------------------------------------------------------------- 1 | file.row,field1,field2,field3 2 | -------------------------------------------------------------------------------- /keep-header/tests/oneblankline.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /makefile: --------------------------------------------------------------------------------
# Top-level makefile. Almost every target is implemented by re-running make with
# the same goal ($(MAKECMDGOALS)) in each sub-directory.
appdirs = csv2tsv keep-header number-lines tsv-append tsv-filter tsv-join tsv-pretty tsv-sample tsv-select tsv-split tsv-summarize tsv-uniq
subdirs = common $(appdirs)
buildtools_dir = buildtools

# Package variables
OS ?= UnkOS
ARCH ?= x86_64
APP_VERSION ?= v~dev
PKG_ROOT_DIR ?= $(notdir $(basename $(CURDIR)))
DCOMPILER_BASENAME = $(notdir $(basename $(DCOMPILER)))
PKG_DIR = $(PKG_ROOT_DIR)-$(APP_VERSION)_$(OS)-$(ARCH)_$(DCOMPILER_BASENAME)
TAR_FILE ?= $(PKG_DIR).tar.gz

# 'all' and 'help' never produce files of those names.
.PHONY: all help
all: release

help:
	@echo 'Commands:'
	@echo '========='
	@echo 'release - Release mode build.'
	@echo 'debug - Debug build. (Apps are written with a .dbg extension.)'
	@echo 'clean - Removes executables and other build artifacts.'
	@echo 'clean-relics - Removes build artifacts, but not release artifacts.'
	@echo 'clean-bin-relics - Removes build artifacts from the bin directory, except for release'
	@echo ' binaries. Used to create a release package.'
	@echo 'test - Runs all tests. Unit tests, and release and debug executable tests.'
	@echo 'unittest - Runs unit tests.'
	@echo 'test-debug - Builds debug apps and runs command line tests against the apps.'
	@echo 'test-release - Builds release apps and runs command line tests against the apps.'
	@echo 'test-nobuild - Runs command line app tests without doing a build.'
	@echo ' This is useful when testing a build done with dub.'
	@echo 'test-codecov - Runs unit tests and app tests (executables) with code coverage'
	@echo ' reports. This is the simplest way to run code coverage. Reports are'
	@echo ' written to .lst files, apps are built with .cov extensions.'
	@echo 'apptest-codecov - Runs app tests (executables) with code coverage reports on.'
	@echo 'unittest-codecov - Runs unit tests with code coverage reports on.'
	@echo 'package - Creates a release package. Used with travis-ci.'
	@echo ''
	@echo 'Note: DMD is the default compiler. Use the DCOMPILER parameter to switch. For example:'
	@echo ''
	@echo ' $$ make DCOMPILER=ldc2'
	@echo ''
	@echo 'Parameters:'
	@echo '==========='
	@echo 'DCOMPILER - Compiler to use. Defaults to DMD. Value can be a path.'
	@echo 'DFLAGS - Extra flags to pass to the compiler.'
	@echo 'LDC_HOME - The LDC install directory. If provided, all LDC binaries will be located in'
	@echo ' the bin directory inside LDC_HOME.'
	@echo 'LDC_LTO_RUNTIME - Turns on LTO for the runtime libraries (phobos, druntime). This uses'
	@echo ' LTO support built into the libraries shipped with LDC. Turn on with LDC_LTO_RUNTIME=1.'
	@echo ' LDC 1.9 or later required. Use LDC_BUILD_RUNTIME with early LDC releases.'
	@echo 'LDC_BUILD_RUNTIME - Turn on LTO for the runtime libraries by downloading the correct'
	@echo ' library source code and building. Turn on with LDC_BUILD_RUNTIME=1. LDC 1.5 or later'
	@echo ' required. When using LDC 1.9 or later LDC_LTO_RUNTIME is recommended instead.'
	@echo "LDC_LTO - LDC LTO options. One of 'thin', 'full', 'off', or 'default'. Leave unspecified"
	@echo ' to use the default (recommended).'
	@echo 'LDC_PGO - Turns on Profile Guided Optimization. This is available for a subset of apps,'
	@echo ' release builds with LDC_BUILD_RUNTIME=1 only. If LDC_PGO=1, PGO is used on the apps'
	@echo ' showing the largest performance benefits. If LDC_PGO=2, PGO is used on all apps it'
	@echo ' has been enabled for. Speed gains are smaller for the additional apps. PGO has'
	@echo ' longer build times. LDC_PGO=1 is a good compromise between build time and performance.'
	@echo "LDC_PGO_TYPE - Either 'IR' or 'AST'. Defaults to AST, and currently only AST is supported."
	@echo ' IR-PGO is anticipated in a future LDC release, but is not supported yet.'
	@echo ''

# Targets that are forwarded to every sub-directory (or every app directory).
release: make_subdirs
debug: make_subdirs
clean: make_subdirs
	-rm -f ./*.lst
clean-relics: make_subdirs
	-rm -f ./*.lst
clean-bin-relics: make_subdirs

test: make_subdirs
unittest: make_subdirs
test-debug: make_subdirs
test-release: make_subdirs
test-nobuild: make_appdirs

.PHONY: test-codecov
test-codecov: make_subdirs buildtools
	$(buildtools_dir)/aggregate-codecov $(CURDIR) $(subdirs:%=%/*.lst)
	$(buildtools_dir)/codecov-to-relative-paths $(CURDIR)/*.lst

apptest-codecov: make_appdirs
unittest-codecov: make_subdirs

.PHONY: make_subdirs $(subdirs)
make_subdirs: $(subdirs)

.PHONY: make_appdirs $(appdirs)
make_appdirs: $(appdirs)

$(subdirs):
	@echo ''
	@echo 'make -C $@ $(MAKECMDGOALS)'
	@$(MAKE) -C $@ $(MAKECMDGOALS)

# 'buildtools' is also the name of an existing directory. Without .PHONY make
# treats the target as up to date and never runs the recipe.
.PHONY: buildtools
buildtools:
	@echo ''
	@echo 'make -C $(buildtools_dir)'
	@$(MAKE) -C $(buildtools_dir)

.PHONY: package
package:
	@$(MAKE) -C $(CURDIR) clean
	@$(MAKE) -C $(CURDIR) release
	@$(MAKE) -C $(CURDIR) test-nobuild
	@$(MAKE) -C $(CURDIR) clean-bin-relics
	@echo ''
	@echo '---> Build successful. Creating package.'
	@echo ''
	-rm -rf $(PKG_DIR)
	mkdir $(PKG_DIR)
	cp -pr $(CURDIR)/bin $(PKG_DIR)
	cp -pr $(CURDIR)/bash_completion $(PKG_DIR)
	cp -pr $(CURDIR)/extras $(PKG_DIR)
	cp -pr $(CURDIR)/LICENSE.txt $(PKG_DIR)
	cp -pr $(buildtools_dir)/ReleasePackageReadme.txt $(PKG_DIR)
	tar -czf $(TAR_FILE) $(PKG_DIR)
	-rm -r $(PKG_DIR)
-------------------------------------------------------------------------------- /number-lines/README.md: -------------------------------------------------------------------------------- 1 | _Visit the eBay TSV utilities [main page](../README.md)_ 2 | 3 | # number-lines 4 | 5 | A simpler version of the Unix `nl` program.
It prepends a line number to each line read from files or standard input. This tool was written primarily as an example of a simple command line tool. The code structure it uses is the same as followed by all the other tools. Example: 6 | ``` 7 | $ number-lines myfile.txt 8 | ``` 9 | 10 | Despite its original purpose as a code sample, `number-lines` turns out to be quite convenient. It is often useful to add a unique row ID to a file, and this tool does this in a manner that maintains proper TSV formatting. 11 | 12 | See the [number-lines reference](../docs/tool_reference/number-lines.md) for further details. 13 | -------------------------------------------------------------------------------- /number-lines/dub.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "number-lines", 3 | "description": "Number lines", 4 | "homepage": "https://github.com/eBay/tsv-utils", 5 | "authors": ["Jon Degenhardt"], 6 | "copyright": "Copyright (c) 2015-2021, eBay Inc.", 7 | "license": "BSL-1.0", 8 | "targetType": "executable", 9 | "configurations": [ 10 | { 11 | "name" : "executable", 12 | "targetName": "number-lines", 13 | "targetPath": "../bin/", 14 | "mainSourceFile": "src/tsv_utils/number-lines.d", 15 | "dependencies": { 16 | "tsv-utils:common": { "path": ".." 
} 17 | } 18 | }, 19 | { 20 | "name": "unittest", 21 | "targetType": "none" 22 | } 23 | ], 24 | "buildTypes": { 25 | "debug": { "buildOptions": ["debugMode", "optimize"] }, 26 | "release": { "buildOptions": ["releaseMode", "optimize", "inline"], 27 | "dflags": ["-boundscheck=off"], 28 | "dflags-osx-ldc": ["-flto=thin"] } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /number-lines/makefile: -------------------------------------------------------------------------------- 1 | include ../makedefs.mk 2 | include ../makeapp.mk 3 | 4 | # No built-in unit tests 5 | unittest: ; 6 | unittest-codecov: ; 7 | -------------------------------------------------------------------------------- /number-lines/tests/empty-file.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eBay/tsv-utils/38ed0a1c31742bd8b59196517e89ff0b51e8fb80/number-lines/tests/empty-file.txt -------------------------------------------------------------------------------- /number-lines/tests/gold/error_tests_1.txt: -------------------------------------------------------------------------------- 1 | Error test set 1 2 | ---------------- 3 | 4 | ====[number-lines nosuchfile.txt]==== 5 | Error [number-lines]: Cannot open file `nosuchfile.txt' in mode `rb' (No such file or directory) 6 | 7 | ====[number-lines -d ß input1.txt]==== 8 | [number-lines] Error processing command line arguments: Invalid UTF-8 sequence (at index 1) 9 | 10 | ====[number-lines --nosuchparam input1.txt]==== 11 | [number-lines] Error processing command line arguments: Unrecognized option --nosuchparam 12 | -------------------------------------------------------------------------------- /number-lines/tests/input1.txt: -------------------------------------------------------------------------------- 1 | The first line - Is it a header? 
2 | abc def ghi 3 | some random text 4 | Japanese: 私はガラスを食べられます。それは私を傷つけません。 5 | 6 | Previous line blank 7 | 8 | Previous line a single tab 9 | -------------------------------------------------------------------------------- /number-lines/tests/input2.txt: -------------------------------------------------------------------------------- 1 | The first line 2 | The second line 3 | The third line 4 | -------------------------------------------------------------------------------- /number-lines/tests/one-line-file.txt: -------------------------------------------------------------------------------- 1 | The one line 2 | -------------------------------------------------------------------------------- /number-lines/tests/tests.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ $# -le 1 ]; then 4 | echo "Insufficient arguments. A program name and output directory are required." 5 | exit 1 6 | fi 7 | 8 | prog=$1 9 | shift 10 | odir=$1 11 | echo "Testing ${prog}, output to ${odir}" 12 | 13 | ## Three args: program, args, output file 14 | runtest () { 15 | echo "" >> $3 16 | echo "====[number-lines $2]====" >> $3 17 | $1 $2 >> $3 2>&1 18 | return 0 19 | } 20 | 21 | basic_tests_1=${odir}/basic_tests_1.txt 22 | 23 | echo "Basic tests set 1" > ${basic_tests_1} 24 | echo "-----------------" >> ${basic_tests_1} 25 | 26 | runtest ${prog} "input1.txt" ${basic_tests_1} 27 | runtest ${prog} "--start-number 10 input1.txt" ${basic_tests_1} 28 | runtest ${prog} "-n 10 input1.txt" ${basic_tests_1} 29 | runtest ${prog} "-n -10 input1.txt" ${basic_tests_1} 30 | runtest ${prog} "--header input1.txt" ${basic_tests_1} 31 | runtest ${prog} "--header-string LINENUM input1.txt" ${basic_tests_1} 32 | runtest ${prog} "-s LineNum_àßß input1.txt" ${basic_tests_1} 33 | runtest ${prog} "--header -s line_num input1.txt" ${basic_tests_1} 34 | runtest ${prog} "--delimiter : input1.txt" ${basic_tests_1} 35 | runtest ${prog} "-d _ 
input1.txt" ${basic_tests_1} 36 | runtest ${prog} "--header -d ^ input1.txt" ${basic_tests_1} 37 | runtest ${prog} "empty-file.txt" ${basic_tests_1} 38 | runtest ${prog} "-H empty-file.txt" ${basic_tests_1} 39 | 40 | echo "" >> ${basic_tests_1}; echo "====Multi-file Tests===" >> ${basic_tests_1} 41 | runtest ${prog} "input1.txt input2.txt empty-file.txt one-line-file.txt" ${basic_tests_1} 42 | runtest ${prog} "input1.txt one-line-file.txt input2.txt empty-file.txt" ${basic_tests_1} 43 | runtest ${prog} "empty-file.txt input1.txt one-line-file.txt input2.txt input1.txt" ${basic_tests_1} 44 | runtest ${prog} "-H input2.txt input2.txt input2.txt" ${basic_tests_1} 45 | runtest ${prog} "--header input1.txt input2.txt empty-file.txt one-line-file.txt" ${basic_tests_1} 46 | runtest ${prog} "--header -n 10 input1.txt one-line-file.txt input2.txt empty-file.txt" ${basic_tests_1} 47 | runtest ${prog} "--header -s LINENUM empty-file.txt input1.txt one-line-file.txt input2.txt input1.txt" ${basic_tests_1} 48 | 49 | echo "" >> ${basic_tests_1}; echo "====Tests using Standard Input===" >> ${basic_tests_1} 50 | ## runtest can't do these. Generate them directly. 
51 | 52 | echo "" >> ${basic_tests_1}; echo "====[cat input1.txt | number-lines]====" >> ${basic_tests_1} 53 | cat input1.txt | ${prog} >> ${basic_tests_1} 2>&1 54 | 55 | echo "" >> ${basic_tests_1}; echo "====[cat input1.txt input2.txt | number-lines --header]====" >> ${basic_tests_1} 56 | cat input1.txt input2.txt | ${prog} --header >> ${basic_tests_1} 2>&1 57 | 58 | echo "" >> ${basic_tests_1}; echo "====[cat input1.txt | number-lines -- input2.txt -]====" >> ${basic_tests_1} 59 | cat input1.txt | ${prog} -- input2.txt - >> ${basic_tests_1} 2>&1 60 | 61 | echo "" >> ${basic_tests_1}; echo "====[cat input1.txt | number-lines --header -- input2.txt -]====" >> ${basic_tests_1} 62 | cat input1.txt | ${prog} --header -- input2.txt - >> ${basic_tests_1} 2>&1 63 | 64 | echo "" >> ${basic_tests_1}; echo "====[cat input1.txt | number-lines -- input2.txt - one-line-file.txt]====" >> ${basic_tests_1} 65 | cat input1.txt | ${prog} -- input2.txt - one-line-file.txt >> ${basic_tests_1} 2>&1 66 | 67 | echo "" >> ${basic_tests_1}; echo "====[cat input1.txt | number-lines --header -- input2.txt - one-line-file.txt]====" >> ${basic_tests_1} 68 | cat input1.txt | ${prog} --header -- input2.txt - one-line-file.txt >> ${basic_tests_1} 2>&1 69 | 70 | ## --line-buffered tests 71 | echo "" >> ${basic_tests_1}; echo "====line buffered tests===" >> ${basic_tests_1} 72 | runtest ${prog} "--line-buffered input1.txt" ${basic_tests_1} 73 | runtest ${prog} "empty-file.txt" ${basic_tests_1} 74 | runtest ${prog} "-H empty-file.txt" ${basic_tests_1} 75 | runtest ${prog} "--line-buffered input1.txt input2.txt empty-file.txt one-line-file.txt" ${basic_tests_1} 76 | 77 | echo "" >> ${basic_tests_1}; echo "====[cat input1.txt input2.txt | number-lines --header --line-buffered]====" >> ${basic_tests_1} 78 | cat input1.txt input2.txt | ${prog} --header --line-buffered >> ${basic_tests_1} 2>&1 79 | 80 | ## Help and Version printing 81 | 82 | echo "" >> ${basic_tests_1} 83 | echo "Help and Version 
printing 1" >> ${basic_tests_1} 84 | echo "-----------------" >> ${basic_tests_1} 85 | echo "" >> ${basic_tests_1} 86 | 87 | echo "====[number-lines --help | grep -c Synopsis]====" >> ${basic_tests_1} 88 | ${prog} --help 2>&1 | grep -c Synopsis >> ${basic_tests_1} 2>&1 89 | 90 | echo "====[number-lines --version | grep -c 'number-lines (eBay/tsv-utils)']====" >> ${basic_tests_1} 91 | ${prog} --version 2>&1 | grep -c 'number-lines (eBay/tsv-utils)' >> ${basic_tests_1} 2>&1 92 | 93 | echo "====[number-lines -V | grep -c 'number-lines (eBay/tsv-utils)']====" >> ${basic_tests_1} 94 | ${prog} -V 2>&1 | grep -c 'number-lines (eBay/tsv-utils)' >> ${basic_tests_1} 2>&1 95 | 96 | ## Error cases 97 | 98 | error_tests_1=${odir}/error_tests_1.txt 99 | 100 | echo "Error test set 1" > ${error_tests_1} 101 | echo "----------------" >> ${error_tests_1} 102 | 103 | runtest ${prog} "nosuchfile.txt" ${error_tests_1} 104 | 105 | # Disable this test until Phobos 2.071 is available on all compilers 106 | # 2.071 changed the error message in a minor way. 107 | #runtest ${prog} "-n notanumber input1.txt" ${error_tests_1} 108 | 109 | runtest ${prog} "-d ß input1.txt" ${error_tests_1} 110 | runtest ${prog} "--nosuchparam input1.txt" ${error_tests_1} 111 | 112 | exit $? 113 | -------------------------------------------------------------------------------- /tsv-append/README.md: -------------------------------------------------------------------------------- 1 | _Visit the eBay TSV utilities [main page](../README.md)_ 2 | 3 | # tsv-append 4 | 5 | `tsv-append` concatenates multiple TSV files, similar to the Unix `cat` utility. It is header-aware, writing the header from only the first file. It also supports source tracking, adding a column indicating the original file to each row. 6 | 7 | Concatenation with header support is useful when preparing data for traditional Unix utilities like `sort` and `sed` or applications that read a single file. 
8 | 9 | Source tracking is useful when creating long/narrow form tabular data. This format is used by many statistics and data mining packages. (See [Wide & Long Data - Stanford University](https://stanford.edu/~ejdemyr/r-tutorials/wide-and-long/) or Hadley Wickham's [Tidy data](http://vita.had.co.nz/papers/tidy-data.html) for more info.) 10 | 11 | In this scenario, files have been used to capture related data sets, the difference between data sets being a condition represented by the file. For example, results from different variants of an experiment might each be recorded in their own files. Retaining the source file as an output column preserves the condition represented by the file. The source values default to the file names, but this can be customized. 12 | 13 | See the [tsv-append reference](../docs/tool_reference/tsv-append.md) for the complete list of options available. 14 | -------------------------------------------------------------------------------- /tsv-append/dub.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tsv-append", 3 | "description": "Concatenate TSV files. Header aware, with support for source file tracking.", 4 | "homepage": "https://github.com/eBay/tsv-utils", 5 | "authors": ["Jon Degenhardt"], 6 | "copyright": "Copyright (c) 2017-2021, eBay Inc.", 7 | "license": "BSL-1.0", 8 | "targetType": "executable", 9 | "configurations": [ 10 | { 11 | "name" : "executable", 12 | "targetName": "tsv-append", 13 | "targetPath": "../bin/", 14 | "mainSourceFile": "src/tsv_utils/tsv-append.d", 15 | "dependencies": { 16 | "tsv-utils:common": { "path": ".." 
} 17 | } 18 | }, 19 | { 20 | "name": "unittest", 21 | "targetType": "none" 22 | } 23 | ], 24 | "buildTypes": { 25 | "debug": { "buildOptions": ["debugMode", "optimize"] }, 26 | "release": { "buildOptions": ["releaseMode", "optimize", "inline"], 27 | "dflags": ["-boundscheck=off"], 28 | "dflags-osx-ldc": ["-flto=thin"] } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /tsv-append/makefile: -------------------------------------------------------------------------------- 1 | include ../makedefs.mk 2 | include ../makeapp.mk 3 | -------------------------------------------------------------------------------- /tsv-append/tests/empty-file.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eBay/tsv-utils/38ed0a1c31742bd8b59196517e89ff0b51e8fb80/tsv-append/tests/empty-file.txt -------------------------------------------------------------------------------- /tsv-append/tests/gold/error_tests_1.txt: -------------------------------------------------------------------------------- 1 | Error test set 1 2 | ---------------- 3 | 4 | ====[tsv-append no_such_file.tsv]==== 5 | Error [tsv-append]: Cannot open file `no_such_file.tsv' in mode `rb' (No such file or directory) 6 | 7 | ====[tsv-append -f none=no_such_file.tsv]==== 8 | Error [tsv-append]: Cannot open file `no_such_file.tsv' in mode `rb' (No such file or directory) 9 | 10 | ====[tsv-append -f]==== 11 | [tsv-append] Error processing command line arguments: Missing value for argument -f. 12 | 13 | ====[tsv-append -f file]==== 14 | [tsv-append] Error processing command line arguments: Invalid option value: '--f|file file'. Expected: '--f|file ='. 15 | 16 | ====[tsv-append -f source=]==== 17 | [tsv-append] Error processing command line arguments: Invalid option value: '--f|file source='. Expected: '--f|file ='. 
18 | 19 | ====[tsv-append -f =file]==== 20 | [tsv-append] Error processing command line arguments: Invalid option value: '--f|file =file'. Expected: '--f|file ='. 21 | 22 | ====[tsv-append --no-such-param input1x3.tsv]==== 23 | [tsv-append] Error processing command line arguments: Unrecognized option --no-such-param 24 | 25 | ====[tsv-append -d ß input1x3.tsv]==== 26 | [tsv-append] Error processing command line arguments: Invalid UTF-8 sequence (at index 1) 27 | -------------------------------------------------------------------------------- /tsv-append/tests/input1x3.tsv: -------------------------------------------------------------------------------- 1 | field1 2 | row 1 3 | row 2 4 | -------------------------------------------------------------------------------- /tsv-append/tests/input1x4.tsv: -------------------------------------------------------------------------------- 1 | field1 2 | next-empty 3 | 4 | last-line 5 | -------------------------------------------------------------------------------- /tsv-append/tests/input3x2.tsv: -------------------------------------------------------------------------------- 1 | field1 field2 field3 2 | abc def ghi 3 | -------------------------------------------------------------------------------- /tsv-append/tests/input3x5.tsv: -------------------------------------------------------------------------------- 1 | field1 field2 field3 2 | jkl mno pqr 3 | 123 456 789 4 | xy1 xy2 xy3 5 | pqx pqy pqz 6 | -------------------------------------------------------------------------------- /tsv-filter/dub.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tsv-filter", 3 | "description": "Filter lines in a tab-separated value file.", 4 | "homepage": "https://github.com/eBay/tsv-utils", 5 | "authors": ["Jon Degenhardt"], 6 | "copyright": "Copyright (c) 2015-2021, eBay Inc.", 7 | "license": "BSL-1.0", 8 | "targetType": "executable", 9 | 10 | "configurations": [ 11 | { 12 | "name" : 
"executable", 13 | "targetName": "tsv-filter", 14 | "targetPath": "../bin/", 15 | "mainSourceFile": "src/tsv_utils/tsv-filter.d", 16 | "dependencies": { 17 | "tsv-utils:common": { "path": ".." } 18 | } 19 | }, 20 | { 21 | "name": "unittest", 22 | "targetType": "none" 23 | } 24 | ], 25 | "buildTypes": { 26 | "debug": { "buildOptions": ["debugMode", "optimize"] }, 27 | "release": { "buildOptions": ["releaseMode", "optimize", "inline"], 28 | "dflags": ["-boundscheck=off"] } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /tsv-filter/makefile: -------------------------------------------------------------------------------- 1 | APP_USES_LDC_PGO=2 2 | 3 | include ../makedefs.mk 4 | 5 | # tsv-filter has issues with LTO on OS X with xcode 9.3, LDC 1.8.0. 6 | # LDC Issue 2585: https://github.com/ldc-developers/ldc/issues/2585 7 | # Issues go beyond issue 2585 discussion. Using LTO with only the app 8 | # fails with multiple compiler versions. Using LTO with LDC_BUILD_RUNTIME=1 9 | # works, but using LDC_LTO_RUNTIME=1 fails. Could be different linkers, 10 | # not clear. Fix by adding -disable-fp-elim to release_flags created by 11 | # makedefs.mk. 12 | # 13 | # Update - With LDC version 1.24 the -disable-fp-elim option was removed. 14 | # It is no longer supported by the default LLVM version 11. At present 15 | # there are no known cases of failure using LDC 1.24. Use of the flag is 16 | # now conditioned on the older compiler versions. 
17 | 18 | ifeq ($(shell uname -s),Darwin) 19 | ifeq ($(compiler_type),ldc) 20 | ifeq ($(ldc_version_major_minor),1.8) 21 | override release_flags += -disable-fp-elim 22 | else ifeq ($(ldc_version_major_minor),1.9) 23 | override release_flags += -disable-fp-elim 24 | else ifeq ($(ldc_version_major_minor),1.10) 25 | override release_flags += -disable-fp-elim 26 | else ifeq ($(ldc_version_major_minor),1.11) 27 | override release_flags += -disable-fp-elim 28 | else ifeq ($(ldc_version_major_minor),1.12) 29 | override release_flags += -disable-fp-elim 30 | else ifeq ($(ldc_version_major_minor),1.13) 31 | override release_flags += -disable-fp-elim 32 | else ifeq ($(ldc_version_major_minor),1.14) 33 | override release_flags += -disable-fp-elim 34 | else ifeq ($(ldc_version_major_minor),1.15) 35 | override release_flags += -disable-fp-elim 36 | else ifeq ($(ldc_version_major_minor),1.16) 37 | override release_flags += -disable-fp-elim 38 | else ifeq ($(ldc_version_major_minor),1.17) 39 | override release_flags += -disable-fp-elim 40 | else ifeq ($(ldc_version_major_minor),1.18) 41 | override release_flags += -disable-fp-elim 42 | else ifeq ($(ldc_version_major_minor),1.19) 43 | override release_flags += -disable-fp-elim 44 | else ifeq ($(ldc_version_major_minor),1.20) 45 | override release_flags += -disable-fp-elim 46 | else ifeq ($(ldc_version_major_minor),1.21) 47 | override release_flags += -disable-fp-elim 48 | else ifeq ($(ldc_version_major_minor),1.22) 49 | override release_flags += -disable-fp-elim 50 | else ifeq ($(ldc_version_major_minor),1.23) 51 | override release_flags += -disable-fp-elim 52 | endif 53 | endif 54 | endif 55 | 56 | include ../makeapp.mk 57 | 58 | # No built-in unit tests 59 | unittest: ; 60 | unittest-codecov: ; 61 | -------------------------------------------------------------------------------- /tsv-filter/profile_data/profile_data_4.tsv: -------------------------------------------------------------------------------- 1 | numeric_1 numeric_2 
mixed_1 char_1 char_2 2 | 0.0 100 abc def 3 | -103.45 99.5 abc abcd abcdef 4 | 22 87 nan ababab abcabcabc 5 | 45.9 -23.5 2.37e+07 hbdacdrd YzyxY 6 | 30 31 "abc" XYZ 7 | 40 41 ab"ab" xxyyzz 8 | 50 51 62 ABCDABAB XXYYZZ 9 | 60 61 NAN abc XX 10 | 70 nan xyz abc YY 11 | 80 nan -99 bb SS 12 | 90 -1000.12 pqr cc TT 13 | -10 -11 14 cc UU 14 | -20 -21 111 cc VV 15 | -30 -31 76 dd WW 16 | -40 -41 77 cdabab RR 17 | -50 -51 78 xabcaby SS 18 | -60 -61 ee STXYxyST 19 | -70 -71 6.23e-03 bababa ST 20 | -80 -81 hhg ef MM 21 | -90 -91 pqr ed MN 22 | 1 2e+03 2e+04 ab XYZ 23 | 2 3e+03 3e+04 bc XYXY 24 | 3 4e+03 4e+04 ab ZXY 25 | 4 5e+03 5e+04 bc YXZ 26 | 5 inf nan abcdabcdab VV 27 | 6 15 nan abcdabcdab SW 28 | 7 17 inf dbcdde WS 29 | 8 18 nan abcdabcdab TT 30 | 9 22.5 gh cab RR 31 | 1.0 2.0e+03 2.0e+04 ab XYZ 32 | 2.0 3.0e+03 3.0e+04 bc XYXY 33 | 3.0 4.0e+03 4.0e+04 ab ZXY 34 | 4.0 5.0e+03 5.0e+04 bc YXZ 35 | 5.0 nan nan abcdabcdab VV 36 | 6.0 15.0 inf abcdabcdab SW 37 | 7.0 17.0 111 dbcdde WS 38 | 8.0 18.0 nan abcdabcdab TT 39 | 9.0 22.5 gh cab RR 40 | 1.1 2.0e+03 2.0e+04 ab XYZ 41 | 2.1 3.0e+03 3.0e+04 bc XYXY 42 | 3.1 inf 4.0e+04 ab ZXY 43 | 4.1 5.0e+03 5.0e+04 bc YXZ 44 | 5.1 nan nan abcdabcdab VV 45 | 6.1 15.1 nan abcdabcdab SW 46 | 7.1 17.1 111 dbcdde WS 47 | 8.1 18.1 nan abcdabcdab TT 48 | 9.1 22.5 gh cab RR 49 | 1.2 2.0e+03 2.0e+04 ab XYZ 50 | 2.2 3.0e+03 3.0e+04 bc XYXY 51 | 3.2 4.0e+03 4.0e+04 ab ZXY 52 | 4.2 5.0e+03 5.0e+04 bc YXZ 53 | 5.2 nan nan abcdabcdab VV 54 | 6.2 15.2 nan abcdabcdab SW 55 | 7.2 17.2 111 dbcdde WS 56 | 8.2 18.2 nan abcdabcdab TT 57 | 9.2 22.5 gh cab RR 58 | 1.3 2.0e+03 2.0e+04 ab XYZ 59 | 2.3 3.0e+03 3.0e+04 bc XYXY 60 | 3.3 4.0e+03 4.0e+04 ab ZXY 61 | 4.3 5.0e+03 5.0e+04 bc YXZ 62 | 5.3 nan nan abcdabcdab VV 63 | 6.3 15.3 nan abcdabcdab SW 64 | 7.3 17.3 111 dbcdde WS 65 | 8.3 18.3 nan abcdabcdab TT 66 | 9.3 22.5 gh cab RR 67 | 1.4 2.0e+03 2.0e+04 ab XYZ 68 | 2.4 3.0e+03 3.0e+04 dcbab XYXY 69 | 3.4 4.0e+03 4.0e+04 ab ZXY 70 | 4.4 5.0e+03 5.0e+04 
dcbab YXZ 71 | 5.4 nan nan adcbabdadcbabdab VV 72 | 6.4 15.4 nan adcbabdadcbabdab SW 73 | 7.4 17.4 111 ddcbabdde WS 74 | 8.4 18.4 nan adcbabdadcbabdab TT 75 | 9.4 22.5 gh cab RR 76 | 1.5 2.0e+03 2.0e+04 ab XYZ 77 | 2.5 3.0e+03 3.0e+04 dcbab XYXY 78 | 3.5 4.0e+03 4.0e+04 ab ZXY 79 | 4.5 5.0e+03 5.0e+04 dcbab YXZ 80 | 5.5 nan nan adcbabdadcbabdab VV 81 | 6.5 15.5 nan adcbabdadcbabdab SW 82 | 7.5 17.5 111 ddcbabdde WS 83 | 8.5 18.5 nan adcbabdadcbabdab TT 84 | 9.5 22.5 gh cab RR 85 | 1.6 2.0e+03 2.0e+04 ab XYZ 86 | 2.6 3.0e+03 3.0e+04 dcbab XYXY 87 | 3.6 4.0e+03 4.0e+04 ab ZXY 88 | 4.6 5.0e+03 5.0e+04 dcbab YXZ 89 | 5.6 nan nan adcbabdadcbabdab VV 90 | 6.6 15.6 nan adcbabdadcbabdab SW 91 | 7.6 17.6 111 ddcbabdde WS 92 | 8.6 18.6 nan adcbabdacabdab TT 93 | 9.6 22.5 gh cab RR 94 | 10 20 2.0e+04 ab XYZ 95 | 20 30 3.0e+04 cab XYXY 96 | 30 40 4.0e+04 ab ZXY 97 | 40 50 5.0e+04 cab YXZ 98 | 50 nan nan acabdacabdab VV 99 | 60 15.0 nan acabdacabdab SW 100 | 70 17.0 111 dbcdde WS 101 | 80 18.0 nan abcdabcdab TT 102 | 90 22.5 gh eghb RR 103 | 10 20 2.0e+04 ab xyz 104 | 20 30 3.0e+04 bc xyxy 105 | 30 40 4.0e+04 ab zxy 106 | 40 50 5.0e+04 bc YXZ 107 | 50 nan nan abcdabcdab VV 108 | 60 15.0 nan abcdabcdab SW 109 | 70 17.0 111 dbcdde WS 110 | 80 18.0 nan abcdabcdab TT 111 | 90 22.5 gh eghb RR 112 | 10 21 2.0e+04 ab XYZ 113 | 20 31 3.0e+04 bc us 114 | 30 41 22 efabef TS 115 | 40 51 33 efcdab SS 116 | 50 21 nan habba VV 117 | 60 nan nan faabaaab RT 118 | 70 nan dbcdde QT 119 | 80 18.1 nan cacca TR 120 | 90 41 gh habg RS 121 | 10 41 2.0e+04 ab XYXXXZ 122 | 20 21 3.0e+04 bc US 123 | 30 31 22 efabef TS 124 | 40 311 33 efcdab SS 125 | 50 41 nan habba VV 126 | 60 nan nan faabaaab RT 127 | 70 nan dbcdde QT 128 | 80 15.3 nan cacca TR 129 | 90 41 gh habg RS 130 | 10 21 2.0e+04 ab XYZ 131 | 20 31 3.0e+04 bc US 132 | 30 41 22 efabef TS 133 | 40 51 33 efcdab SS 134 | 50 21 nan habba VV 135 | 60 nan nan faabaaab RT 136 | 70 nan dbcdde QT 137 | 80 18.1 nan fgcfg TR 138 | 90 41 gh habg RS 139 | 10 
41 2.0e+04 ab XYZZ 140 | 20 21 3.0e+04 bc US 141 | 30 31 22 efabef TS 142 | 40 311 33 efcdab SS 143 | 50 41 nan habba VV 144 | 60 nan nan faabaaab RT 145 | 70 nan dbcdde QT 146 | 80 15.3 nan fdagfcfg TR 147 | 90 41 gh habg RS 148 | 10 211 2.0e+04 ab XYZ 149 | 20 311 3.0e+04 bc US 150 | 30 411 22 efabef TS 151 | 40 511 33 efcdab SS 152 | 50 21 nan habba VV 153 | 60 nan nan faabaaab RT 154 | 70 nan dbcdde QT 155 | 80 18.1 nan fdagfcfg TR 156 | 90 41 gh habg RS 157 | 10 41 2.0e+04 ab XyXyz 158 | 20 211 3.0e+04 bc US 159 | 30 31 22 efabef TS 160 | 40 311 33 efcdab SS 161 | 50 41 nan habba VV 162 | 60 nan nan faabaaab RT 163 | 70 nan dbcdde QT 164 | 80 15.3 nan fdagfcfg TR 165 | 90 41 gh habg RS 166 | -------------------------------------------------------------------------------- /tsv-filter/tests/input1.dos_tsv: -------------------------------------------------------------------------------- 1 | F1 F2 F3 F4 2 | 1 1.0 a A 3 | 2 2. b B 4 | 10 10.1 abc ABC 5 | 100 100 abc AbC 6 | 0 0.0 z AzB 7 | -1 -0.1 abc def abc def 8 | -2 -2.0 ß ss 9 | 0. 100. àbc ÀBC 10 | 0.0 100.0 àßc ÀssC 11 | -0.0 -100.0 àßc ÀSSC 12 | 100 100 AbC 13 | 100 100 abc 14 | 100 101 15 | 100 102 abc AbC 16 | 100 103 abc AbC 17 | -------------------------------------------------------------------------------- /tsv-filter/tests/input1.tsv: -------------------------------------------------------------------------------- 1 | F1 F2 F3 F4 2 | 1 1.0 a A 3 | 2 2. b B 4 | 10 10.1 abc ABC 5 | 100 100 abc AbC 6 | 0 0.0 z AzB 7 | -1 -0.1 abc def abc def 8 | -2 -2.0 ß ss 9 | 0. 100. àbc ÀBC 10 | 0.0 100.0 àßc ÀssC 11 | -0.0 -100.0 àßc ÀSSC 12 | 100 100 AbC 13 | 100 100 abc 14 | 100 101 15 | 100 102 abc AbC 16 | 100 103 abc AbC 17 | -------------------------------------------------------------------------------- /tsv-filter/tests/input1_noheader.tsv: -------------------------------------------------------------------------------- 1 | 1 1.0 a A 2 | 2 2. 
b B 3 | 10 10.1 abc ABC 4 | 100 100 abc AbC 5 | 0 0.0 z AzB 6 | -1 -0.1 abc def abc def 7 | -2 -2.0 ß ss 8 | 0. 100. àbc ÀBC 9 | 0.0 100.0 àßc ÀssC 10 | -0.0 -100.0 àßc ÀSSC 11 | 100 100 AbC 12 | 100 100 abc 13 | 100 101 14 | 100 102 abc AbC 15 | 100 103 abc AbC 16 | -------------------------------------------------------------------------------- /tsv-filter/tests/input2.tsv: -------------------------------------------------------------------------------- 1 | F1 F2 F3 F4 2 | 1000 1000.0 3 empty 3 | 1000 1000.0 3 1-space 4 | 1000 1000.001 3 2-spaces 5 | 1000 1001 abc 3 no space 6 | 1000 999.999 abc 3 space prefix 7 | 1000 999 abc 3 space suffix 8 | 1000 999.9999 a 3 space prefix&suffix 9 | 999.999 1000 x x 10 | 999.999 1000.999 x x 11 | 1000 1001.1 x x 12 | -999.99 -1000 x x 13 | -999.98 -1000 x x 14 | -999.99 1000 x x 15 | 999.99 -1000 x x 16 | -------------------------------------------------------------------------------- /tsv-filter/tests/input2_pipe-sep.tsv: -------------------------------------------------------------------------------- 1 | F1|F2|F3|F4 2 | 1|1.0|a|A 3 | 2|2.|b|B 4 | 10|10.1|abc|ABC 5 | 100|100|abc|AbC 6 | 0|0.0|z|AzB 7 | -1|-0.1|abc def|abc def 8 | -2|-2.0|ß|ss 9 | 0.|100.|àbc|ÀBC 10 | 0.0|100.0|àßc|ÀssC 11 | -0.0|-100.0|àßc|ÀSSC 12 | 100|100||AbC 13 | 100|100|abc| 14 | 100|101|| 15 | 100|102|abc|AbC 16 | 100|103|abc|AbC 17 | -------------------------------------------------------------------------------- /tsv-filter/tests/input4.tsv: -------------------------------------------------------------------------------- 1 | line 2_apha 3_apha 4_num 5_num 6_num 7_alpha 8_num 9_num 2 | 1 abc def 10 20 30 ghi 40 50 3 | 2 abcd abc 20 5 35 bcd 15 40 4 | 3 cde de 35 45 55 bcdef 10 25 5 | 4 aadd aabdd 10 30 15 abd 25 25 6 | 5 ad 30 35 25 bcdef 40 15 7 | 6 -10 -5 -25 -15 -30 8 | 7 bcf cc -20 -50 0 abc 0 -5 9 | 8 bd 10 20 40 bcd 15 25 10 | 9 0 0 0 0 0 11 | 10 ABCD ABC 20 5 35 BCD 15 40 12 | 11 AADD AABDD 10 30 15 ABD 25 25 13 | 
-------------------------------------------------------------------------------- /tsv-filter/tests/input_3x0.tsv: -------------------------------------------------------------------------------- 1 | f1 f2 f3 2 | -------------------------------------------------------------------------------- /tsv-filter/tests/input_3x1.tsv: -------------------------------------------------------------------------------- 1 | f1 f2 f3 2 | 3x1-r1 201 301 3 | -------------------------------------------------------------------------------- /tsv-filter/tests/input_3x2.tsv: -------------------------------------------------------------------------------- 1 | f1 f2 f3 2 | 3x2-r1 2001 3001 3 | 3x2-r2 2002 3002 4 | -------------------------------------------------------------------------------- /tsv-filter/tests/input_3x3.tsv: -------------------------------------------------------------------------------- 1 | f1 f2 f3 2 | 3x3-r1 21 31 3 | 3x3-r2 22 32 4 | 3x3-r3 23 33 5 | -------------------------------------------------------------------------------- /tsv-filter/tests/input_emptyfile.tsv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eBay/tsv-utils/38ed0a1c31742bd8b59196517e89ff0b51e8fb80/tsv-filter/tests/input_emptyfile.tsv -------------------------------------------------------------------------------- /tsv-filter/tests/input_num_or_empty.tsv: -------------------------------------------------------------------------------- 1 | f1 f2 f3 2 | 100 21 31 3 | 22 32 4 | 23 33 5 | 100 24 33 6 | none 25 34 7 | -------------------------------------------------------------------------------- /tsv-filter/tests/input_numeric_tests.tsv: -------------------------------------------------------------------------------- 1 | f1 f2 2 | 1 nan 3 | 2 NaN 4 | 3 NAN 5 | 4 inf 6 | 5 -inf 7 | 6 INF 8 | 7 9 | 8 abc 10 | 9 23 11 | 10 -33.5 12 | 11 42.5 13 | 12 +45 14 | 13 .19 15 | 14 -.20 16 | 15 9e+02 17 | 16 8E-17 18 | 
-------------------------------------------------------------------------------- /tsv-filter/tests/input_onefield.txt: -------------------------------------------------------------------------------- 1 | no header 2 | no real fields, just some text 3 | abc def 4 | abc def 5 | abc def ghi 6 | 7 | previous line empty 8 | 9 | 10 | 11 | previous line empty, 2-back 1 space, 3-back 2 spaces 12 | 13 | 14 | 15 | previous line empty, 2-back 1 space, 3-back 2 spaces 16 | 17 | last line 18 | -------------------------------------------------------------------------------- /tsv-filter/tests/input_unicode.tsv: -------------------------------------------------------------------------------- 1 | Language Text 1 Text 2 Text 3 2 | English snow storm soccer player town hall 3 | Chinese (Simplified) 雪风暴 足球运动员 市政厅 4 | Chinese (Traditional) 雪風暴 足球運動員 市政廳 5 | French Tempête de neige joueur de foot mairie 6 | Georgian თოვლის ქარიშხალი ფეხბურთის მოთამაშე მუნიციპალიტეტი 7 | German Schneesturm Fußballspieler Rathaus 8 | Greek Χιονοθύελλα ποδοσφαιριστής Δημαρχείο 9 | Japanese 吹雪 サッカー選手 町役場 10 | Russian Снежная буря футболист ратуша 11 | Spanish Tormenta de nieve jugador de fútbol Ayuntamiento 12 | Vietnamese Bão tuyết cầuthủ bóng đá Thị trấn 13 | Mixed1 a-雪 a abcd 14 | Mixed2 ab-雪雪 ab abc 15 | Mixed3 abc-雪 abc ab 16 | Mixed4 abcd-雪雪 abcd a 17 | Mixed5 a-雪 abcde abcd 18 | Mixed6 ab-雪 雪 abc 19 | Mixed7 abc-雪 雪雪 ab 20 | Mixed8 abcd-雪 雪雪雪 a 21 | Mixed9 a-雪 雪雪雪雪 abcd 22 | Mixed10 ab-雪 雪雪雪雪雪 abc 23 | Mixed11 abc-雪 षि ab 24 | Mixed12 abcd-雪 षिषि a 25 | Mixed13 a-雪 षिषिषि abcd 26 | Mixed14 ab-雪 षिषिषिषि abc 27 | Mixed15 abc-雪 षिषिषिषिषि ab 28 | Mixed16 abcd-雪 aषि雪 a 29 | -------------------------------------------------------------------------------- /tsv-filter/tests/test-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "output_files" : [ 3 | { 4 | "name" : "basic_tests_1.txt" 5 | }, 6 | { 7 | "name" : "error_tests_1.txt", 8 | "versions" : [ 9 | 
"error_tests_1.2081.txt" 10 | ] 11 | } 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- /tsv-join/README.md: -------------------------------------------------------------------------------- 1 | _Visit the eBay TSV utilities [main page](../README.md)_ 2 | 3 | # tsv-join 4 | 5 | Joins lines from multiple files based on a common key. One file, the 'filter' file, contains the records (lines) being matched. The other input files are scanned for matching records. Matching records are written to standard output, along with any designated fields from the filter file. In database parlance this is a hash semi-join. This is similar to the "stream-static" joins available in Spark Structured Streaming and "KStream-KTable" joins in Kafka. (The filter file plays the same role as the Spark static dataset or Kafka KTable.) 6 | 7 | Example: 8 | ``` 9 | $ tsv-join -H --filter-file filter.tsv --key-fields Country,City --append-fields Population,Elevation data.tsv 10 | ``` 11 | 12 | This reads `filter.tsv`, creating a lookup table keyed on the `Country` and `City` fields. `data.tsv` is then read, and lines with a matching key are written to standard output with the `Population` and `Elevation` fields from `filter.tsv` appended. This is an inner join. Left outer joins and anti-joins are also supported. 13 | 14 | Common uses for `tsv-join` are to join related datasets or to filter one dataset based on another. Filter file entries are kept in memory, which limits the ultimate size that can be handled effectively. The author has found that filter files up to about 10 million lines are processed effectively, but performance starts to degrade after that. 15 | 16 | See the [tsv-join reference](../docs/tool_reference/tsv-join.md) for details. 
17 | -------------------------------------------------------------------------------- /tsv-join/dub.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tsv-join", 3 | "description": "Join lines in tab-separated value files.", 4 | "homepage": "https://github.com/eBay/tsv-utils", 5 | "authors": ["Jon Degenhardt"], 6 | "copyright": "Copyright (c) 2015-2021, eBay Inc.", 7 | "license": "BSL-1.0", 8 | "targetType": "executable", 9 | "configurations": [ 10 | { 11 | "name" : "executable", 12 | "targetName": "tsv-join", 13 | "targetPath": "../bin/", 14 | "mainSourceFile": "src/tsv_utils/tsv-join.d", 15 | "dependencies": { 16 | "tsv-utils:common": { "path": ".." } 17 | } 18 | }, 19 | { 20 | "name": "unittest", 21 | "targetType": "none" 22 | } 23 | ], 24 | "buildTypes": { 25 | "debug": { "buildOptions": ["debugMode", "optimize"] }, 26 | "release": { "buildOptions": ["releaseMode", "optimize", "inline"], 27 | "dflags": ["-boundscheck=off"], 28 | "dflags-osx-ldc": ["-flto=thin"] } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /tsv-join/makefile: -------------------------------------------------------------------------------- 1 | include ../makedefs.mk 2 | include ../makeapp.mk 3 | 4 | # No built-in unit tests 5 | unittest: ; 6 | unittest-codecov: ; 7 | -------------------------------------------------------------------------------- /tsv-join/tests/input1.dos_tsv: -------------------------------------------------------------------------------- 1 | f1 f2 f3 f4 f5 2 | 1 ggg UUU 101 15 3 | 2 bbb ZZZ 21 28 4 | 3 nnn GGG 336 3 5 | 4 vvv VVV 43 403 6 | 5 ggg CCC 5734 52 7 | 6 ddd ZZZ 65 602 8 | 7 ßßß SSS 7 771 9 | 8 vv v 85 832 10 | 9 v vv 97 91 11 | 10 GGG nnn 101 102 12 | 11 v12 PPP 1123 1167 13 | 12 àbc P 1209 1234 14 | 13 ÀSSC 1367 1331 15 | 14 -1 1 1489 1421 16 | 15 B 1522 1567 17 | 16 1634 1602 18 | 17 0 X 1721 1703 19 | 18 a g ß Y 1845 1801 20 | 19 %tg-0 Z 1931 1956 21 | 
20 sp-sp 2020 22 | -------------------------------------------------------------------------------- /tsv-join/tests/input1.tsv: -------------------------------------------------------------------------------- 1 | f1 f2 f3 f4 f5 2 | 1 ggg UUU 101 15 3 | 2 bbb ZZZ 21 28 4 | 3 nnn GGG 336 3 5 | 4 vvv VVV 43 403 6 | 5 ggg CCC 5734 52 7 | 6 ddd ZZZ 65 602 8 | 7 ßßß SSS 7 771 9 | 8 vv v 85 832 10 | 9 v vv 97 91 11 | 10 GGG nnn 101 102 12 | 11 v12 PPP 1123 1167 13 | 12 àbc P 1209 1234 14 | 13 ÀSSC 1367 1331 15 | 14 -1 1 1489 1421 16 | 15 B 1522 1567 17 | 16 1634 1602 18 | 17 0 X 1721 1703 19 | 18 a g ß Y 1845 1801 20 | 19 %tg-0 Z 1931 1956 21 | 20 sp-sp 2020 22 | -------------------------------------------------------------------------------- /tsv-join/tests/input1_noheader.tsv: -------------------------------------------------------------------------------- 1 | 1 ggg UUU 101 15 2 | 2 bbb ZZZ 21 28 3 | 3 nnn GGG 336 3 4 | 4 vvv VVV 43 403 5 | 5 ggg CCC 5734 52 6 | 6 ddd ZZZ 65 602 7 | 7 ßßß SSS 7 771 8 | 8 vv v 85 832 9 | 9 v vv 97 91 10 | 10 GGG nnn 101 102 11 | 11 v12 PPP 1123 1167 12 | 12 àbc P 1209 1234 13 | 13 ÀSSC 1367 1331 14 | 14 -1 1 1489 1421 15 | 15 B 1522 1567 16 | 16 1634 1602 17 | 17 0 X 1721 1703 18 | 18 a g ß Y 1845 1801 19 | 19 %tg-0 Z 1931 1956 20 | 20 sp-sp 2020 21 | -------------------------------------------------------------------------------- /tsv-join/tests/input1_rotated.tsv: -------------------------------------------------------------------------------- 1 | f5 f1 f2 f3 f4 2 | 15 1 ggg UUU 101 3 | 28 2 bbb ZZZ 21 4 | 3 3 nnn GGG 336 5 | 403 4 vvv VVV 43 6 | 52 5 ggg CCC 5734 7 | 602 6 ddd ZZZ 65 8 | 771 7 ßßß SSS 7 9 | 832 8 vv v 85 10 | 91 9 v vv 97 11 | 102 10 GGG nnn 101 12 | 1167 11 v12 PPP 1123 13 | 1234 12 àbc P 1209 14 | 1331 13 ÀSSC 1367 15 | 1421 14 -1 1 1489 16 | 1567 15 B 1522 17 | 1602 16 1634 18 | 1703 17 0 X 1721 19 | 1801 18 a g ß Y 1845 20 | 1956 19 %tg-0 Z 1931 21 | 2020 20 sp-sp 22 | 
-------------------------------------------------------------------------------- /tsv-join/tests/input2.dos_tsv: -------------------------------------------------------------------------------- 1 | f1 f2 f3 f4 f5 2 | 1 ggg UUU 101b 15b 3 | 2 bbb ZZZ 21 28 4 | 3 nnn GGG 336b 3b 5 | 4 vvv VVV 43b 403b 6 | 5 ggg CCC 5734b 52b 7 | 6 ddd ZZZ 65b 602b 8 | 7 ßßß SSS 7b 771b 9 | 8 vv v 85b 832b 10 | 9 v vv 97 91 11 | 10 GGG nnn 101 102 12 | 11 v12 PPP 1123b 1167b 13 | 12 àbc P 1209b 1234b 14 | 13 ÀSSC 1367b 1331b 15 | 14 -1 1 1489b 1421b 16 | 15 B 1522b 1567b 17 | 16 1634b 1602b 18 | 17 0 X 1721b 1703 19 | 18 a g ß Y 1845b 1801b 20 | 19 %tg-0 Z 1931b 1956b 21 | 20 bbb ZZZ 21 28 22 | 21 bbb ZZZ 21 28 23 | 22 ddd ZZZ 65b 602b 24 | 23 v12 PPP 1123b 1167b 25 | 24 ÀSSC 1367b 1331b 26 | 25 0 X 1721b 1703 27 | 26 bbb ZZZ 21 28 28 | 27 xa gg 44 45 29 | 28 xb gh 45 46 30 | 29 xc gi 46 47 31 | 30 xd gj 47 48 32 | 31 sp-sp 2020b 33 | 32 xe gk 48 49 34 | -------------------------------------------------------------------------------- /tsv-join/tests/input2.tsv: -------------------------------------------------------------------------------- 1 | f1 f2 f3 f4 f5 2 | 1 ggg UUU 101b 15b 3 | 2 bbb ZZZ 21 28 4 | 3 nnn GGG 336b 3b 5 | 4 vvv VVV 43b 403b 6 | 5 ggg CCC 5734b 52b 7 | 6 ddd ZZZ 65b 602b 8 | 7 ßßß SSS 7b 771b 9 | 8 vv v 85b 832b 10 | 9 v vv 97 91 11 | 10 GGG nnn 101 102 12 | 11 v12 PPP 1123b 1167b 13 | 12 àbc P 1209b 1234b 14 | 13 ÀSSC 1367b 1331b 15 | 14 -1 1 1489b 1421b 16 | 15 B 1522b 1567b 17 | 16 1634b 1602b 18 | 17 0 X 1721b 1703 19 | 18 a g ß Y 1845b 1801b 20 | 19 %tg-0 Z 1931b 1956b 21 | 20 bbb ZZZ 21 28 22 | 21 bbb ZZZ 21 28 23 | 22 ddd ZZZ 65b 602b 24 | 23 v12 PPP 1123b 1167b 25 | 24 ÀSSC 1367b 1331b 26 | 25 0 X 1721b 1703 27 | 26 bbb ZZZ 21 28 28 | 27 xa gg 44 45 29 | 28 xb gh 45 46 30 | 29 xc gi 46 47 31 | 30 xd gj 47 48 32 | 31 sp-sp 2020b 33 | 32 xe gk 48 49 34 | -------------------------------------------------------------------------------- 
/tsv-join/tests/input2_noheader.tsv: -------------------------------------------------------------------------------- 1 | 1 ggg UUU 101b 15b 2 | 2 bbb ZZZ 21 28 3 | 3 nnn GGG 336b 3b 4 | 4 vvv VVV 43b 403b 5 | 5 ggg CCC 5734b 52b 6 | 6 ddd ZZZ 65b 602b 7 | 7 ßßß SSS 7b 771b 8 | 8 vv v 85b 832b 9 | 9 v vv 97 91 10 | 10 GGG nnn 101 102 11 | 11 v12 PPP 1123b 1167b 12 | 12 àbc P 1209b 1234b 13 | 13 ÀSSC 1367b 1331b 14 | 14 -1 1 1489b 1421b 15 | 15 B 1522b 1567b 16 | 16 1634b 1602b 17 | 17 0 X 1721b 1703 18 | 18 a g ß Y 1845b 1801b 19 | 19 %tg-0 Z 1931b 1956b 20 | 20 bbb ZZZ 21 28 21 | 21 bbb ZZZ 21 28 22 | 22 ddd ZZZ 65b 602b 23 | 23 v12 PPP 1123b 1167b 24 | 24 ÀSSC 1367b 1331b 25 | 25 0 X 1721b 1703 26 | 26 bbb ZZZ 21 28 27 | 27 xa gg 44 45 28 | 28 xb gh 45 46 29 | 29 xc gi 46 47 30 | 30 xd gj 47 48 31 | 31 sp-sp 2020b 32 | 32 xe gk 48 49 33 | -------------------------------------------------------------------------------- /tsv-join/tests/input_1x5.tsv: -------------------------------------------------------------------------------- 1 | fa 2 | ggg 3 | a b c d e 4 | PPP 5 | 3 6 | v 7 | -------------------------------------------------------------------------------- /tsv-join/tests/input_2x3_colon.tsv: -------------------------------------------------------------------------------- 1 | col a:col b 2 | 101:501 3 | 432:12 4 | 13:503 5 | -------------------------------------------------------------------------------- /tsv-join/tests/input_5x4_colon.tsv: -------------------------------------------------------------------------------- 1 | Field A:Field B:Field C:Field D:Field E 2 | 13:hello world:fast:432:303 3 | 55: abc:501:892:101 4 | 101:501:432:12:13 5 | 7: ßßß ÀSSC:slow:432:303 6 | -------------------------------------------------------------------------------- /tsv-join/tests/input_emptyfile.tsv: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/eBay/tsv-utils/38ed0a1c31742bd8b59196517e89ff0b51e8fb80/tsv-join/tests/input_emptyfile.tsv -------------------------------------------------------------------------------- /tsv-pretty/README.md: -------------------------------------------------------------------------------- 1 | _Visit the eBay TSV utilities [main page](../README.md)_ 2 | 3 | # tsv-pretty 4 | 5 | tsv-pretty prints TSV data in an aligned format for better readability when working on the command-line. Text columns are left aligned, numeric columns are right aligned. Floats are aligned on the decimal point, and precision can be specified. Header lines are detected automatically. If desired, the header line can be repeated at regular intervals. 6 | 7 | An example, first printed without formatting: 8 | ``` 9 | $ cat sample.tsv 10 | Color Count Ht Wt 11 | Brown 106 202.2 1.5 12 | Canary Yellow 7 106 0.761 13 | Chartreuse 1139 77.02 6.22 14 | Fluorescent Orange 422 1141.7 7.921 15 | Grey 19 140.3 1.03 16 | ``` 17 | Now with `tsv-pretty`, using header underlining and float formatting: 18 | ``` 19 | $ tsv-pretty -u -f sample.tsv 20 | Color Count Ht Wt 21 | ----- ----- -- -- 22 | Brown 106 202.20 1.500 23 | Canary Yellow 7 106.00 0.761 24 | Chartreuse 1139 77.02 6.220 25 | Fluorescent Orange 422 1141.70 7.921 26 | Grey 19 140.30 1.030 27 | ``` 28 | See the [tsv-pretty reference](../docs/tool_reference/tsv-pretty.md) for details. 
29 | -------------------------------------------------------------------------------- /tsv-pretty/dub.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tsv-pretty", 3 | "description": "Print TSV data aligned for easier reading on consoles and traditional command-line environments.", 4 | "homepage": "https://github.com/eBay/tsv-utils", 5 | "authors": ["Jon Degenhardt"], 6 | "copyright": "Copyright (c) 2017-2021, eBay Inc.", 7 | "license": "BSL-1.0", 8 | "targetType": "executable", 9 | "configurations": [ 10 | { 11 | "name" : "executable", 12 | "targetName": "tsv-pretty", 13 | "targetPath": "../bin/", 14 | "mainSourceFile": "src/tsv_utils/tsv-pretty.d", 15 | "dependencies": { 16 | "tsv-utils:common": { "path": ".." } 17 | } 18 | }, 19 | { 20 | "name": "unittest", 21 | "targetType": "none" 22 | } 23 | ], 24 | "buildTypes": { 25 | "debug": { "buildOptions": ["debugMode", "optimize"] }, 26 | "release": { "buildOptions": ["releaseMode", "optimize", "inline"], 27 | "dflags": ["-boundscheck=off"], 28 | "dflags-osx-ldc": ["-flto=thin"] } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /tsv-pretty/makefile: -------------------------------------------------------------------------------- 1 | include ../makedefs.mk 2 | include ../makeapp.mk 3 | -------------------------------------------------------------------------------- /tsv-pretty/tests/emptyfile.tsv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eBay/tsv-utils/38ed0a1c31742bd8b59196517e89ff0b51e8fb80/tsv-pretty/tests/emptyfile.tsv -------------------------------------------------------------------------------- /tsv-pretty/tests/gold/basic_tests_5.txt: -------------------------------------------------------------------------------- 1 | Alternate delimiters 2 | -------------------- 3 | 4 | ===[cat input_comma_delim.tsv]=== 5 | F1,Field Two,3,Four 6 
| A to Z,def,3,4 7 | A-to-Z,DEF,3.,4. 8 | A到Z," PQR",3.0,4.00 9 | A〜Z,$@#,3.00,4.0 10 | 11 | ====[tsv-pretty input_comma_delim.tsv]==== 12 | F1,Field Two,3,Four 13 | A to Z,def,3,4 14 | A-to-Z,DEF,3.,4. 15 | A到Z," PQR",3.0,4.00 16 | A〜Z,$@#,3.00,4.0 17 | 18 | ====[tsv-pretty --delimiter , input_comma_delim.tsv]==== 19 | F1 Field Two 3 Four 20 | A to Z def 3 4 21 | A-to-Z DEF 3. 4. 22 | A到Z " PQR" 3.0 4.00 23 | A〜Z $@# 3.00 4.0 24 | 25 | ====[tsv-pretty --header -d , input_comma_delim.tsv]==== 26 | F1 Field Two 3 Four 27 | A to Z def 3 4 28 | A-to-Z DEF 3. 4. 29 | A到Z " PQR" 3.0 4.00 30 | A〜Z $@# 3.00 4.0 31 | 32 | ====[tsv-pretty --no-header -d , input_comma_delim.tsv]==== 33 | F1 Field Two 3 Four 34 | A to Z def 3 4 35 | A-to-Z DEF 3. 4. 36 | A到Z " PQR" 3.0 4.00 37 | A〜Z $@# 3.00 4.0 38 | 39 | Help and version options 40 | ------------------------ 41 | 42 | ====[tsv-pretty --help | grep -c Synopsis]==== 43 | 1 44 | ====[tsv-pretty -h | grep -c Synopsis]==== 45 | 1 46 | ====[tsv-pretty --help-verbose | grep -c Synopsis]==== 47 | 1 48 | ====[tsv-pretty --help-verbose | grep -c Limitations]==== 49 | 1 50 | ====[tsv-pretty --version | grep -c 'tsv-pretty (eBay/tsv-utils)']==== 51 | 1 52 | ====[tsv-pretty -V | grep -c 'tsv-pretty (eBay/tsv-utils)']==== 53 | 1 54 | 55 | Standard input 56 | -------------- 57 | 58 | ====[cat input_5x5.tsv | tsv-pretty]==== 59 | Text-1 Num-1 Mix-1 Mix-2 Mix-3 60 | ab.cd 55 5x5-f3 27 abcd 61 | cde.fgi 67 r2-f3 890 hg-90 62 | pqr 88.5 r3-f3 p-q-r x-y-z 63 | rst 98.5 r4-f3 r-s-t-u 100 64 | 65 | ====[cat input_5x5.tsv | tsv-pretty -u -- -]==== 66 | Text-1 Num-1 Mix-1 Mix-2 Mix-3 67 | ------ ----- ----- ----- ----- 68 | ab.cd 55 5x5-f3 27 abcd 69 | cde.fgi 67 r2-f3 890 hg-90 70 | pqr 88.5 r3-f3 p-q-r x-y-z 71 | rst 98.5 r4-f3 r-s-t-u 100 72 | 73 | ====[cat input_5x5.tsv | tsv-pretty -u -- input_5x2.tsv -]==== 74 | Text-1 Num-1 Mix-1 Mix-2 Mix-3 75 | ------ ----- ----- ----- ----- 76 | ABC 22 14 . 
ab 77 | ab.cd 55 5x5-f3 27 abcd 78 | cde.fgi 67 r2-f3 890 hg-90 79 | pqr 88.5 r3-f3 p-q-r x-y-z 80 | rst 98.5 r4-f3 r-s-t-u 100 81 | -------------------------------------------------------------------------------- /tsv-pretty/tests/gold/error_tests_1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eBay/tsv-utils/38ed0a1c31742bd8b59196517e89ff0b51e8fb80/tsv-pretty/tests/gold/error_tests_1.txt -------------------------------------------------------------------------------- /tsv-pretty/tests/input_5x1.tsv: -------------------------------------------------------------------------------- 1 | Text-1 Num-1 Mix-1 Mix-2 Mix-3 2 | -------------------------------------------------------------------------------- /tsv-pretty/tests/input_5x1_alltext.tsv: -------------------------------------------------------------------------------- 1 | abc d ef g hijk 2 | -------------------------------------------------------------------------------- /tsv-pretty/tests/input_5x1_noheader.tsv: -------------------------------------------------------------------------------- 1 | ABC 22 14 . ab 2 | -------------------------------------------------------------------------------- /tsv-pretty/tests/input_5x1_noheader_preamble1.tsv: -------------------------------------------------------------------------------- 1 | ### A one line preamble ### 2 | ABC 22 14 . ab 3 | -------------------------------------------------------------------------------- /tsv-pretty/tests/input_5x1_noheader_preamble2.tsv: -------------------------------------------------------------------------------- 1 | ### First line of a two-line preamble ### 2 | ### Second line of a two-line preamble ### 3 | ABC 22 14 . 
ab 4 | -------------------------------------------------------------------------------- /tsv-pretty/tests/input_5x1_preamble1.tsv: -------------------------------------------------------------------------------- 1 | ### A one line preamble ### 2 | Text-1 Num-1 Mix-1 Mix-2 Mix-3 3 | -------------------------------------------------------------------------------- /tsv-pretty/tests/input_5x1_preamble2.tsv: -------------------------------------------------------------------------------- 1 | ### First line of a two-line preamble ### 2 | ### Second line of a two-line preamble ### 3 | Text-1 Num-1 Mix-1 Mix-2 Mix-3 4 | -------------------------------------------------------------------------------- /tsv-pretty/tests/input_5x2.tsv: -------------------------------------------------------------------------------- 1 | Text-1 Num-1 Mix-1 Mix-2 Mix-3 2 | ABC 22 14 . ab 3 | -------------------------------------------------------------------------------- /tsv-pretty/tests/input_5x2_noheader.tsv: -------------------------------------------------------------------------------- 1 | DEF 2233 0.0 e nan 2 | GHIJKLM 223344 1.4e07 17 ghi 3 | -------------------------------------------------------------------------------- /tsv-pretty/tests/input_5x2_noheader_preamble1.tsv: -------------------------------------------------------------------------------- 1 | ### A one line preamble ### 2 | DEF 2233 0.0 e nan 3 | GHIJKLM 223344 1.4e07 17 ghi 4 | -------------------------------------------------------------------------------- /tsv-pretty/tests/input_5x2_noheader_preamble2.tsv: -------------------------------------------------------------------------------- 1 | ### First line of a two-line preamble ### 2 | ### Second line of a two-line preamble ### 3 | DEF 2233 0.0 e nan 4 | GHIJKLM 223344 1.4e07 17 ghi 5 | -------------------------------------------------------------------------------- /tsv-pretty/tests/input_5x2_preamble1.tsv: 
-------------------------------------------------------------------------------- 1 | ### A one line preamble ### 2 | Text-1 Num-1 Mix-1 Mix-2 Mix-3 3 | ABC 22 14 . ab 4 | -------------------------------------------------------------------------------- /tsv-pretty/tests/input_5x2_preamble2.tsv: -------------------------------------------------------------------------------- 1 | ### First line of a two-line preamble ### 2 | ### Second line of a two-line preamble ### 3 | Text-1 Num-1 Mix-1 Mix-2 Mix-3 4 | ABC 22 14 . ab 5 | -------------------------------------------------------------------------------- /tsv-pretty/tests/input_5x3.tsv: -------------------------------------------------------------------------------- 1 | Text-1 Num-1 Mix-1 Mix-2 Mix-3 2 | DEF 2233 0.0 e nan 3 | GHIJKLM 223344 1.4e07 17 ghi 4 | -------------------------------------------------------------------------------- /tsv-pretty/tests/input_5x3_preamble1.tsv: -------------------------------------------------------------------------------- 1 | ### A one line preamble ### 2 | Text-1 Num-1 Mix-1 Mix-2 Mix-3 3 | DEF 2233 0.0 e nan 4 | GHIJKLM 223344 1.4e07 17 ghi 5 | -------------------------------------------------------------------------------- /tsv-pretty/tests/input_5x3_preamble2.tsv: -------------------------------------------------------------------------------- 1 | ### First line of a two-line preamble ### 2 | ### Second line of a two-line preamble ### 3 | Text-1 Num-1 Mix-1 Mix-2 Mix-3 4 | DEF 2233 0.0 e nan 5 | GHIJKLM 223344 1.4e07 17 ghi 6 | -------------------------------------------------------------------------------- /tsv-pretty/tests/input_5x4_noheader.tsv: -------------------------------------------------------------------------------- 1 | ab.cd 55 5x5-f3 27 abcd 2 | cde.fgi 67 r2-f3 890 hg-90 3 | pqr 88.5 r3-f3 p-q-r x-y-z 4 | rst 98.5 r4-f3 r-s-t-u 100 5 | -------------------------------------------------------------------------------- /tsv-pretty/tests/input_5x5.tsv: 
-------------------------------------------------------------------------------- 1 | Text-1 Num-1 Mix-1 Mix-2 Mix-3 2 | ab.cd 55 5x5-f3 27 abcd 3 | cde.fgi 67 r2-f3 890 hg-90 4 | pqr 88.5 r3-f3 p-q-r x-y-z 5 | rst 98.5 r4-f3 r-s-t-u 100 6 | -------------------------------------------------------------------------------- /tsv-pretty/tests/input_comma_delim.tsv: -------------------------------------------------------------------------------- 1 | F1,Field Two,3,Four 2 | A to Z,def,3,4 3 | A-to-Z,DEF,3.,4. 4 | A到Z," PQR",3.0,4.00 5 | A〜Z,$@#,3.00,4.0 6 | -------------------------------------------------------------------------------- /tsv-pretty/tests/input_mixed_1.tsv: -------------------------------------------------------------------------------- 1 | I1 T1 F1 T2 F2 Mixed 2 | 3 A .1 HelloWorld -1.28098e-05 15 3 | 89 B 0.008 23rd 18.23 23.0 4 | 222 foo 135.6 thirty 3311.235 19.1 5 | 4 foobar -23.72 ten 0.0821 ABC 6 | -5900 foobarbaz 8.03556e-09 --text-- -31.002 1001 7 | 6789e+23 abcdefghijklmnopqrstuvwxyz -6.758191e-14 x 0.518 ABCDEFGHI 8 | 17 cat 5.31 ABCDE 22.1 101 9 | -------------------------------------------------------------------------------- /tsv-pretty/tests/input_mixed_2.tsv: -------------------------------------------------------------------------------- 1 | Date Text Value Score 2 | 07-17-2003 My hands were black with dirt 1029 17.58 3 | 11-12-1991 two of our young Irish sculptors 14 107.3 4 | 06-21-1983 Ahora, mi amigo Cincunegui se ha empeñado 972 73.83 5 | 03-07-2011 C'était un vieux marin, basané, goudronné 114 0.92 6 | 07-02-1823 Omtrent de werkelijke uitgestrektheid van het gebied 11 3.2 7 | 08-24-1765 Hasta tal punto era desconocida de casi todos 32 123.0 8 | 10-17-2001 Die teils aus groben Wollenstoffen 172 63.38 9 | 04-13-1986 Die Männer traten jetzt näher herzu 46 78.32 10 | -------------------------------------------------------------------------------- /tsv-pretty/tests/input_numbers_1.tsv: 
-------------------------------------------------------------------------------- 1 | One Two Three Four Five 2 | 1.0 1 10000 0 -1 3 | 1.034 2 87 1 -.5 4 | 0.200 3 0 .9 -9999 5 | 0.030 4 9999 .09 -9999. 6 | 99.83 5 10001 -1 -9999.0 7 | 101.2 6 100001 -0.03 -9999.3 8 | -------------------------------------------------------------------------------- /tsv-pretty/tests/input_numbers_2.tsv: -------------------------------------------------------------------------------- 1 | F1 F2 Field3 Field-4 [5] 2 | -34 9877623145 -3.067743 -78201.4200908 0 3 | 9 13 32 -678.003 45339.220776301 0.0 4 | 105 3 47.885 -423.72 1003.004792 5 | 9007 427 7003.2288643 NaN nan 6 | 22 7317 91567.821252734 -5238937176.3217 0.004316 1234.5 7 | 11136 62738536115 -14.3216 7.54845e+14 9357.003 0.55 8 | -------------------------------------------------------------------------------- /tsv-pretty/tests/input_numbers_3.tsv: -------------------------------------------------------------------------------- 1 | One 2 | 1.0 3 | 1.034 4 | 0.200 5 | 0.030 6 | 99.83 7 | 101.2 8 | -------------------------------------------------------------------------------- /tsv-pretty/tests/input_numbers_4.tsv: -------------------------------------------------------------------------------- 1 | F 2 | 0 3 | 10 4 | 100 5 | 1000 6 | 10000 7 | -------------------------------------------------------------------------------- /tsv-pretty/tests/input_numbers_noheader_1.tsv: -------------------------------------------------------------------------------- 1 | 1.0 1 10000 0 -1 2 | 1.034 2 87 1 -.5 3 | 0.200 3 0 .9 -9999 4 | 0.030 4 9999 .09 -9999. 
5 | 99.83 5 10001 -1 -9999.0 6 | 101.2 6 100001 -0.03 -9999.3 7 | -------------------------------------------------------------------------------- /tsv-pretty/tests/input_numbers_noheader_2.tsv: -------------------------------------------------------------------------------- 1 | -34 9877623145 -3.067743 -78201.4200908 0 2 | 9 13 32 -678.003 45339.220776301 0.0 3 | 105 3 47.885 -423.72 1003.004792 4 | 9007 427 7003.2288643 NaN nan 5 | 22 7317 91567.821252734 -5238937176.3217 0.004316 1234.5 6 | 11136 62738536115 -14.3216 7.54845e+14 9357.003 0.55 7 | -------------------------------------------------------------------------------- /tsv-pretty/tests/input_numbers_noheader_3.tsv: -------------------------------------------------------------------------------- 1 | 1.0 2 | 1.034 3 | 0.200 4 | 0.030 5 | 99.83 6 | 101.2 7 | -------------------------------------------------------------------------------- /tsv-pretty/tests/input_numbers_noheader_4.tsv: -------------------------------------------------------------------------------- 1 | 0 2 | 10 3 | 100 4 | 1000 5 | 10000 6 | -------------------------------------------------------------------------------- /tsv-pretty/tests/input_sample_preamble.tsv: -------------------------------------------------------------------------------- 1 | # This file contains 4 fields: Color, Count, Height (Ht), and Weight (Wt). 2 | # Color is an alphabetic, the others are numeric. 
3 | 4 | Color Count Ht Wt 5 | Brown 106 202.2 1.5 6 | Canary Yellow 7 106 0.761 7 | Chartreuse 1139 77.02 6.22 8 | Fluorescent Orange 422 1141.7 7.921 9 | Grey 19 140.3 1.03 10 | -------------------------------------------------------------------------------- /tsv-pretty/tests/input_text_1.tsv: -------------------------------------------------------------------------------- 1 | f1 field 2 field-3-longer-header x y 2 | a b c dd e 3 | bb cc d 4 | ccc ab cd cc cc cc cc cc e 5 | cd ab c d 6 | ccc ccc ccc ccc ddd e 7 | 8 | the last row d e 9 | -------------------------------------------------------------------------------- /tsv-pretty/tests/input_unicode.tsv: -------------------------------------------------------------------------------- 1 | Language Text 1 Text 2 Text 3 Text 4 Text 5 Text 6 Text 7 Number 2 | English snow storm soccer player town hall sleep quietly black leather jacket bongo drums 10 years ago 1.234 3 | Chinese (Simplified) 雪风暴 足球运动员 市政厅 安静地睡觉 黑色皮夹克 bongo鼓 10年前 2.345 4 | Chinese (Traditional) 雪風暴 足球運動員 市政廳 安靜地睡覺 黑色皮夾克 bongo鼓 10年前 3.456 5 | French Tempête de neige joueur de foot mairie Dormez tranquillement Veste en cuir noir Bongo drums Il ya 10 ans 4.567 6 | Georgian თოვლის ქარიშხალი ფეხბურთის მოთამაშე მუნიციპალიტეტი მშვიდი მშვიდად შავი ტყავის ქურთუკი ბოგოოს დასარტყამი 10 წლის წინ 5.678 7 | German Schneesturm Fußballspieler Rathaus Schlafe ruhig Schwarze Lederjacke Bongo-Trommeln vor 10 Jahren 6.789 8 | Greek Χιονοθύελλα ποδοσφαιριστής Δημαρχείο Κοιμάται ήσυχα Μαύρο δερμάτινο μπουφάν Bongo τύμπανα 10 ΧΡΟΝΙΑ πριν 7.891 9 | Hebrew סופת שלגים שחקן כדורגל עירייה לישון בשקט מעיל עור שחור תופים של בונגו לפני 10 שנים 8.901 10 | Japanese 吹雪 サッカー選手 町役場 静かに眠る ブラックレザージャケット ボンゴドラム 10年前 9.012 11 | Russian Снежная буря футболист ратуша Спать спокойно Черный кожаный пиджак Бонго-барабаны 10 лет назад 0.123 12 | Spanish Tormenta de nieve jugador de fútbol Ayuntamiento Dormir tranquilamente chaqueta de cuero negro Tambores bongos Hace 10 años 1.567 13 | Vietnamese Bão 
tuyết cầuthủ bóng đá Thị trấn Ngủ yên Áo khoác da màu đen Trống bongo 10 năm trước 2.678 14 | -------------------------------------------------------------------------------- /tsv-pretty/tests/invalid_unicode.tsv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eBay/tsv-utils/38ed0a1c31742bd8b59196517e89ff0b51e8fb80/tsv-pretty/tests/invalid_unicode.tsv -------------------------------------------------------------------------------- /tsv-sample/README.md: -------------------------------------------------------------------------------- 1 | _Visit the eBay TSV utilities [main page](../README.md)_ 2 | 3 | # tsv-sample 4 | 5 | `tsv-sample` randomizes line order (shuffling) or selects random subsets of lines (sampling) from input data. Several methods are available, including shuffling, simple random sampling, weighted random sampling, Bernoulli sampling, and distinct sampling. Data can be read from files or standard input. These sampling methods are made available through several modes of operation: 6 | 7 | * Shuffling - The default mode of operation. All lines are read in and written out in random order. All orderings are equally likely. 8 | * Simple random sampling (`--n|num N`) - A random sample of `N` lines are selected and written out in random order. The `--i|inorder` option preserves the original input order. 9 | * Weighted random sampling (`--n|num N`, `--w|weight-field F`) - A weighted random sample of N lines are selected using weights from a field on each line. Output is in weighted selection order unless the `--i|inorder` option is used. Omitting `--n|num` outputs all lines in weighted selection order (weighted shuffling). 10 | * Sampling with replacement (`--r|replace`, `--n|num N`) - All lines are read in, then lines are randomly selected one at a time and written out. Lines can be selected multiple times. Output continues until `N` samples have been output. 
11 | * Bernoulli sampling (`--p|prob P`) - A streaming form of sampling. Lines are read one at a time and selected for output using probability `P`. For example, `-p 0.1` specifies that 10% of lines should be included in the sample. 12 | * Distinct sampling (`--k|key-fields F`, `--p|prob P`) - Another streaming form of sampling. However, instead of each line being subject to an independent selection choice, lines are selected based on a key contained in each line. A portion of keys are randomly selected for output, with probability P. Every line containing a selected key is included in the output. Consider a query log with records consisting of `<user, query, clicked-url>` triples. It may be desirable to sample records for one percent of the users, but include all records for the selected users. 13 | 14 | `tsv-sample` is designed for large data sets. Streaming algorithms make immediate decisions on each line. They do not accumulate memory and can run on infinite length input streams. Shuffling algorithms need to hold the full output set in memory and are therefore limited by available memory. Simple and weighted random sampling use reservoir sampling and only need to hold the specified sample size (`--n|num`) in memory. By default, a new random order is generated every run, but options are available for using the same randomization order over multiple runs. The random values assigned to each line can be printed, either to observe the behavior or to run custom selection algorithms on the results. 15 | 16 | See the [tsv-sample reference](../docs/tool_reference/tsv-sample.md) for further details. 17 | -------------------------------------------------------------------------------- /tsv-sample/dub.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tsv-sample", 3 | "description": "Randomize or sample lines from input data.
Several sampling methods are available, including simple random sampling, weighted random sampling, Bernoulli sampling, and distinct sampling.", 4 | "homepage": "https://github.com/eBay/tsv-utils", 5 | "authors": ["Jon Degenhardt"], 6 | "copyright": "Copyright (c) 2017-2021, eBay Inc.", 7 | "license": "BSL-1.0", 8 | "targetType": "executable", 9 | "configurations": [ 10 | { 11 | "name" : "executable", 12 | "targetName": "tsv-sample", 13 | "targetPath": "../bin/", 14 | "mainSourceFile": "src/tsv_utils/tsv-sample.d", 15 | "dependencies": { 16 | "tsv-utils:common": { "path": ".." } 17 | } 18 | }, 19 | { 20 | "name": "unittest", 21 | "targetType": "none" 22 | } 23 | ], 24 | "buildTypes": { 25 | "debug": { "buildOptions": ["debugMode", "optimize"] }, 26 | "release": { "buildOptions": ["releaseMode", "optimize", "inline"], 27 | "dflags": ["-boundscheck=off"], 28 | "dflags-osx-ldc": ["-flto=thin"] } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /tsv-sample/makefile: -------------------------------------------------------------------------------- 1 | APP_USES_LDC_PGO=2 2 | include ../makedefs.mk 3 | 4 | # tsv-sample has issues with uncaught exceptions with LDC versions 1.7 and earlier 5 | # when making debug builds. The -disable-fp-elim option avoids this. 
6 | 7 | ifeq ($(shell uname -s),Darwin) 8 | ifeq ($(compiler_type),ldc) 9 | ifeq ($(ldc_version),1.5.0) 10 | override debug_flags += -disable-fp-elim 11 | else ifeq ($(ldc_version),1.6.0) 12 | override debug_flags += -disable-fp-elim 13 | else ifeq ($(ldc_version),1.7.0) 14 | override debug_flags += -disable-fp-elim 15 | endif 16 | endif 17 | endif 18 | 19 | include ../makeapp.mk 20 | -------------------------------------------------------------------------------- /tsv-sample/tests/gold/error_tests_2.2081.txt: -------------------------------------------------------------------------------- 1 | Error test set 2 2 | ---------------- 3 | 4 | ====[tsv-sample -H -w 2 input3x25.tsv]==== 5 | Error [tsv-sample]: Could not process line: no digits seen 6 | File: input3x25.tsv Line: 2 7 | line title weight 8 | 9 | ====[tsv-sample -w 3 input3x25.tsv]==== 10 | Error [tsv-sample]: Could not process line: no digits seen 11 | File: input3x25.tsv Line: 1 12 | Is this a header line? Use --H|header to skip. 13 | 14 | ====[tsv-sample -p -v 10 -k 1 input4x50.tsv input4x15.tsv]==== 15 | [tsv-sample] Error processing command line arguments: no digits seen 16 | -------------------------------------------------------------------------------- /tsv-sample/tests/gold/error_tests_2.txt: -------------------------------------------------------------------------------- 1 | Error test set 2 2 | ---------------- 3 | 4 | ====[tsv-sample -H -w 2 input3x25.tsv]==== 5 | line title weight 6 | Error [tsv-sample]: Could not process line: no digits seen for input "Белые ночи". 7 | File: input3x25.tsv Line: 2 8 | 9 | ====[tsv-sample -w 3 input3x25.tsv]==== 10 | Error [tsv-sample]: Could not process line: no digits seen for input "weight". 11 | File: input3x25.tsv Line: 1 12 | Is this a header line? Use --H|header to skip. 13 | 14 | ====[tsv-sample -p -v 10 -k 1 input4x50.tsv input4x15.tsv]==== 15 | [tsv-sample] Error processing command line arguments: no digits seen for input "-v". 
16 | -------------------------------------------------------------------------------- /tsv-sample/tests/input2x10_noheader.tsv: -------------------------------------------------------------------------------- 1 | 0.90391933 Folk-Lore of West and Mid-Wales 2 | 0.48032209 A Book of Christian Sonnets 3 | 0.30956877 The Courting Of Lady Jane 4 | 0.67881307 Burning Daylight 5 | 0.13769566 Eva's Adventures in Shadow-Land 6 | 0.03235488 海上花列傳 7 | 0.37116598 三國志演義 8 | 0.20987466 殘唐五代史演義傳 9 | 0.78401827 青箱雜記 10 | 0.20948332 歡喜冤家 11 | -------------------------------------------------------------------------------- /tsv-sample/tests/input2x1_noheader.tsv: -------------------------------------------------------------------------------- 1 | 0.157876295 Jacques le fataliste et son maître 2 | -------------------------------------------------------------------------------- /tsv-sample/tests/input2x5_noheader.dos_tsv: -------------------------------------------------------------------------------- 1 | 0.157876295 Jacques le fataliste et son maître 2 | 0.008718457 L'influence d'un livre 3 | 0.788170018 Les stratagèmes 4 | 0.656434250 Bulalakaw ng Pag-asa 5 | 0.767664821 Buntong Hininga 6 | -------------------------------------------------------------------------------- /tsv-sample/tests/input2x5_noheader.tsv: -------------------------------------------------------------------------------- 1 | 0.157876295 Jacques le fataliste et son maître 2 | 0.008718457 L'influence d'un livre 3 | 0.788170018 Les stratagèmes 4 | 0.656434250 Bulalakaw ng Pag-asa 5 | 0.767664821 Buntong Hininga 6 | -------------------------------------------------------------------------------- /tsv-sample/tests/input2x7_atsign.tsv: -------------------------------------------------------------------------------- 1 | title@weight 2 | 友情@5992 3 | 法螺男爵旅土産@5803 4 | 續惡魔@5815 5 | 幽霊書店@5903 6 | マルチン・ルターの小信仰問答書@5489 7 | Aristophanis Lysistrata@5464 8 | Ephemerides Barometricae Mutinenses@5869 9 | 
-------------------------------------------------------------------------------- /tsv-sample/tests/input3x0.tsv: -------------------------------------------------------------------------------- 1 | line title weight 2 | -------------------------------------------------------------------------------- /tsv-sample/tests/input3x10.tsv: -------------------------------------------------------------------------------- 1 | line title weight 2 | 1 Álomvilág: Elbeszélések 41 3 | 2 Grimm testvérek összegyüjtött meséi 9 4 | 3 Mesék és regék 20 5 | 4 Soitannollisia satuja ja jutelmia 44 6 | 5 Pinocchion seikkailut 19 7 | 6 Piepkuikentje 17 8 | 7 Oude Egyptische Legenden 28 9 | 8 Door het land der Skipetaren 30 10 | 9 Las Fábulas de Esopo 45 11 | 10 Platero y yo 2 12 | -------------------------------------------------------------------------------- /tsv-sample/tests/input3x25.dos_tsv: -------------------------------------------------------------------------------- 1 | line title weight 2 | 1 Белые ночи 98 3 | 2 Детство 4 4 | 3 Записки из подполья 78 5 | 4 Fru Inger til Østråt 26 6 | 5 Märchen für Kinder 73 7 | 6 Große und kleine Welt 91 8 | 7 Arabische Nächte 66 9 | 8 Διδαχή των Δώδεκα αποστόλων 2 10 | 9 Nanà a Milano 41 11 | 10 L'olmo e l'edera 78 12 | 11 Il "Damo viennese" 23 13 | 12 הצופה לבית ישראל 91 14 | 13 בית נכות ההלכות 47 15 | 14 Pasáček Ali: Pověst z východu 64 16 | 15 Zápisky z mrtvého domu 52 17 | 16 Leabhráin an Irisleabhair—III 53 18 | 17 Diné yázhí ba'áłchíní 88 19 | 18 Right Half Hollins 67 20 | 19 Annie Laurie and Azalea 85 21 | 20 羅生門 87 22 | 21 入れかわった男 33 23 | 22 豆棚閒話 73 24 | 23 佛說四十二章經 81 25 | 24 La Navidad en las Montañas 100 26 | 25 Don Quijote 88 27 | -------------------------------------------------------------------------------- /tsv-sample/tests/input3x25.tsv: -------------------------------------------------------------------------------- 1 | line title weight 2 | 1 Белые ночи 98 3 | 2 Детство 4 4 | 3 Записки из подполья 78 5 | 4 Fru Inger til Østråt 
26 6 | 5 Märchen für Kinder 73 7 | 6 Große und kleine Welt 91 8 | 7 Arabische Nächte 66 9 | 8 Διδαχή των Δώδεκα αποστόλων 2 10 | 9 Nanà a Milano 41 11 | 10 L'olmo e l'edera 78 12 | 11 Il "Damo viennese" 23 13 | 12 הצופה לבית ישראל 91 14 | 13 בית נכות ההלכות 47 15 | 14 Pasáček Ali: Pověst z východu 64 16 | 15 Zápisky z mrtvého domu 52 17 | 16 Leabhráin an Irisleabhair—III 53 18 | 17 Diné yázhí ba'áłchíní 88 19 | 18 Right Half Hollins 67 20 | 19 Annie Laurie and Azalea 85 21 | 20 羅生門 87 22 | 21 入れかわった男 33 23 | 22 豆棚閒話 73 24 | 23 佛說四十二章經 81 25 | 24 La Navidad en las Montañas 100 26 | 25 Don Quijote 88 27 | -------------------------------------------------------------------------------- /tsv-sample/tests/input3x25_negative_wt.tsv: -------------------------------------------------------------------------------- 1 | line title weight 2 | 1 Белые ночи 5 3 | 2 Детство 4 4 | 3 Записки из подполья 3 5 | 4 Fru Inger til Østråt 1 6 | 5 Märchen für Kinder 0.77 7 | 6 Große und kleine Welt 2 8 | 7 Arabische Nächte -1 9 | 8 Διδαχή των Δώδεκα αποστόλων 2 10 | 9 Nanà a Milano 1.5 11 | 10 L'olmo e l'edera 2.5 12 | 11 Il "Damo viennese" 6.5 13 | 12 הצופה לבית ישראל 2.3 14 | 13 בית נכות ההלכות 4.7 15 | 14 Pasáček Ali: Pověst z východu 6.4 16 | 15 Zápisky z mrtvého domu .52 17 | 16 Leabhráin an Irisleabhair—III 5.3 18 | 17 Diné yázhí ba'áłchíní 0.88 19 | 18 Right Half Hollins 6.7 20 | 19 Annie Laurie and Azalea 0.5 21 | 20 羅生門 0.01 22 | 21 入れかわった男 3.3 23 | 22 豆棚閒話 0.23 24 | 23 佛說四十二章經 0.55 25 | 24 La Navidad en las Montañas 1.0 26 | 25 Don Quijote 3.4 27 | -------------------------------------------------------------------------------- /tsv-sample/tests/input3x3.tsv: -------------------------------------------------------------------------------- 1 | line title weight 2 | 1 Thérèse Desqueyroux 43 3 | 2 Bonjour Tristesse 3 4 | 3 Le Silence de la mer 72 5 | -------------------------------------------------------------------------------- /tsv-sample/tests/input3x4.tsv: 
-------------------------------------------------------------------------------- 1 | line title weight 2 | 1 Aurélien 36 3 | 2 A Room of One's Own 62 4 | 3 The Catcher in the Rye 19 5 | 4 Ficciones 23 6 | -------------------------------------------------------------------------------- /tsv-sample/tests/input4x15.tsv: -------------------------------------------------------------------------------- 1 | c-1 c-2 c-3 c-4 2 | pechschwarz 2079 Araqua-pintado 1 3 | Grünspan 2105 Macuco 0 4 | ébène 2142 Орёл 2 5 | blutrot 2093 Araqua-pintado 2 6 | fumée 2129 Löffelente 0 7 | jaune 2090 Weißwangengans 4 8 | türkis 2097 Голубь 2 9 | Grünspan 2100 Purpurreiher 1 10 | rouge 2058 Pipit de Godlewski 4 11 | marrón 2121 Marreca-cabocla 2 12 | blutrot 2137 Marreca-cabocla 1 13 | jaune 2058 Purpurreiher 2 14 | jaune 2104 Tüpfelsumpfhuhn 1 15 | dorado 2147 Tüpfelsumpfhuhn 1 16 | jaune 2097 Purpurreiher 4 17 | -------------------------------------------------------------------------------- /tsv-sample/tests/input4x50.tsv: -------------------------------------------------------------------------------- 1 | c-1 c-2 c-3 c-4 2 | púrpura 2088 Macuco 1 3 | blutrot 2142 Tüpfelsumpfhuhn 1 4 | Indigo 2056 Голубь 1 5 | Indigo 2141 Голубь 1 6 | blutrot 2118 Tüpfelsumpfhuhn 4 7 | Cerise 2076 Malvasía Cabeciblanca 4 8 | schneeweiß 2117 Porrón Islándico 4 9 | café 2088 Purpurreiher 2 10 | Orange-red 2089 Purpurreiher 1 11 | 暗紅色/暗赤色 2015 Malvasía Cabeciblanca 2 12 | rouge 2132 Löffelente 4 13 | zitronengelb 2136 Macreuse à bec jaune 3 14 | Grünspan 2145 Pipit de Godlewski 4 15 | zitronengelb 2083 Macreuse à bec jaune 4 16 | púrpura 2070 Macreuse à bec jaune 2 17 | blutrot 2121 Воробей 4 18 | marrón 2102 Weißwangengans 1 19 | púrpura 2092 Macreuse à bec jaune 4 20 | rouge 2146 Löffelente 1 21 | blutrot 2117 Tüpfelsumpfhuhn 0 22 | púrpura 2093 Macreuse à bec jaune 4 23 | schneeweiß 2121 Porrón Islándico 1 24 | dorado 2045 Лебедь 2 25 | café 2062 Purpurreiher 4 26 | Cerise 2119 Malvasía Cabeciblanca 4 
27 | noir 2082 Weißwangengans 1 28 | noir 2094 Weißwangengans 4 29 | blanc 2137 Голубь 2 30 | café 2100 Purpurreiher 2 31 | rouge 2135 Marreca-cabocla 1 32 | красный 2049 Weißwangengans 3 33 | noir 2115 Weißwangengans 2 34 | blanc 2109 Голубь 3 35 | blutrot 2143 Tüpfelsumpfhuhn 4 36 | púrpura 2119 Macuco 4 37 | púrpura 2145 Macreuse à bec jaune 2 38 | Grünspan 2082 Pipit de Godlewski 2 39 | blutrot 2149 Tüpfelsumpfhuhn 1 40 | blutrot 2120 Tüpfelsumpfhuhn 1 41 | café 2041 Purpurreiher 4 42 | Indigo 2138 Голубь 4 43 | café 2019 Marreca-cabocla 1 44 | Grünspan 2053 Pipit de Godlewski 4 45 | blanc 2038 Голубь 0 46 | Grünspan 2071 Pipit de Godlewski 3 47 | rouge 2037 Marreca-cabocla 0 48 | fumée 2113 Araqua-pintado 0 49 | marrón 2034 Weißwangengans 1 50 | fumée 2124 Araqua-pintado 0 51 | -------------------------------------------------------------------------------- /tsv-sample/tests/test-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "output_files" : [ 3 | { 4 | "name" : "basic_tests_1.txt" 5 | }, 6 | { 7 | "name" : "error_tests_1.txt" 8 | }, 9 | { 10 | "name" : "error_tests_2.txt", 11 | "versions" : [ 12 | "error_tests_2.2081.txt" 13 | ] 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /tsv-select/README.md: -------------------------------------------------------------------------------- 1 | _Visit the eBay TSV utilities [main page](../README.md)_ 2 | 3 | # tsv-select 4 | 5 | A version of the Unix `cut` utility with the ability to select fields by name, drop fields, and reorder fields. 
The following command writes the `date` and `time` fields from a pair of files to standard output: 6 | ``` 7 | $ tsv-select -H -f date,time file1.tsv file2.tsv 8 | ``` 9 | Fields can also be selected by field number: 10 | ``` 11 | $ tsv-select -f 4,2,9-11 file1.tsv file2.tsv 12 | ``` 13 | 14 | Fields can be listed more than once, and fields not specified can be selected as a group using `--r|rest`. Fields can be dropped using `--e|exclude`. 15 | 16 | The `--H|header` option turns on header processing. This enables specifying fields by name. Only the header from the first file is retained when multiple input files are provided. 17 | 18 | Examples: 19 | ``` 20 | $ # Output fields 2 and 1, in that order. 21 | $ tsv-select -f 2,1 data.tsv 22 | 23 | $ # Output the 'Name' and 'RecordNum' fields. 24 | $ tsv-select -H -f Name,RecordNum data.tsv 25 | 26 | $ # Drop the first field, keep everything else. 27 | $ tsv-select --exclude 1 file.tsv 28 | 29 | $ # Drop the 'Color' field, keep everything else. 30 | $ tsv-select -H --exclude Color file.tsv 31 | 32 | $ # Move the 'RecordNum' field to the start of the line. 33 | $ tsv-select -H -f RecordNum --rest last data.tsv 34 | 35 | $ # Move field 1 to the end of the line. 36 | $ tsv-select -f 1 --rest first data.tsv 37 | 38 | $ # Output a range of fields in reverse order. 39 | $ tsv-select -f 30-3 data.tsv 40 | 41 | $ # Drop all the fields ending in '_time' 42 | $ tsv-select -H -e '*_time' data.tsv 43 | 44 | $ # Multiple files with header lines. Keep only one header. 45 | $ tsv-select data*.tsv -H --fields 1,2,4-7,14 46 | ``` 47 | 48 | Named fields, dropping and reordering fields, and header line management are useful enhancements over traditional `cut`. However, much of the motivation for writing `tsv-select` was to explore the D programming language and provide a comparison point against other common approaches to this task.
Code for `tsv-select` is a bit more liberal with comments pointing out D programming constructs than code for the other tools. As an unexpected benefit, `tsv-select` is faster than other implementations of `cut` that are available. 49 | 50 | See the [tsv-select reference](../docs/tool_reference/tsv-select.md) for more details on `tsv-select`. See [Field syntax](../docs/tool_reference/common-options-and-behavior.md#field-syntax) for more information on selecting fields by name. 51 | -------------------------------------------------------------------------------- /tsv-select/dub.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tsv-select", 3 | "description": "Output select columns from TSV files.", 4 | "homepage": "https://github.com/eBay/tsv-utils", 5 | "authors": ["Jon Degenhardt"], 6 | "copyright": "Copyright (c) 2015-2021, eBay Inc.", 7 | "license": "BSL-1.0", 8 | "targetType": "executable", 9 | "configurations": [ 10 | { 11 | "name" : "executable", 12 | "targetName": "tsv-select", 13 | "targetPath": "../bin/", 14 | "mainSourceFile": "src/tsv_utils/tsv-select.d", 15 | "excludedSourceFiles": ["src/tsv-select_no-template-version.d"], 16 | "dependencies": { 17 | "tsv-utils:common": { "path": ".." 
} 18 | } 19 | }, 20 | { 21 | "name": "unittest", 22 | "targetType": "none" 23 | } 24 | ], 25 | "buildTypes": { 26 | "debug": { "buildOptions": ["debugMode", "optimize"] }, 27 | "release": { "buildOptions": ["releaseMode", "optimize", "inline"], 28 | "dflags": ["-boundscheck=off"], 29 | "dflags-osx-ldc": ["-flto=thin"] } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /tsv-select/makefile: -------------------------------------------------------------------------------- 1 | APP_USES_LDC_PGO=2 2 | include ../makedefs.mk 3 | include ../makeapp.mk 4 | 5 | # No built-in unit tests 6 | unittest: ; 7 | unittest-codecov: ; 8 | -------------------------------------------------------------------------------- /tsv-select/profile_data/collect_profile_data.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | 3 | if [ $# -eq 0 ]; then 4 | echo "Insufficient arguments. The path of the instrumented program is required." 
5 | exit 1 6 | fi 7 | 8 | prog=$1 9 | shift 10 | 11 | ldc_profdata_tool_name=ldc-profdata 12 | ldc_profdata_tool=${ldc_profdata_tool_name} 13 | 14 | if [ $# -ne 0 ]; then 15 | ldc_profdata_tool=${1}/bin/${ldc_profdata_tool_name} 16 | fi 17 | 18 | for f in profile.*.raw; do 19 | if [ -e $f ]; then 20 | rm $f 21 | fi 22 | done 23 | 24 | if [ -e app.profdata ]; then 25 | rm -f app.profdata 26 | fi 27 | 28 | $prog profile_data_1.tsv -H -f 1-3,17,13-9 > /dev/null 29 | $prog profile_data_1.tsv -H -f 1 > /dev/null 30 | $prog profile_data_1.tsv -H -f 20 > /dev/null 31 | $prog profile_data_1.tsv -H -f 11 > /dev/null 32 | $prog profile_data_2.tsv -H -f 4 > /dev/null 33 | $prog profile_data_2.tsv -H -f 1,2 > /dev/null 34 | $prog profile_data_2.tsv -H -f 2-4 > /dev/null 35 | $prog profile_data_3.tsv -H -f 8 > /dev/null 36 | $prog profile_data_3.tsv -H -f 5,3,1 > /dev/null 37 | $prog profile_data_3.tsv -H -f 1-3 > /dev/null 38 | $prog profile_data_3.tsv -H -f 7 > /dev/null 39 | $prog profile_data_3.tsv -H -f 3-6 > /dev/null 40 | $prog profile_data_1.tsv -H -f 5 --rest last > /dev/null 41 | $prog profile_data_1.tsv -H -f 1 --rest first > /dev/null 42 | $prog -H --exclude 1 profile_data_1.tsv profile_data_2.tsv profile_data_3.tsv -H > /dev/null 43 | $prog profile_data_3.tsv --exclude 2-4 > /dev/null 44 | 45 | ${ldc_profdata_tool} merge -o app.profdata profile.*.raw 46 | -------------------------------------------------------------------------------- /tsv-select/tests/input1.dos_tsv: -------------------------------------------------------------------------------- 1 | f1 f2 f3 f4 2 | 1 ggg UUU 101 3 | f1-empty CCC 5734 4 | 3 ßßß SSS 7 5 | 4 sss f4-empty 6 | 5 ÀBC 1367 7 | 6 f23-empty 8 | 7 f23-space 9 | 8 0.0 Z 1931 10 | -------------------------------------------------------------------------------- /tsv-select/tests/input1.tsv: -------------------------------------------------------------------------------- 1 | f1 f2 f3 f4 2 | 1 ggg UUU 101 3 | f1-empty CCC 5734 4 | 3 ßßß SSS 7 5 
| 4 sss f4-empty 6 | 5 ÀBC 1367 7 | 6 f23-empty 8 | 7 f23-space 9 | 8 0.0 Z 1931 10 | -------------------------------------------------------------------------------- /tsv-select/tests/input_1field.tsv: -------------------------------------------------------------------------------- 1 | 1 2 | 2 abc def 3 | 3 4 | 4 567 89-10 5 | -------------------------------------------------------------------------------- /tsv-select/tests/input_2fields.tsv: -------------------------------------------------------------------------------- 1 | f1 f2 2 | abc def 3 | 123 456 4 | ABC DEF 5 | -------------------------------------------------------------------------------- /tsv-select/tests/input_2plus_hat_delim.tsv: -------------------------------------------------------------------------------- 1 | f1^f2 2 | abc^def^ghi 3 | ^^ 4 | ^^^ 5 | 123^456^789^ 6 | ^abc^ 7 | -------------------------------------------------------------------------------- /tsv-select/tests/input_3plus_fields.tsv: -------------------------------------------------------------------------------- 1 | 1 ggg UUU 101 2 | 2 ggg CCC 5734 52 3 | 3 ßßß SSS 4 | 4 ssssss sss 18 16 f6 f7 f8 f9 f10 5 | 5 ÀBC 1367 1331 1234 4567 6 | 6 e-e 7 | 7 sp-sp 2020 2021 8 | 8 empty-last 9 | 9 next-line-6-empty | 10 | 11 | 11 12 | 12 A B C D 13 | 13 AA BB CC DD EE 14 | 14 ß Ä ä Ö ö 15 | 15 ßß ÄÄ ää ÖÖ öö 16 | 16 ßsß ÄAÄ äaä ÖOÖ öoö 17 | 17 last-row done 18 | -------------------------------------------------------------------------------- /tsv-select/tests/input_3x0.tsv: -------------------------------------------------------------------------------- 1 | f1 f2 f3 2 | -------------------------------------------------------------------------------- /tsv-select/tests/input_3x1.tsv: -------------------------------------------------------------------------------- 1 | f1 f2 f3 2 | 3x1-r1 201 301 3 | -------------------------------------------------------------------------------- /tsv-select/tests/input_3x2.tsv: 
-------------------------------------------------------------------------------- 1 | f1 f2 f3 2 | 3x2-r1 2001 3001 3 | 3x2-r2 2002 3002 4 | -------------------------------------------------------------------------------- /tsv-select/tests/input_3x3.tsv: -------------------------------------------------------------------------------- 1 | f1 f2 f3 2 | 3x3-r1 21 31 3 | 3x3-r2 22 32 4 | 3x3-r3 23 33 5 | -------------------------------------------------------------------------------- /tsv-select/tests/input_emptyfile.tsv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eBay/tsv-utils/38ed0a1c31742bd8b59196517e89ff0b51e8fb80/tsv-select/tests/input_emptyfile.tsv -------------------------------------------------------------------------------- /tsv-select/tests/input_header1.tsv: -------------------------------------------------------------------------------- 1 | field1 field2 field3 2 | 11567 12567 13567 3 | 21567 22567 23567 4 | -------------------------------------------------------------------------------- /tsv-select/tests/input_header2.tsv: -------------------------------------------------------------------------------- 1 | field1 field2 field3 2 | 11987 12987 13987 3 | -------------------------------------------------------------------------------- /tsv-select/tests/input_header3.tsv: -------------------------------------------------------------------------------- 1 | field1 field2 field3 2 | -------------------------------------------------------------------------------- /tsv-select/tests/input_header4.tsv: -------------------------------------------------------------------------------- 1 | field1 field2 field3 2 | 11888 12888 13888 3 | 21888 22888 23888 4 | -------------------------------------------------------------------------------- /tsv-select/tests/input_header_variants.tsv: -------------------------------------------------------------------------------- 1 | ab:c ab c ab-c ab\c ab/c ab,c ab.c 
ab*c 1 01 56:7 56 7 56-7 56\7 56/7 56,7 56.7 56*7 ab:56 ab 56 ab-56 ab\56 ab/56 ab,56 ab.56 ab*56 濡れ羽色 2 | r1c0 r1c1 r1c2 r1c3 r1c4 r1c5 r1c6 r1c7 r1c8 r1c9 r1c10 r1c11 r1c12 r1c13 r1c14 r1c15 r1c16 r1c17 r1c18 r1c19 r1c20 r1c21 r1c22 r1c23 r1c24 r1c25 r1c26 3 | -------------------------------------------------------------------------------- /tsv-split/README.md: -------------------------------------------------------------------------------- 1 | _Visit the eBay TSV utilities [main page](../README.md)_ 2 | 3 | # tsv-split 4 | 5 | `tsv-split` is used to split one or more input files into multiple output files. There are three modes of operation: 6 | * Fixed number of lines per file (`--l|lines-per-file NUM`): Each input block of NUM lines is written to a new file. This is similar to the Unix `split` utility. 7 | 8 | * Random assignment (`--n|num-files NUM`): Each input line is written to a randomly selected output file. Random selection is from NUM files. 9 | 10 | * Random assignment by key (`--n|num-files NUM, --k|key-fields FIELDS`): Input lines are written to output files using fields as a key. Each unique key is randomly assigned to one of NUM output files. All lines with the same key are written to the same file. 11 | 12 | By default, files are written to the current directory and have names of the form `part_NNN<suffix>`, with `NNN` being a number and `<suffix>` being the extension of the first input file. If the input file is `file.txt`, the names will take the form `part_NNN.txt`. The output directory and file names are customizable. 13 | 14 | Examples: 15 | ``` 16 | $ # Split a file into files of 10,000 lines each. Output files 17 | $ # are written to the 'split_files/' directory. 18 | $ tsv-split data.txt --lines-per-file 10000 --dir split_files 19 | 20 | $ # Split a file into 1000 files with lines randomly assigned. 21 | $ tsv-split data.txt --num-files 1000 --dir split_files 22 | 23 | $ # Randomly assign lines to 1000 files using field 3 as a key.
24 | $ tsv-split data.tsv --num-files 1000 --key-fields 3 --dir split_files 25 | ``` 26 | 27 | See the [tsv-split reference](../docs/tool_reference/tsv-split.md) for more information. 28 | -------------------------------------------------------------------------------- /tsv-split/dub.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tsv-split", 3 | "description": "Split a file into multiple files.", 4 | "homepage": "https://github.com/eBay/tsv-utils", 5 | "authors": ["Jon Degenhardt"], 6 | "copyright": "Copyright (c) 2020-2021, eBay Inc.", 7 | "license": "BSL-1.0", 8 | "targetType": "executable", 9 | "configurations": [ 10 | { 11 | "name" : "executable", 12 | "targetName": "tsv-split", 13 | "targetPath": "../bin/", 14 | "mainSourceFile": "src/tsv_utils/tsv-split.d", 15 | "dependencies": { 16 | "tsv-utils:common": { "path": ".." } 17 | } 18 | }, 19 | { 20 | "name": "unittest", 21 | "targetType": "none" 22 | } 23 | ], 24 | "buildTypes": { 25 | "debug": { "buildOptions": ["debugMode", "optimize"] }, 26 | "release": { "buildOptions": ["releaseMode", "optimize", "inline"], 27 | "dflags": ["-boundscheck=off"], 28 | "dflags-osx-ldc": ["-flto=thin"] } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /tsv-split/makefile: -------------------------------------------------------------------------------- 1 | # PGO disabled for now due to LDC compilation failures starting with LDC 1.12.0 2 | # APP_USES_LDC_PGO=2 3 | include ../makedefs.mk 4 | include ../makeapp.mk 5 | -------------------------------------------------------------------------------- /tsv-split/profile_data/collect_profile_data.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | 3 | if [ $# -eq 0 ]; then 4 | echo "Insufficient arguments. The path of the instrumented program is required."
5 | exit 1 6 | fi 7 | 8 | prog=$1 9 | shift 10 | 11 | ldc_profdata_tool_name=ldc-profdata 12 | ldc_profdata_tool=${ldc_profdata_tool_name} 13 | 14 | if [ $# -ne 0 ]; then 15 | ldc_profdata_tool=${1}/bin/${ldc_profdata_tool_name} 16 | fi 17 | 18 | for f in profile.*.raw; do 19 | if [ -e $f ]; then 20 | rm $f 21 | fi 22 | done 23 | 24 | if [ -e app.profdata ]; then 25 | rm -f app.profdata 26 | fi 27 | 28 | mkdir -p odir 29 | 30 | $prog --dir odir profile_data_1.tsv --lines-per-file 10 ; rm odir/* 31 | $prog --dir odir profile_data_1.tsv --lines-per-file 100 ; rm odir/* 32 | $prog --dir odir profile_data_1.tsv --num-files 5 ; rm odir/* 33 | $prog --dir odir profile_data_1.tsv --num-files 50 ; rm odir/* 34 | $prog --dir odir profile_data_1.tsv --num-files 5 -k 1 ; rm odir/* 35 | $prog --dir odir profile_data_1.tsv --num-files 50 -k 1 ; rm odir/* 36 | 37 | $prog --dir odir profile_data_2.tsv --lines-per-file 500 ; rm odir/* 38 | $prog --dir odir profile_data_2.tsv --lines-per-file 20 -H ; rm odir/* 39 | $prog --dir odir profile_data_2.tsv --num-files 100 -H ; rm odir/* 40 | $prog --dir odir profile_data_2.tsv --num-files 100 -I ; rm odir/* 41 | $prog --dir odir profile_data_2.tsv --num-files 5 -k 1 ; rm odir/* 42 | cat profile_data_2.tsv | $prog --dir odir --lines-per-file 100 -I ; rm odir/* 43 | cat profile_data_2.tsv | $prog --dir odir --num-files 100 -I ; rm odir/* 44 | cat profile_data_2.tsv | $prog --dir odir --num-files 100 -k 2,4 -I ; rm odir/* 45 | cat profile_data_2.tsv | $prog --dir odir --num-files 100 -k 1,3 -H ; rm odir/* 46 | 47 | $prog --dir odir profile_data_3.tsv --lines-per-file 300 ; rm odir/* 48 | $prog --dir odir profile_data_3.tsv --num-files 200 --max-open-files 20 ; rm odir/* 49 | $prog --dir odir profile_data_3.tsv --num-files 200 -k 4 --max-open-files 20 ; rm odir/* 50 | $prog --dir odir profile_data_3.tsv --num-files 200 -k 2,3 --max-open-files 100 ; rm odir/* 51 | 52 | $prog --dir odir --lines-per-file 1000 profile_data_1.tsv 
profile_data_2.tsv profile_data_3.tsv ; rm odir/* 53 | $prog --dir odir --num-files 100 profile_data_1.tsv profile_data_2.tsv profile_data_3.tsv ; rm odir/* 54 | $prog --dir odir --num-files 100 -k 1,2 profile_data_1.tsv profile_data_2.tsv profile_data_3.tsv ; rm odir/* 55 | 56 | rmdir odir 57 | 58 | ${ldc_profdata_tool} merge -o app.profdata profile.*.raw 59 | -------------------------------------------------------------------------------- /tsv-split/tests/empty-file.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eBay/tsv-utils/38ed0a1c31742bd8b59196517e89ff0b51e8fb80/tsv-split/tests/empty-file.txt -------------------------------------------------------------------------------- /tsv-split/tests/gold/help_and_version_tests.txt: -------------------------------------------------------------------------------- 1 | 2 | Help and Version printing 1 3 | ----------------- 4 | 5 | ====[tsv-split --help | grep -c Synopsis]==== 6 | 1 7 | ====[tsv-split --help-verbose | grep -c Synopsis]==== 8 | 1 9 | ====[tsv-split --help-fields | head -n 1]==== 10 | tsv-utils Field Syntax 11 | ====[tsv-split --version | grep -c 'tsv-split (eBay/tsv-utils)']==== 12 | 1 13 | ====[tsv-split -V | grep -c 'tsv-split (eBay/tsv-utils)']==== 14 | 1 15 | -------------------------------------------------------------------------------- /tsv-split/tests/input1x3.txt: -------------------------------------------------------------------------------- 1 | input1x3.txt: line 1 2 | input1x3.txt: line 2 3 | input1x3.txt: line 3 4 | -------------------------------------------------------------------------------- /tsv-split/tests/input1x5.txt: -------------------------------------------------------------------------------- 1 | input1x5.txt: line 1 2 | input1x5.txt: line 2 3 | input1x5.txt: line 3 4 | input1x5.txt: line 4 5 | input1x5.txt: line 5 6 | -------------------------------------------------------------------------------- 
/tsv-split/tests/input4x18.tsv: -------------------------------------------------------------------------------- 1 | c-1 c-2 c-3 c-4 2 | pechschwarz 2079 Araqua-pintado 1 3 | Grünspan 2105 Macuco 0 4 | ébène 2142 Орёл 2 5 | blutrot 2093 Araqua-pintado 2 6 | fumée 2129 Löffelente 0 7 | jaune 2090 Weißwangengans 4 8 | türkis 2097 Голубь 2 9 | Grünspan 2100 Purpurreiher 1 10 | rouge 2058 Pipit de Godlewski 4 11 | marrón 2121 Marreca-cabocla 2 12 | GRÜNSPAN 2145 pipit de godlewski 4 13 | blutrot 2137 Marreca-cabocla 1 14 | jaune 2058 Purpurreiher 2 15 | BLUTROT 2142 TÜPFELSUMPFHUHN 1 16 | jaune 2104 Tüpfelsumpfhuhn 1 17 | dorado 2147 Tüpfelsumpfhuhn 1 18 | jaune 2097 Purpurreiher 4 19 | -------------------------------------------------------------------------------- /tsv-split/tests/input4x58.tsv: -------------------------------------------------------------------------------- 1 | c-1 c-2 c-3 c-4 2 | púrpura 2088 Macuco 1 3 | blutrot 2142 Tüpfelsumpfhuhn 1 4 | Blutrot 2142 tüpfelsumpfhuhn 1 5 | BLUTROT 2142 TÜPFELSUMPFHUHN 1 6 | Indigo 2056 Голубь 1 7 | Indigo 2141 Голубь 1 8 | blutrot 2118 Tüpfelsumpfhuhn 4 9 | Cerise 2076 Malvasía Cabeciblanca 4 10 | schneeweiß 2117 Porrón Islándico 4 11 | café 2088 Purpurreiher 2 12 | Orange-red 2089 Purpurreiher 1 13 | 暗紅色/暗赤色 2015 Malvasía Cabeciblanca 2 14 | rouge 2132 Löffelente 4 15 | zitronengelb 2136 Macreuse à bec jaune 3 16 | Grünspan 2145 Pipit de Godlewski 4 17 | grünspan 2145 PIPIT DE GODLEWSKI 4 18 | GRÜNSPAN 2145 pipit de godlewski 4 19 | Blutrot 2142 tüpfelsumpfhuhn 1 20 | zitronengelb 2083 Macreuse à bec jaune 4 21 | púrpura 2070 Macreuse à bec jaune 2 22 | blutrot 2121 Воробей 4 23 | marrón 2102 Weißwangengans 1 24 | púrpura 2092 Macreuse à bec jaune 4 25 | rouge 2146 Löffelente 1 26 | blutrot 2117 Tüpfelsumpfhuhn 0 27 | púrpura 2093 Macreuse à bec jaune 4 28 | schneeweiß 2121 Porrón Islándico 1 29 | dorado 2045 Лебедь 2 30 | café 2062 Purpurreiher 4 31 | Cerise 2119 Malvasía Cabeciblanca 4 32 | noir 2082 
Weißwangengans 1 33 | noir 2094 Weißwangengans 4 34 | blanc 2137 Голубь 2 35 | café 2100 Purpurreiher 2 36 | BLUTROT 2142 TÜPFELSUMPFHUHN 1 37 | rouge 2135 Marreca-cabocla 1 38 | красный 2049 Weißwangengans 3 39 | noir 2115 Weißwangengans 2 40 | grünspan 2145 PIPIT DE GODLEWSKI 4 41 | blanc 2109 Голубь 3 42 | blutrot 2143 Tüpfelsumpfhuhn 4 43 | púrpura 2119 Macuco 4 44 | púrpura 2145 Macreuse à bec jaune 2 45 | Grünspan 2082 Pipit de Godlewski 2 46 | blutrot 2149 Tüpfelsumpfhuhn 1 47 | blutrot 2120 Tüpfelsumpfhuhn 1 48 | café 2041 Purpurreiher 4 49 | GRÜNSPAN 2145 pipit de godlewski 4 50 | Indigo 2138 Голубь 4 51 | café 2019 Marreca-cabocla 1 52 | Grünspan 2053 Pipit de Godlewski 4 53 | blanc 2038 Голубь 0 54 | Grünspan 2071 Pipit de Godlewski 3 55 | rouge 2037 Marreca-cabocla 0 56 | fumée 2113 Araqua-pintado 0 57 | marrón 2034 Weißwangengans 1 58 | fumée 2124 Araqua-pintado 0 59 | -------------------------------------------------------------------------------- /tsv-split/tests/input4x58_colon-delim.tsv: -------------------------------------------------------------------------------- 1 | c-1:c-2:c-3:c-4 2 | púrpura:2088:Macuco:1 3 | blutrot:2142:Tüpfelsumpfhuhn:1 4 | Blutrot:2142:tüpfelsumpfhuhn:1 5 | BLUTROT:2142:TÜPFELSUMPFHUHN:1 6 | Indigo:2056:Голубь:1 7 | Indigo:2141:Голубь:1 8 | blutrot:2118:Tüpfelsumpfhuhn:4 9 | Cerise:2076:Malvasía Cabeciblanca:4 10 | schneeweiß:2117:Porrón Islándico:4 11 | café:2088:Purpurreiher:2 12 | Orange-red:2089:Purpurreiher:1 13 | 暗紅色/暗赤色:2015:Malvasía Cabeciblanca:2 14 | rouge:2132:Löffelente:4 15 | zitronengelb:2136:Macreuse à bec jaune:3 16 | Grünspan:2145:Pipit de Godlewski:4 17 | grünspan:2145:PIPIT DE GODLEWSKI:4 18 | GRÜNSPAN:2145:pipit de godlewski:4 19 | Blutrot:2142:tüpfelsumpfhuhn:1 20 | zitronengelb:2083:Macreuse à bec jaune:4 21 | púrpura:2070:Macreuse à bec jaune:2 22 | blutrot:2121:Воробей:4 23 | marrón:2102:Weißwangengans:1 24 | púrpura:2092:Macreuse à bec jaune:4 25 | rouge:2146:Löffelente:1 26 | 
blutrot:2117:Tüpfelsumpfhuhn:0 27 | púrpura:2093:Macreuse à bec jaune:4 28 | schneeweiß:2121:Porrón Islándico:1 29 | dorado:2045:Лебедь:2 30 | café:2062:Purpurreiher:4 31 | Cerise:2119:Malvasía Cabeciblanca:4 32 | noir:2082:Weißwangengans:1 33 | noir:2094:Weißwangengans:4 34 | blanc:2137:Голубь:2 35 | café:2100:Purpurreiher:2 36 | BLUTROT:2142:TÜPFELSUMPFHUHN:1 37 | rouge:2135:Marreca-cabocla:1 38 | красный:2049:Weißwangengans:3 39 | noir:2115:Weißwangengans:2 40 | grünspan:2145:PIPIT DE GODLEWSKI:4 41 | blanc:2109:Голубь:3 42 | blutrot:2143:Tüpfelsumpfhuhn:4 43 | púrpura:2119:Macuco:4 44 | púrpura:2145:Macreuse à bec jaune:2 45 | Grünspan:2082:Pipit de Godlewski:2 46 | blutrot:2149:Tüpfelsumpfhuhn:1 47 | blutrot:2120:Tüpfelsumpfhuhn:1 48 | café:2041:Purpurreiher:4 49 | GRÜNSPAN:2145:pipit de godlewski:4 50 | Indigo:2138:Голубь:4 51 | café:2019:Marreca-cabocla:1 52 | Grünspan:2053:Pipit de Godlewski:4 53 | blanc:2038:Голубь:0 54 | Grünspan:2071:Pipit de Godlewski:3 55 | rouge:2037:Marreca-cabocla:0 56 | fumée:2113:Araqua-pintado:0 57 | marrón:2034:Weißwangengans:1 58 | fumée:2124:Araqua-pintado:0 59 | -------------------------------------------------------------------------------- /tsv-summarize/README.md: -------------------------------------------------------------------------------- 1 | _Visit the eBay TSV utilities [main page](../README.md)_ 2 | 3 | # tsv-summarize 4 | 5 | `tsv-summarize` performs statistical calculations on fields. For example, generating the sum or median of a field's values. Calculations can be run across the entire input or can be grouped by key fields. Consider the file `data.tsv`: 6 | ``` 7 | color weight 8 | red 6 9 | red 5 10 | blue 15 11 | red 4 12 | blue 10 13 | ``` 14 | Calculations of the sum and mean of the `weight` column is shown below. The first command runs calculations on all values. The second groups them by color. 
15 | ``` 16 | $ tsv-summarize --header --sum weight --mean weight data.tsv 17 | weight_sum weight_mean 18 | 40 8 19 | 20 | $ tsv-summarize --header --group-by color --sum weight --mean weight data.tsv 21 | color weight_sum weight_mean 22 | red 15 5 23 | blue 25 12.5 24 | ``` 25 | 26 | Multiple fields can be used as the `--group-by` key. The file's sort order does not matter; there is no need to sort in the `--group-by` order first. Fields can be specified either by name or field number, like other tsv-utils tools. 27 | 28 | See the [tsv-summarize reference](../docs/tool_reference/tsv-summarize.md) for the list of statistical and other aggregation operations available. 29 | -------------------------------------------------------------------------------- /tsv-summarize/dub.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tsv-summarize", 3 | "description": "Run aggregation and summarization operations on fields from TSV files.", 4 | "homepage": "https://github.com/eBay/tsv-utils", 5 | "authors": ["Jon Degenhardt"], 6 | "copyright": "Copyright (c) 2016-2021, eBay Inc.", 7 | "license": "BSL-1.0", 8 | "targetType": "executable", 9 | "configurations": [ 10 | { 11 | "name" : "executable", 12 | "targetName": "tsv-summarize", 13 | "targetPath": "../bin/", 14 | "mainSourceFile": "src/tsv_utils/tsv-summarize.d", 15 | "dependencies": { 16 | "tsv-utils:common": { "path": ".."
} 17 | } 18 | }, 19 | { 20 | "name": "unittest", 21 | "targetType": "none" 22 | } 23 | ], 24 | "buildTypes": { 25 | "debug": { "buildOptions": ["debugMode", "optimize"] }, 26 | "release": { "buildOptions": ["releaseMode", "optimize", "inline"], 27 | "dflags": ["-boundscheck=off"], 28 | "dflags-osx-ldc": ["-flto=thin"] } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /tsv-summarize/makefile: -------------------------------------------------------------------------------- 1 | APP_USES_LDC_PGO=1 2 | include ../makedefs.mk 3 | include ../makeapp.mk 4 | -------------------------------------------------------------------------------- /tsv-summarize/profile_data/collect_profile_data.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | 3 | if [ $# -eq 0 ]; then 4 | echo "Insufficient arguments. The path of the instrumented program is required." 5 | exit 1 6 | fi 7 | 8 | prog=$1 9 | shift 10 | 11 | ldc_profdata_tool_name=ldc-profdata 12 | ldc_profdata_tool=${ldc_profdata_tool_name} 13 | 14 | if [ $# -ne 0 ]; then 15 | ldc_profdata_tool=${1}/bin/${ldc_profdata_tool_name} 16 | fi 17 | 18 | for f in profile.*.raw; do 19 | if [ -e $f ]; then 20 | rm $f 21 | fi 22 | done 23 | 24 | if [ -e app.profdata ]; then 25 | rm -f app.profdata 26 | fi 27 | 28 | $prog profile_data_1.tsv -H --min 3,8,16 --max 3,8,16 --range 3,8,16 --sum 3,8,16 --mean 4,9,17 > /dev/null 29 | $prog profile_data_1.tsv -H --median 19,5,11 --quantile 19,5:0.25,0.9 --mad 11,5 --var 8,9 --stdev 14,15 > /dev/null 30 | $prog profile_data_2.tsv --group-by 1 --max 3,4 --median 3,4 --sum 3,4 > /dev/null 31 | $prog profile_data_2.tsv --group-by 2 --mean 3,4 --median 3,4 --mad 3,4> /dev/null 32 | $prog profile_data_3.tsv -H --unique-count 1,3 --missing-count 5 --not-missing-count 5 --unique-values 4 > /dev/null 33 | $prog profile_data_3.tsv -H --group-by 1,3 --count --range 6-8 --median 6-8 > /dev/null 34 | $prog 
profile_data_3.tsv -H --group-by 1 --count --retain 2 --first 6 --last 7 --mode 5 --mode-count 5 --values 3 > /dev/null 35 | 36 | ${ldc_profdata_tool} merge -o app.profdata profile.*.raw 37 | -------------------------------------------------------------------------------- /tsv-summarize/tests/empty_file.tsv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eBay/tsv-utils/38ed0a1c31742bd8b59196517e89ff0b51e8fb80/tsv-summarize/tests/empty_file.tsv -------------------------------------------------------------------------------- /tsv-summarize/tests/input_1field_a.dos_tsv: -------------------------------------------------------------------------------- 1 | size 2 | 10 3 | small 4 | 5 | small 6 | 8 7 | 10 8 | -------------------------------------------------------------------------------- /tsv-summarize/tests/input_1field_a.tsv: -------------------------------------------------------------------------------- 1 | size 2 | 10 3 | small 4 | 5 | small 6 | 8 7 | 10 8 | -------------------------------------------------------------------------------- /tsv-summarize/tests/input_1field_b.tsv: -------------------------------------------------------------------------------- 1 | size 2 | 9 3 | medium 4 | 10 5 | -------------------------------------------------------------------------------- /tsv-summarize/tests/input_2field_a.tsv: -------------------------------------------------------------------------------- 1 | field1 field2 2 | k3 10 3 | k3 20 4 | k2 5 5 | k1 15 6 | k3 25 7 | k2 10 8 | k4 16 9 | k4 20 10 | k5 15 11 | k5 14 12 | k4 18 13 | k5 13 14 | k5 22 15 | k4 17 16 | k5 27 17 | -------------------------------------------------------------------------------- /tsv-summarize/tests/input_2field_b.tsv: -------------------------------------------------------------------------------- 1 | field1 field2 2 | k3 10 3 | k3 20 4 | k2 5 5 | k1 15 6 | k3 25 7 | k2 10 8 | k4 16 9 | k4 20 10 | k5 15 11 | k5 14 12 | k4 18 13 | 
k5 13 14 | k5 22 15 | k4 17 16 | k5 27 17 | -------------------------------------------------------------------------------- /tsv-summarize/tests/input_5field_a.tsv: -------------------------------------------------------------------------------- 1 | color pattern length width height 2 | red solid 10 4 7 3 | red striped 8 6 6 4 | blue solid 16 2 4 5 | green solid 11 5.5 3.2 6 | blue striped 12 1 2 7 | blue solid 14 4 3 8 | green solid 7.4 6.0 5.4 9 | -------------------------------------------------------------------------------- /tsv-summarize/tests/input_5field_b.tsv: -------------------------------------------------------------------------------- 1 | color pattern length width height 2 | red solid 6 2 5 3 | 赤 水玉模様 8 6 6 4 | 青 弁慶縞 10 5.5 4.5 5 | 赤 水玉模様 9 7 8 6 | -------------------------------------------------------------------------------- /tsv-summarize/tests/input_5field_c.tsv: -------------------------------------------------------------------------------- 1 | color pattern length width height 2 | red checked 10 4 7 3 | -------------------------------------------------------------------------------- /tsv-summarize/tests/input_5field_d.tsv: -------------------------------------------------------------------------------- 1 | color pattern length width height 2 | red solid 0.11 0.11 0.12345678901234567 3 | red plaid 0.011 0.11 0.012345678901234567 4 | blue plaid 0.111 0.11 0.2345678901234567891 5 | blue solid 0.1 0.11 0.1234567899876543211 6 | green plaid 0.11 0.11 0.1111111133333333333 7 | red solid 0.1111 0.11 0.3333333311111111111 8 | -------------------------------------------------------------------------------- /tsv-summarize/tests/input_5field_header_only.tsv: -------------------------------------------------------------------------------- 1 | color pattern length width height 2 | -------------------------------------------------------------------------------- /tsv-summarize/tests/test-config.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "output_files" : [ 3 | { 4 | "name" : "basic_tests_1.txt" 5 | }, 6 | { 7 | "name" : "error_tests_1.txt", 8 | "versions" : [ 9 | "error_tests_1.2081.txt" 10 | ] 11 | } 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- /tsv-uniq/README.md: -------------------------------------------------------------------------------- 1 | _Visit the eBay TSV utilities [main page](../README.md)_ 2 | 3 | # tsv-uniq 4 | 5 | Similar in spirit to the Unix `uniq` tool, `tsv-uniq` filters a dataset so there is only one copy of each unique line. `tsv-uniq` goes beyond Unix `uniq` in a couple ways. First, data does not need to be sorted. Second, equivalence can be based on a subset of fields rather than the full line. 6 | 7 | `tsv-uniq` can also be run in 'equivalence class identification' mode, where lines with equivalent keys are marked with a unique id rather than filtered out. Another variant is 'number' mode, which generates line numbers grouped by the key. 8 | 9 | `tsv-uniq` operates on the entire line when no fields are specified. This is a useful alternative to the traditional `sort -u` or `sort | uniq` paradigms for identifying unique lines in unsorted files, as it is quite a bit faster, especially when there are many duplicate lines. As a bonus, order of the input lines is retained. 10 | 11 | Examples: 12 | ``` 13 | $ # Unique a file based on the full line. 14 | $ tsv-uniq data.tsv 15 | 16 | $ # Unique a file with fields 2 and 3 as the key. 17 | $ tsv-uniq -f 2,3 data.tsv 18 | 19 | $ # Unique a file using the 'RecordID' field as the key. 20 | $ tsv-uniq -H -f RecordID data.tsv 21 | ``` 22 | 23 | An in-memory lookup table is used to record unique entries. This ultimately limits the data sizes that can be processed. 
The author has found that datasets with up to about 10 million unique entries work fine, but performance starts to degrade after that. Even then it remains faster than the alternatives. 24 | 25 | See the [tsv-uniq reference](../docs/tool_reference/tsv-uniq.md) for details. 26 | -------------------------------------------------------------------------------- /tsv-uniq/dub.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tsv-uniq", 3 | "description": "Output unique lines in TSV files using a subset of fields.", 4 | "homepage": "https://github.com/eBay/tsv-utils", 5 | "authors": ["Jon Degenhardt"], 6 | "copyright": "Copyright (c) 2015-2021, eBay Inc.", 7 | "license": "BSL-1.0", 8 | "targetType": "executable", 9 | "configurations": [ 10 | { 11 | "name" : "executable", 12 | "targetName": "tsv-uniq", 13 | "targetPath": "../bin/", 14 | "mainSourceFile": "src/tsv_utils/tsv-uniq.d", 15 | "dependencies": { 16 | "tsv-utils:common": { "path": ".." } 17 | } 18 | }, 19 | { 20 | "name": "unittest", 21 | "targetType": "none" 22 | } 23 | ], 24 | "buildTypes": { 25 | "debug": { "buildOptions": ["debugMode", "optimize"] }, 26 | "release": { "buildOptions": ["releaseMode", "optimize", "inline"], 27 | "dflags": ["-boundscheck=off"], 28 | "dflags-osx-ldc": ["-flto=thin"] } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /tsv-uniq/makefile: -------------------------------------------------------------------------------- 1 | APP_USES_LDC_PGO=2 2 | include ../makedefs.mk 3 | include ../makeapp.mk 4 | 5 | # No built-in unit tests 6 | unittest: ; 7 | unittest-codecov: ; 8 | -------------------------------------------------------------------------------- /tsv-uniq/profile_data/collect_profile_data.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | 3 | if [ $# -eq 0 ]; then 4 | echo "Insufficient arguments. 
The path of the instrumented program is required." 5 | exit 1 6 | fi 7 | 8 | prog=$1 9 | shift 10 | 11 | ldc_profdata_tool_name=ldc-profdata 12 | ldc_profdata_tool=${ldc_profdata_tool_name} 13 | 14 | if [ $# -ne 0 ]; then 15 | ldc_profdata_tool=${1}/bin/${ldc_profdata_tool_name} 16 | fi 17 | 18 | for f in profile.*.raw; do 19 | if [ -e $f ]; then 20 | rm $f 21 | fi 22 | done 23 | 24 | if [ -e app.profdata ]; then 25 | rm -f app.profdata 26 | fi 27 | 28 | $prog profile_data_1.tsv > /dev/null 29 | $prog profile_data_1.tsv -H > /dev/null 30 | $prog profile_data_1.tsv -i > /dev/null 31 | $prog profile_data_1.tsv -f 1 > /dev/null 32 | $prog profile_data_1.tsv -f 3 > /dev/null 33 | $prog profile_data_1.tsv -H -f 1,3 > /dev/null 34 | $prog profile_data_1.tsv -H -i -f 2,3,4 > /dev/null 35 | 36 | $prog profile_data_2.tsv > /dev/null 37 | $prog profile_data_2.tsv -f 1 > /dev/null 38 | $prog profile_data_2.tsv -f 2 > /dev/null 39 | $prog profile_data_2.tsv > /dev/null 40 | $prog profile_data_2.tsv -H -i > /dev/null 41 | $prog profile_data_2.tsv > /dev/null 42 | 43 | $prog profile_data_3.tsv > /dev/null 44 | $prog profile_data_3.tsv -f 1 > /dev/null 45 | $prog profile_data_3.tsv -H -f 3 > /dev/null 46 | $prog profile_data_3.tsv > /dev/null 47 | $prog profile_data_3.tsv -i > /dev/null 48 | $prog profile_data_3.tsv -f 2 > /dev/null 49 | $prog profile_data_3.tsv -i -f 3 > /dev/null 50 | 51 | ${ldc_profdata_tool} merge -o app.profdata profile.*.raw 52 | -------------------------------------------------------------------------------- /tsv-uniq/tests/empty-file.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eBay/tsv-utils/38ed0a1c31742bd8b59196517e89ff0b51e8fb80/tsv-uniq/tests/empty-file.txt -------------------------------------------------------------------------------- /tsv-uniq/tests/gold/error_tests_1.txt: -------------------------------------------------------------------------------- 1 | Error test set 1 
2 | ---------------- 3 | 4 | ====[tsv-uniq -f 1,0 input1.tsv]==== 5 | [tsv-uniq] Error processing command line arguments: Whole line as key (--f|field 0) cannot be combined with multiple fields. 6 | 7 | ====[tsv-uniq -f 1,g input1.tsv]==== 8 | [tsv-uniq] Error processing command line arguments: [--f|fields] Non-numeric field group: 'g'. Use '--H|header' when using named field groups. 9 | 10 | ====[tsv-uniq -f 1-g input1.tsv]==== 11 | [tsv-uniq] Error processing command line arguments: [--f|fields] Non-numeric field group: '1-g'. Use '--H|header' when using named field groups. 12 | 13 | ====[tsv-uniq -f 0-2 input1.tsv]==== 14 | [tsv-uniq] Error processing command line arguments: [--f|fields] Zero cannot be used as part of a range: '0-2'. 15 | 16 | ====[tsv-uniq -f 1- input1.tsv]==== 17 | [tsv-uniq] Error processing command line arguments: [--f|fields] Incomplete ranges are not supported: '1-'. 18 | 19 | ====[tsv-uniq -d abc -f 2 input1.tsv]==== 20 | [tsv-uniq] Error processing command line arguments: Unexpected 'b' when converting from type string to type char 21 | 22 | ====[tsv-uniq -d ß -f 1 input1.tsv]==== 23 | [tsv-uniq] Error processing command line arguments: Invalid UTF-8 sequence (at index 1) 24 | 25 | ====[tsv-uniq -f 2 --equiv-start 10 input1.tsv]==== 26 | [tsv-uniq] Error processing command line arguments: --equiv-start requires --e|equiv 27 | 28 | ====[tsv-uniq -f 2 --equiv-header abc input1.tsv]==== 29 | [tsv-uniq] Error processing command line arguments: --equiv-header requires --e|equiv 30 | 31 | ====[tsv-uniq -f 2 --number-header abc input1.tsv]==== 32 | [tsv-uniq] Error processing command line arguments: --number-header requires --z|number 33 | 34 | ====[tsv-uniq -f 2,30 input1.tsv]==== 35 | Error [tsv-uniq]: Not enough fields in line. File: input1.tsv, Line: 1 36 | 37 | ====[tsv-uniq -f 2-30 input1.tsv]==== 38 | Error [tsv-uniq]: Not enough fields in line. 
File: input1.tsv, Line: 1 39 | 40 | ====[tsv-uniq -H -f 1,0 input1.tsv]==== 41 | [tsv-uniq] Error processing command line arguments: Whole line as key (--f|field 0) cannot be combined with multiple fields. 42 | 43 | ====[tsv-uniq -H -f f1,0 input1.tsv]==== 44 | [tsv-uniq] Error processing command line arguments: Whole line as key (--f|field 0) cannot be combined with multiple fields. 45 | 46 | ====[tsv-uniq -H -f 1,g input1.tsv]==== 47 | [tsv-uniq] Error processing command line arguments: [--f|fields] Field not found in file header: 'g'. 48 | 49 | ====[tsv-uniq -H -f f1,g input1.tsv]==== 50 | [tsv-uniq] Error processing command line arguments: [--f|fields] Field not found in file header: 'g'. 51 | 52 | ====[tsv-uniq -H -f 1-g input1.tsv]==== 53 | [tsv-uniq] Error processing command line arguments: [--f|fields] Ranges with both numeric and named components are not supported: '1-g'. 54 | 55 | ====[tsv-uniq -H -f 0-2 input1.tsv]==== 56 | [tsv-uniq] Error processing command line arguments: [--f|fields] Zero cannot be used as part of a range: '0-2'. 57 | 58 | ====[tsv-uniq -H -f 1- input1.tsv]==== 59 | [tsv-uniq] Error processing command line arguments: [--f|fields] Incomplete ranges are not supported: '1-'. 
60 | 61 | ====[tsv-uniq -H -d abc -f f2 input1.tsv]==== 62 | [tsv-uniq] Error processing command line arguments: Unexpected 'b' when converting from type string to type char 63 | 64 | ====[tsv-uniq -H -d ß -f f1 input1.tsv]==== 65 | [tsv-uniq] Error processing command line arguments: Invalid UTF-8 sequence (at index 1) 66 | 67 | ====[tsv-uniq -H -f 2 --equiv-start 10 input1.tsv]==== 68 | [tsv-uniq] Error processing command line arguments: --equiv-start requires --e|equiv 69 | 70 | ====[tsv-uniq -H -f 2 --equiv-header abc input1.tsv]==== 71 | [tsv-uniq] Error processing command line arguments: --equiv-header requires --e|equiv 72 | 73 | ====[tsv-uniq -H -f 2 --number-header abc input1.tsv]==== 74 | [tsv-uniq] Error processing command line arguments: --number-header requires --z|number 75 | 76 | ====[tsv-uniq -H -f 2,30 input1.tsv]==== 77 | f1 f2 f3 f4 f5 78 | Error [tsv-uniq]: Not enough fields in line. File: input1.tsv, Line: 2 79 | 80 | ====[tsv-uniq -H -f 2-30 input1.tsv]==== 81 | f1 f2 f3 f4 f5 82 | Error [tsv-uniq]: Not enough fields in line. 
File: input1.tsv, Line: 2 83 | -------------------------------------------------------------------------------- /tsv-uniq/tests/input1.tsv: -------------------------------------------------------------------------------- 1 | f1 f2 f3 f4 f5 2 | 1 ggg UUU 101 15 3 | 2 bbb ZZZ 21 28 4 | 3 ggg CCC 5734 52 5 | 4 ddd ZZZ 65 602 6 | 5 ßßß SSS 7 771 7 | 6 sss sss 17 15 8 | 7 ssssss sss 18 16 9 | 8 àbc P 1209 1234 10 | 9 ÀBC 1367 1331 11 | 10 e-e 1602 12 | 11 0 X 1721 1703 13 | 9 ÀBC 1367 1331 14 | 12 sp-sp 2020 15 | 13 0.0 Z 1931 1956 16 | 8 ÀBC p 1209 1234 17 | 14 sp-sp 2022 18 | 3 ggg CCC 5734 52 19 | 9 ÀBC 1367 1331 20 | 17 0 Z 5734 602 21 | -------------------------------------------------------------------------------- /tsv-uniq/tests/input1_noheader.tsv: -------------------------------------------------------------------------------- 1 | 1 ggg UUU 101 15 2 | 2 bbb ZZZ 21 28 3 | 3 ggg CCC 5734 52 4 | 4 ddd ZZZ 65 602 5 | 5 ßßß SSS 7 771 6 | 6 sss sss 17 15 7 | 7 ssssss sss 18 16 8 | 8 àbc P 1209 1234 9 | 9 ÀBC 1367 1331 10 | 10 e-e 1602 11 | 11 0 X 1721 1703 12 | 9 ÀBC 1367 1331 13 | 12 sp-sp 2020 14 | 13 0.0 Z 1931 1956 15 | 8 ÀBC p 1209 1234 16 | 14 sp-sp 2022 17 | 3 ggg CCC 5734 52 18 | 9 ÀBC 1367 1331 19 | 17 0 Z 5734 602 20 | -------------------------------------------------------------------------------- /tsv-uniq/tests/input2.tsv: -------------------------------------------------------------------------------- 1 | f1 f2 f3 f4 f5 2 | 1 ggg UUU 101 15 3 | 3 ggg CCC 5734 52 4 | 5 ßßß SSSa 7 771 5 | 17 8 str 9997 8886 6 | 5 ßßßz SSS 7 771 7 | 16 pqr uvw 9998 8887 8 | 5 ßßß SSS 7 771 9 | 5 ßßß SSS 76 771 10 | 11 0 X 1721 1703 11 | 9 ÀBC 1367 1331 12 | 15 7 xyz 9999 8888 13 | -------------------------------------------------------------------------------- /tsv-uniq/tests/input3.tsv: -------------------------------------------------------------------------------- 1 | f1 f2 f3 f4 f5 2 | 1 Green Grün 緑 Verde 3 | 2 WHITE WEIẞ 白い BLANCA 4 | 3 teal blaugrün ティール azulado 5 
| 4 Soccer Fútbol サッカー Fútbol 6 | 5 Baseball Baseball 野球 Béisbol 7 | 1 GREEN GRÜN 緑 VERDE 8 | 2 White Weiß 白い Blanca 9 | 3 TEAL BLAUGRÜN ティール AZULADO 10 | 4 soccer fútbol サッカー fútbol 11 | 5 BASEBALL BASEBALL 野球 BÉISBOL 12 | 1 green grün 緑 verde 13 | 2 white weiß 白い blanca 14 | 3 Teal Blaugrün ティール azulado 15 | 4 SOCCER FÚTBOL サッカー FÚTBOL 16 | 5 baseball baseball 野球 béisbol 17 | 1 green Grün 緑 verde 18 | 2 white WEISS 白い Blanca 19 | 3 Teal Blaugrün ティール azulado 20 | 4 SOCCER FÚTBOL サッカー fútbol 21 | 5 baseball BASEBALL 野球 béisbol 22 | -------------------------------------------------------------------------------- /tsv-uniq/tests/input_delim_underscore.tsv: -------------------------------------------------------------------------------- 1 | f1_f2_f3_f4_f5 2 | 1_ggg_UUU_101_15 3 | 2_bbb_ZZZ_21_28 4 | 3_ggg_CCC_5734_52 5 | 4_ddd_ZZZ_65_602 6 | 5_ßßß_SSS_ 7_771 7 | 6_sss_sss_17_15 8 | 7_ssssss_sss_18_16 9 | 8_àbc_P_1209_1234 10 | 9_ÀBC__1367_1331 11 | 10___e-e_1602 12 | 11_0_X_1721_1703 13 | 9_ÀBC__1367_1331 14 | 12_ _ _sp-sp_2020 15 | 13_0.0_Z_1931_1956 16 | 8_ÀBC_p_1209_1234 17 | 14_ _ _sp-sp_2022 18 | 3_ggg_CCC_5734_52 19 | 9_ÀBC__1367_1331 20 | 17_0_Z_5734_602 21 | --------------------------------------------------------------------------------