├── .editorconfig ├── .github └── workflows │ ├── main.yml │ └── release.yml ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── shard.yml ├── spec ├── csvzip_spec.cr ├── fixtures │ └── trains.csv └── spec_helper.cr └── src ├── cli.cr └── csvzip.cr /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*.cr] 4 | charset = utf-8 5 | end_of_line = lf 6 | insert_final_newline = true 7 | indent_style = space 8 | indent_size = 2 9 | trim_trailing_whitespace = true 10 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: csvzip tests 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | 11 | build_linux: 12 | runs-on: ubuntu-latest 13 | 14 | container: 15 | image: crystallang/crystal:latest-alpine 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | 20 | # this is for bash and ansible 21 | - name: Apk update 22 | run: apk update 23 | - name: Pre-req - bash 24 | run: apk add bash 25 | 26 | # csvzip tests 27 | - name: Install dependencies 28 | run: shards install 29 | - name: Run unittests 30 | run: crystal spec 31 | - name: Run tool format check 32 | run: crystal tool format --check 33 | - name: Build 34 | run: shards build 35 | 36 | build_macos: 37 | runs-on: macos-latest 38 | 39 | steps: 40 | - uses: actions/checkout@v2 41 | 42 | # install Crystal 43 | - name: Brew update 44 | run: brew update 45 | - name: Install Crystal 46 | run: brew install crystal 47 | 48 | # csvzip tests 49 | - name: Install dependencies 50 | run: shards install 51 | - name: Run unittests 52 | run: crystal spec 53 | - name: Run tool format check 54 | run: crystal tool format --check 55 | - name: Build 56 | run: shards build 57 | -------------------------------------------------------------------------------- 
/.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Upload Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - v* # Push events to matching v*, i.e. v1.0, v20.15.10 7 | 8 | jobs: 9 | 10 | build_linux: 11 | name: Build for GNU/Linux 12 | runs-on: ubuntu-latest 13 | 14 | # we build statically linked binaries using the official 15 | # Crystal alpine-linux docker image 16 | # (we need musl-libc, hence we use Alpine) 17 | # https://crystal-lang.org/2020/02/02/alpine-based-docker-images.html 18 | container: 19 | image: crystallang/crystal:latest-alpine 20 | 21 | steps: 22 | - uses: actions/checkout@v2 23 | 24 | - name: Install dependencies 25 | run: shards install 26 | - name: Build 27 | run: shards build --production --static 28 | 29 | - name: Upload build artifact 30 | uses: actions/upload-artifact@v1 31 | with: 32 | name: csvzip-linux 33 | path: ./bin/csvzip 34 | 35 | build_macos: 36 | name: Build for macOS 37 | runs-on: macos-latest 38 | 39 | steps: 40 | - uses: actions/checkout@v2 41 | 42 | - name: Brew update 43 | run: brew update 44 | - name: Install Crystal 45 | run: brew install crystal 46 | 47 | - name: Install dependencies 48 | run: shards install 49 | - name: Build 50 | run: shards build --production # no static linking on macOS: https://developer.apple.com/library/archive/qa/qa1118/_index.html 51 | 52 | - name: Upload build artifact 53 | uses: actions/upload-artifact@v1 54 | with: 55 | name: csvzip-darwin 56 | path: ./bin/csvzip 57 | 58 | create_release: 59 | needs: [build_linux, build_macos] 60 | name: Create Release 61 | runs-on: ubuntu-latest 62 | 63 | steps: 64 | - name: Create Release 65 | id: create_release 66 | uses: actions/create-release@v1 67 | env: 68 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 69 | with: 70 | tag_name: ${{ github.ref }} 71 | release_name: Release ${{ github.ref }} 72 | draft: false 73 | prerelease: false 74 | - name: Get the version # 
https://github.community/t5/GitHub-Actions/How-to-get-just-the-tag-name/m-p/32163/highlight/true#M1024 75 | id: get_version 76 | run: echo ::set-output name=VERSION::$(echo $GITHUB_REF | cut -d / -f 3) 77 | 78 | # get artifacts containing the builds 79 | - name: Download csvzip-linux 80 | uses: actions/download-artifact@v1 81 | with: 82 | name: csvzip-linux 83 | - name: Download csvzip-darwin 84 | uses: actions/download-artifact@v1 85 | with: 86 | name: csvzip-darwin 87 | 88 | # create tar archives 89 | - shell: bash 90 | run: | 91 | chmod +x ./csvzip-linux/csvzip 92 | chmod +x ./csvzip-darwin/csvzip 93 | tar -czf csvzip-${{ steps.get_version.outputs.VERSION }}-linux-amd64.tar.gz -C ./csvzip-linux . 94 | tar -czf csvzip-${{ steps.get_version.outputs.VERSION }}-darwin-amd64.tar.gz -C ./csvzip-darwin . 95 | 96 | # upload archives 97 | - name: Upload Release Asset Linux 98 | id: upload-release-asset-linux 99 | uses: actions/upload-release-asset@v1 100 | env: 101 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 102 | with: 103 | upload_url: ${{ steps.create_release.outputs.upload_url }} 104 | asset_path: ./csvzip-${{ steps.get_version.outputs.VERSION }}-linux-amd64.tar.gz 105 | asset_name: csvzip-${{ steps.get_version.outputs.VERSION }}-linux-amd64.tar.gz 106 | asset_content_type: application/gzip 107 | - name: Upload Release Asset macOS 108 | id: upload-release-asset-macos 109 | uses: actions/upload-release-asset@v1 110 | env: 111 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 112 | with: 113 | upload_url: ${{ steps.create_release.outputs.upload_url }} 114 | asset_path: ./csvzip-${{ steps.get_version.outputs.VERSION }}-darwin-amd64.tar.gz 115 | asset_name: csvzip-${{ steps.get_version.outputs.VERSION }}-darwin-amd64.tar.gz 116 | asset_content_type: application/gzip 117 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /docs/ 2 | /lib/ 3 | /bin/ 4 
| /.shards/ 5 | *.dwarf 6 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: crystal 2 | 3 | script: 4 | - crystal spec 5 | - crystal tool format --check 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2020 Fernando Blat 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # csvzip 2 | 3 | **csvzip** is a standalone CLI tool to reduce CSVs size by converting categorical columns in a list of unique integers. 
4 | 5 | The execution produces two files: 6 | 7 | 1. a CSV with the compressed values 8 | 2. a JSON dictionary with the mappings 9 | 10 | ‼️ Current limitations in input CSV file:‼️ 11 | - the CSV has to have a headers row 12 | 13 | We love [csvkit](https://csvkit.readthedocs.io) and csvzip has been inspired by that great tool 14 | 15 | ## Installation 16 | 17 | ### GNU/Linux 18 | 19 | You can download the latest binary from the [releases page](https://github.com/PopulateTools/csvzip/releases). 20 | 21 | ### macOS 22 | 23 | You can get the latest *darwin* build from the [releases page](https://github.com/PopulateTools/csvzip/releases). 24 | 25 | ### Windows 26 | 27 | Until the [Crystal Windows porting](https://github.com/crystal-lang/crystal/wiki/Porting-to-Windows) is completed, 28 | you can go with [Windows Subsystem for Linux](https://docs.microsoft.com/en-us/windows/wsl/install-win10). 29 | 30 | ## Usage 31 | 32 | Let's say you have downloaded a very big CSV, for example [Madrid census](https://datos.madrid.es/sites/v/index.jsp?vgnextoid=1d755cde99be2410VgnVCM1000000b205a0aRCRD&vgnextchannel=374512b9ace9f310VgnVCM100000171f5a0aRCRD): 33 | 34 | If we inspect it, it's a 22 MB file that looks like this: 35 | 36 | ``` 37 | "COD_DISTRITO";"DESC_DISTRITO";"COD_DIST_BARRIO";"DESC_BARRIO";"COD_BARRIO";"COD_DIST_SECCION";"COD_SECCION";"COD_EDAD_INT";"EspanolesHombres";"EspanolesMujeres";"ExtranjerosHombres";"ExtranjerosMujeres" 38 | "1";"CENTRO ";"101";"PALACIO ";"1";"1006";"6";"99";"";"1";"";"" 39 | "1";"CENTRO ";"101";"PALACIO ";"1";"1006";"6";"102";"";"1";"";"" 40 | "1";"CENTRO ";"101";"PALACIO ";"1";"1007";"7";"0";"2";"2";"";"1" 41 | "1";"CENTRO ";"101";"PALACIO ";"1";"1007";"7";"1";"3";"3";"";"" 42 | "1";"CENTRO ";"101";"PALACIO ";"1";"1007";"7";"2";"4";"3";"";"" 43 | "1";"CENTRO ";"101";"PALACIO ";"1";"1007";"7";"3";"1";"3";"";"" 44 | "1";"CENTRO ";"101";"PALACIO ";"1";"1007";"7";"4";"";"6";"";"1" 45 | "1";"CENTRO ";"101";"PALACIO ";"1";"1007";"7";"5";"2";"1";"";"" 46 | 
"1";"CENTRO ";"101";"PALACIO ";"1";"1007";"7";"6";"3";"4";"";"" 47 | ... 48 | ``` 49 | 50 | Let's compress it: 51 | ``` 52 | csvzip -i Rango_Edades_Seccion_202005.csv -o compressed.csv -c "DESC_DISTRITO,DESC_BARRIO" -k census -s ';' 53 | ``` 54 | 55 | ```$ head compressed.csv 56 | COD_DISTRITO,DESC_DISTRITO,COD_DIST_BARRIO,DESC_BARRIO,COD_BARRIO,COD_DIST_SECCION,COD_SECCION,COD_EDAD_INT,EspanolesHombres,EspanolesMujeres,ExtranjerosHombres,ExtranjerosMujeres 57 | 1,0,101,0,1,1006,6,99,,1,, 58 | 1,0,101,0,1,1006,6,102,,1,, 59 | 1,0,101,0,1,1007,7,0,2,2,,1 60 | 1,0,101,0,1,1007,7,1,3,3,, 61 | 1,0,101,0,1,1007,7,2,4,3,, 62 | 1,0,101,0,1,1007,7,3,1,3,, 63 | ... 64 | ``` 65 | 66 | And the size is now 7.2 MB. 67 | 68 | If we inspect the dictionary, it maps each encoded id back to the original value of those columns: 69 | 70 | ```$ cat dictionary.json | jq 71 | { 72 | "census": { 73 | "DESC_DISTRITO": { 74 | "0": "CENTRO", 75 | "1": "ARGANZUELA", 76 | "2": "RETIRO", 77 | "3": "SALAMANCA", 78 | ... 79 | }, 80 | "DESC_BARRIO": { 81 | "0": "PALACIO", 82 | "1": "EMBAJADORES", 83 | "2": "UNIVERSIDAD", 84 | "3": "CHOPERA", 85 | "4": "PACIFICO", 86 | ... 87 | } 88 | } 89 | } 90 | ``` 91 | 92 | ## Todo 93 | 94 | - [ ] Improve specs coverage 95 | - [ ] accept headers parameter 96 | - [ ] decompress operation 97 | 98 | ## Contributing 99 | 100 | 1. Fork it (https://github.com/PopulateTools/csvzip/fork) 101 | 2. Create your feature branch (`git checkout -b my-new-feature`) 102 | 3. Commit your changes (`git commit -am 'Add some feature'`) 103 | 4. Push to the branch (`git push origin my-new-feature`) 104 | 5. Create a new Pull Request 105 | 106 | ## Thanks 107 | 108 | Thanks to marcobellaccini from [nanvault](https://github.com/marcobellaccini/nanvault) for the inspiration to build this CLI tool. The structure of the project and the GitHub Actions scripts are copied from that repository. 
109 | 110 | ## Contributors 111 | 112 | - [Fernando Blat](https://github.com/ferblape) - creator and maintainer 113 | -------------------------------------------------------------------------------- /shard.yml: -------------------------------------------------------------------------------- 1 | name: csvzip 2 | version: 0.2.0 3 | 4 | authors: 5 | - Fernando Blat 6 | 7 | targets: 8 | csvzip: 9 | main: src/cli.cr 10 | 11 | crystal: 0.34.0 12 | 13 | license: MIT 14 | -------------------------------------------------------------------------------- /spec/csvzip_spec.cr: -------------------------------------------------------------------------------- 1 | require "./spec_helper" 2 | require "file_utils" 3 | 4 | describe Csvzip do 5 | describe Csvzip::Compressor do 6 | it "compresses the provided columns" do 7 | input_file = "spec/fixtures/trains.csv" 8 | output_file = "out.csv" 9 | dictionary_file = "dictionary.json" 10 | dictionary_key = "csvzip_test" 11 | separator = ',' 12 | quote_char = '"' 13 | 14 | # Ensure output files are removed 15 | FileUtils.rm(output_file) if File.exists?(output_file) 16 | FileUtils.rm(dictionary_file) if File.exists?(dictionary_file) 17 | 18 | c = Csvzip::Compressor.new input_file, output_file, 19 | ["origin", "destination", "train_type", "train_class", "fare"], 20 | dictionary_file, dictionary_key, 21 | separator, quote_char 22 | c.compress 23 | 24 | File.exists?(output_file).should eq true 25 | File.exists?(dictionary_file).should eq true 26 | 27 | dictionary = File.open(dictionary_file) do |file| 28 | JSON.parse(file) 29 | end 30 | 31 | File.open(output_file) do |infile| 32 | reader = CSV.new(infile, header = true, strip = true) 33 | reader.next 34 | row = reader.row.to_h 35 | dictionary[dictionary_key]["origin"][row["origin"]].should eq "MADRID" 36 | dictionary[dictionary_key]["destination"][row["destination"]].should eq "BARCELONA" 37 | end 38 | 39 | # Ensure output files are removed 40 | FileUtils.rm(output_file) 41 | 
FileUtils.rm(dictionary_file) 42 | end 43 | end 44 | end 45 | -------------------------------------------------------------------------------- /spec/fixtures/trains.csv: -------------------------------------------------------------------------------- 1 | insert_date,origin,destination,start_date,end_date,train_type,price,train_class,fare,price_tree,batch,id 2 | 2019-04-11 21:49:46,MADRID,BARCELONA,2019-04-18 05:50:00,2019-04-18 08:55:00,AVE,68.95,Preferente,Promo,,, 3 | 2019-04-11 21:49:46,MADRID,BARCELONA,2019-04-18 06:30:00,2019-04-18 09:20:00,AVE,75.4,Turista,Promo,,, 4 | 2019-04-11 21:49:46,MADRID,BARCELONA,2019-04-18 07:00:00,2019-04-18 09:30:00,AVE,106.75,Turista Plus,Promo,,, 5 | 2019-04-11 21:49:46,MADRID,BARCELONA,2019-04-18 07:30:00,2019-04-18 10:40:00,AVE,90.5,Turista Plus,Promo,,, 6 | 2019-04-11 21:49:46,MADRID,BARCELONA,2019-04-18 08:00:00,2019-04-18 10:30:00,AVE,88.95,Turista,Promo,,, 7 | 2019-04-11 21:49:46,MADRID,BARCELONA,2019-04-18 08:30:00,2019-04-18 11:15:00,AVE,107.7,Turista,Flexible,,, 8 | 2019-04-11 21:49:46,MADRID,BARCELONA,2019-04-18 09:30:00,2019-04-18 12:34:00,AVE,107.7,Turista,Flexible,,, 9 | 2019-04-11 21:49:46,MADRID,BARCELONA,2019-04-18 10:30:00,2019-04-18 13:15:00,AVE,102.15,Turista Plus,Promo,,, 10 | 2019-04-11 21:49:46,MADRID,BARCELONA,2019-04-18 11:30:00,2019-04-18 14:40:00,AVE,102.15,Turista Plus,Promo,,, 11 | 2019-04-11 21:49:46,MADRID,BARCELONA,2019-04-18 12:30:00,2019-04-18 15:30:00,AVE,107.7,Turista,Flexible,,, 12 | 2019-04-11 21:49:46,MADRID,BARCELONA,2019-04-18 13:25:00,2019-04-18 16:24:00,AVE-TGV,107.7,Turista,Flexible,,, 13 | 2019-04-11 21:49:46,MADRID,BARCELONA,2019-04-18 14:00:00,2019-04-18 16:30:00,AVE,100.4,Turista,Promo,,, 14 | 2019-04-11 21:49:46,MADRID,BARCELONA,2019-04-18 14:30:00,2019-04-18 17:21:00,AVE,107.7,Turista,Flexible,,, 15 | 2019-04-11 21:49:46,MADRID,BARCELONA,2019-04-18 15:30:00,2019-04-18 18:40:00,AVE,,Preferente,Promo,,, 16 | 2019-04-11 21:49:46,MADRID,BARCELONA,2019-04-18 16:30:00,2019-04-18 
19:15:00,AVE,102.15,Turista Plus,Promo,,, 17 | 2019-04-11 21:49:46,MADRID,BARCELONA,2019-04-18 17:30:00,2019-04-18 20:40:00,AVE,75.4,Turista,Promo,,, 18 | 2019-04-11 21:49:46,MADRID,BARCELONA,2019-04-18 18:30:00,2019-04-18 21:20:00,AVE,90.5,Turista Plus,Promo,,, 19 | 2019-04-11 21:49:46,MADRID,BARCELONA,2019-04-18 19:00:00,2019-04-18 21:30:00,AVE,115.65,Preferente,Promo,,, 20 | 2019-04-11 21:49:46,MADRID,BARCELONA,2019-04-18 19:30:00,2019-04-18 22:40:00,AVE,85.1,Turista,Promo,,, 21 | 2019-04-11 21:49:46,MADRID,BARCELONA,2019-04-18 20:30:00,2019-04-18 23:40:00,AVE,85.1,Turista,Promo,,, 22 | 2019-04-11 21:49:46,MADRID,BARCELONA,2019-04-18 21:25:00,2019-04-18 23:55:00,AVE,,,,,, 23 | 2019-04-11 21:49:48,MADRID,BARCELONA,2019-05-18 06:20:00,2019-05-18 09:29:00,AVE,66.75,Turista,Promo,,, 24 | 2019-04-11 21:49:48,MADRID,BARCELONA,2019-05-18 07:30:00,2019-05-18 10:40:00,AVE,85.1,Turista,Promo,,, 25 | 2019-04-11 21:49:48,MADRID,BARCELONA,2019-05-18 08:30:00,2019-05-18 11:15:00,AVE,85.1,Turista,Promo,,, 26 | 2019-04-11 21:49:48,MADRID,BARCELONA,2019-05-18 09:30:00,2019-05-18 12:34:00,AVE,85.1,Turista,Promo,,, 27 | 2019-04-11 21:49:48,MADRID,BARCELONA,2019-05-18 10:30:00,2019-05-18 13:15:00,AVE,75.4,Turista,Promo,,, 28 | 2019-04-11 21:49:48,MADRID,BARCELONA,2019-05-18 11:30:00,2019-05-18 14:40:00,AVE,75.4,Turista,Promo,,, 29 | 2019-04-11 21:49:48,MADRID,BARCELONA,2019-05-18 12:30:00,2019-05-18 15:30:00,AVE,75.4,Turista,Promo,,, 30 | 2019-04-11 21:49:48,MADRID,BARCELONA,2019-05-18 13:25:00,2019-05-18 16:24:00,AVE-TGV,75.4,Turista,Promo,,, 31 | 2019-04-11 21:49:48,MADRID,BARCELONA,2019-05-18 15:30:00,2019-05-18 18:40:00,AVE,86.8,Preferente,Promo,,, 32 | 2019-04-11 21:49:48,MADRID,BARCELONA,2019-05-18 16:30:00,2019-05-18 19:15:00,AVE,80.15,Turista Plus,Promo,,, 33 | 2019-04-11 21:49:48,MADRID,BARCELONA,2019-05-18 17:30:00,2019-05-18 20:40:00,AVE,75.4,Turista,Promo,,, 34 | 2019-04-11 21:49:48,MADRID,BARCELONA,2019-05-18 18:30:00,2019-05-18 21:20:00,AVE,75.4,Turista,Promo,,, 35 | 
2019-04-11 21:49:48,MADRID,BARCELONA,2019-05-18 19:30:00,2019-05-18 22:40:00,AVE,49.15,Turista Plus,Promo,,, 36 | 2019-04-11 21:49:48,MADRID,BARCELONA,2019-05-18 20:30:00,2019-05-18 23:40:00,AVE,69.8,Turista Plus,Promo,,, 37 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-05-22 06:10:00,2019-05-22 08:40:00,AVE,85.1,Turista,Promo,,, 38 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-05-22 06:30:00,2019-05-22 09:20:00,AVE,40.95,Turista,Promo,,, 39 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-05-22 07:00:00,2019-05-22 09:30:00,AVE,94.55,Turista Plus,Promo,,, 40 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-05-22 07:15:00,2019-05-22 16:37:00,R. EXPRES,43.25,Turista,Adulto ida,,, 41 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-05-22 07:20:00,2019-05-22 09:50:00,AVE,100.4,Turista,Promo,,, 42 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-05-22 07:30:00,2019-05-22 10:40:00,AVE,85.1,Turista,Promo,,, 43 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-05-22 08:00:00,2019-05-22 10:30:00,AVE,100.4,Turista,Promo,,, 44 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-05-22 08:20:00,2019-05-22 11:05:00,AVE,40.95,Turista,Promo,,, 45 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-05-22 08:30:00,2019-05-22 11:15:00,AVE,75.4,Turista,Promo,,, 46 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-05-22 09:00:00,2019-05-22 11:45:00,AVE,85.1,Turista,Promo,,, 47 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-05-22 09:30:00,2019-05-22 12:34:00,AVE,85.1,Turista,Promo,,, 48 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-05-22 10:30:00,2019-05-22 13:15:00,AVE,85.1,Turista,Promo,,, 49 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-05-22 11:30:00,2019-05-22 14:40:00,AVE,85.1,Turista,Promo,,, 50 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-05-22 12:30:00,2019-05-22 15:30:00,AVE,90.5,Turista Plus,Promo,,, 51 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-05-22 13:25:00,2019-05-22 16:24:00,AVE-TGV,40.95,Turista,Promo,,, 52 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-05-22 14:00:00,2019-05-22 16:30:00,AVE,78.8,Turista,Promo,,, 53 | 2019-04-11 
21:50:04,MADRID,BARCELONA,2019-05-22 14:30:00,2019-05-22 17:21:00,AVE,66.75,Turista,Promo,,, 54 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-05-22 15:00:00,2019-05-22 17:30:00,AVE,88.95,Turista,Promo,,, 55 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-05-22 15:30:00,2019-05-22 18:40:00,AVE,85.1,Turista,Promo,,, 56 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-05-22 16:00:00,2019-05-22 18:30:00,AVE,68.65,Turista,Promo,,, 57 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-05-22 16:30:00,2019-05-22 19:15:00,AVE,90.5,Turista Plus,Promo,,, 58 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-05-22 17:00:00,2019-05-22 19:30:00,AVE,100.4,Turista,Promo,,, 59 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-05-22 17:30:00,2019-05-22 20:40:00,AVE,85.1,Turista,Promo,,, 60 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-05-22 18:00:00,2019-05-22 20:30:00,AVE,100.4,Turista,Promo,,, 61 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-05-22 18:30:00,2019-05-22 21:20:00,AVE,102.15,Turista Plus,Promo,,, 62 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-05-22 19:00:00,2019-05-22 21:30:00,AVE,82.35,Turista Plus,Promo,,, 63 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-05-22 19:30:00,2019-05-22 22:40:00,AVE,58.15,Turista,Promo,,, 64 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-05-22 20:00:00,2019-05-22 22:30:00,AVE,78.8,Turista,Promo,,, 65 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-05-22 20:30:00,2019-05-22 23:40:00,AVE,49.55,Turista,Promo,,, 66 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-05-22 21:25:00,2019-05-22 23:55:00,AVE,75.4,Turista,Promo,,, 67 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-04-22 05:50:00,2019-04-22 08:55:00,AVE,66.75,Turista,Promo,,, 68 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-04-22 06:30:00,2019-04-22 09:20:00,AVE,75.4,Turista,Promo,,, 69 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-04-22 07:00:00,2019-04-22 09:30:00,AVE,106.75,Turista Plus,Promo,,, 70 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-04-22 07:15:00,2019-04-22 16:37:00,R. 
EXPRES,43.25,Turista,Adulto ida,,, 71 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-04-22 07:30:00,2019-04-22 10:40:00,AVE,85.1,Turista,Promo,,, 72 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-04-22 08:00:00,2019-04-22 10:30:00,AVE,100.4,Turista,Promo,,, 73 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-04-22 08:30:00,2019-04-22 11:15:00,AVE,,Preferente,Promo,,, 74 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-04-22 09:30:00,2019-04-22 12:34:00,AVE,102.15,Turista Plus,Promo,,, 75 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-04-22 10:30:00,2019-04-22 13:15:00,AVE,107.7,Turista,Flexible,,, 76 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-04-22 11:30:00,2019-04-22 14:40:00,AVE,181.5,Preferente,Flexible,,, 77 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-04-22 12:30:00,2019-04-22 15:30:00,AVE,181.5,Preferente,Flexible,,, 78 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-04-22 13:25:00,2019-04-22 16:24:00,AVE-TGV,181.5,Preferente,Flexible,,, 79 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-04-22 14:00:00,2019-04-22 16:30:00,AVE,127.1,Turista,Flexible,,, 80 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-04-22 14:30:00,2019-04-22 17:21:00,AVE,107.7,Turista,Flexible,,, 81 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-04-22 15:00:00,2019-04-22 17:30:00,AVE,132.8,Preferente,Promo,,, 82 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-04-22 15:30:00,2019-04-22 18:40:00,AVE,107.7,Turista,Flexible,,, 83 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-04-22 16:00:00,2019-04-22 18:30:00,AVE,127.1,Turista,Flexible,,, 84 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-04-22 16:30:00,2019-04-22 19:15:00,AVE,181.5,Preferente,Flexible,,, 85 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-04-22 17:00:00,2019-04-22 19:30:00,AVE,127.1,Turista,Flexible,,, 86 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-04-22 17:30:00,2019-04-22 20:40:00,AVE,107.7,Turista,Flexible,,, 87 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-04-22 18:00:00,2019-04-22 20:30:00,AVE,214.2,Preferente,Flexible,,, 88 | 2019-04-11 
21:50:04,MADRID,BARCELONA,2019-04-22 18:30:00,2019-04-22 21:20:00,AVE,107.7,Turista,Flexible,,, 89 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-04-22 19:00:00,2019-04-22 21:30:00,AVE,127.1,Turista,Flexible,,, 90 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-04-22 19:30:00,2019-04-22 22:40:00,AVE,181.5,Preferente,Flexible,,, 91 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-04-22 20:00:00,2019-04-22 22:30:00,AVE,127.1,Turista,Flexible,,, 92 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-04-22 20:30:00,2019-04-22 23:40:00,AVE,181.5,Preferente,Flexible,,, 93 | 2019-04-11 21:50:04,MADRID,BARCELONA,2019-04-22 21:25:00,2019-04-22 23:55:00,AVE,107.7,Turista,Flexible,,, 94 | 2019-04-11 21:50:20,MADRID,BARCELONA,2019-04-27 06:20:00,2019-04-27 09:29:00,AVE,86.8,Preferente,Promo,,, 95 | 2019-04-11 21:50:20,MADRID,BARCELONA,2019-04-27 07:30:00,2019-04-27 10:40:00,AVE,75.4,Turista,Promo,,, 96 | 2019-04-11 21:50:20,MADRID,BARCELONA,2019-04-27 08:30:00,2019-04-27 11:15:00,AVE,85.1,Turista,Promo,,, 97 | 2019-04-11 21:50:20,MADRID,BARCELONA,2019-04-27 09:30:00,2019-04-27 12:34:00,AVE,102.15,Turista Plus,Promo,,, 98 | 2019-04-11 21:50:20,MADRID,BARCELONA,2019-04-27 10:30:00,2019-04-27 13:15:00,AVE,58.15,Turista,Promo,,, 99 | 2019-04-11 21:50:20,MADRID,BARCELONA,2019-04-27 11:30:00,2019-04-27 14:40:00,AVE,102.15,Turista Plus,Promo,,, 100 | 2019-04-11 21:50:20,MADRID,BARCELONA,2019-04-27 12:30:00,2019-04-27 15:30:00,AVE,102.15,Turista Plus,Promo,,, 101 | -------------------------------------------------------------------------------- /spec/spec_helper.cr: -------------------------------------------------------------------------------- 1 | require "spec" 2 | require "../src/csvzip" 3 | -------------------------------------------------------------------------------- /src/cli.cr: -------------------------------------------------------------------------------- 1 | require "option_parser" 2 | require "./csvzip" 3 | 4 | input_file = "" 5 | output_file = "" 6 | dictionary_file = "" 7 | dictionary_key 
= "" 8 | columns = "" 9 | separator = "" 10 | quote_char = "" 11 | 12 | OptionParser.parse do |parser| 13 | parser.banner = "csvzip is a standalone CLI tool to reduce CSVs size by converting categorical columns in a list of unique integers\n" \ 14 | "The execution produces two files: a CSV with the compressed values and a JSON dictionary with the mappings.\n\n" \ 15 | "‼️ Current limitations in input CSV file:‼️ \n" \ 16 | " - the CSV has to have a headers row\n" \ 17 | "More information, usage examples and candies at:\n" \ 18 | "https://github.com/PopulateTools/csvzip\n\n" \ 19 | "Usage: csvzip" 20 | parser.on("-i INPUT", "--input=INPUT", "Input CSV file (required)") { |value| input_file = value } 21 | parser.on("-o OUTPUT", "--output=OUTPUT", "Output CSV file (required)") { |value| output_file = value } 22 | parser.on("-c COLUMNS", "--columns=COLUMNS", "Columns to compress, in a comma separated format. Example: \"col1, col2, col5\" (required)") { |value| columns = value } 23 | parser.on("-d DICTIONARY", "--dictionary=DICTIONARY", "Output dictionary file, in JSON format. Default: dictionary.json") { |value| dictionary_file = value } 24 | parser.on("-k KEY", "--dictionary-key=KEY", "First level key in dictionary. Default: csvzip") { |value| dictionary_key = value } 25 | parser.on("-s SEPARATOR", "--separator=SEPARATOR", "CSV column separator. Default: comma") { |value| separator = value } 26 | parser.on("-q QUOTE_CHAR", "--quote-char=QUOTE_CHAR", "CSV quote character. Default: double quotes") { |value| quote_char = value } 27 | parser.on("--version", "Print version") { puts "csvzip version #{Csvzip::VERSION}"; exit(0) } 28 | parser.on("-h", "--help", "Show this help") { puts parser; exit(0) } 29 | 30 | parser.unknown_args do |args| 31 | # filter out unknown options 32 | unk_args = args.find { |s| !s.starts_with?("-") } 33 | if !unk_args.nil? 34 | STDERR.puts "ERROR: this program does not need arguments!" 
35 | STDERR.puts parser 36 | exit(1) 37 | end 38 | end 39 | parser.invalid_option do |flag| 40 | STDERR.puts "ERROR: #{flag} is not a valid option." 41 | STDERR.puts parser 42 | exit(1) 43 | end 44 | parser.missing_option do |flag| 45 | STDERR.puts "ERROR: incomplete or missing option '#{flag}'." 46 | STDERR.puts parser 47 | exit(1) 48 | end 49 | end 50 | 51 | # Arguments checks 52 | # Input file 53 | if input_file == "" 54 | STDERR.puts "ERROR: input file is required" 55 | exit(1) 56 | end 57 | 58 | # Output file 59 | if output_file == "" 60 | STDERR.puts "ERROR: output file is required" 61 | exit(1) 62 | end 63 | 64 | # Columns 65 | if columns == "" 66 | STDERR.puts "ERROR: columns list is required" 67 | exit(1) 68 | end 69 | 70 | # Dictionary 71 | if dictionary_file == "" 72 | dictionary_file = "dictionary.json" 73 | end 74 | 75 | # Dictionary key 76 | if dictionary_key == "" 77 | dictionary_key = "csvzip" 78 | end 79 | 80 | if separator == "" 81 | separator = "," 82 | end 83 | 84 | if quote_char == "" 85 | quote_char = "\"" 86 | end 87 | 88 | columns_list = columns.split(",").map { |v| v.strip } 89 | 90 | begin 91 | c = Csvzip::Compressor.new(input_file, output_file, columns_list, dictionary_file, dictionary_key, separator.chars.first, quote_char.chars.first) 92 | c.compress 93 | exit(0) 94 | rescue ex 95 | STDERR.puts "ERROR: #{ex.message}" 96 | exit(1) 97 | end 98 | -------------------------------------------------------------------------------- /src/csvzip.cr: -------------------------------------------------------------------------------- 1 | require "csv" 2 | require "json" 3 | 4 | module Csvzip 5 | VERSION = "0.2.0" 6 | 7 | class Compressor 8 | def initialize( 9 | @input_file : String, 10 | @output_file : String, 11 | @columns : Array(String), 12 | @dictionary_file : String, 13 | @dictionary_key : String, 14 | @separator : Char, 15 | @quote_char : Char 16 | ) 17 | @dictionary = {} of String => Hash(String, Hash(String, String)) 18 | 
@dictionary[@dictionary_key] = {} of String => Hash(String, String) 19 | end 20 | 21 | # Compresses the configured columns of the input CSV. 22 | # 23 | # Reads `@input_file` (it must have a headers row), replaces each value of 24 | # the `@columns` columns with a per-column integer id (stored as a string, 25 | # assigned in order of first appearance) and writes the rows to 26 | # `@output_file` using the default `CSV.build` separator and quoting. 27 | # The id => original value mappings are then written as JSON to 28 | # `@dictionary_file`, nested under `@dictionary_key`. 29 | # 30 | # Raises `KeyError` if any of `@columns` is not a header of the input CSV. 31 | def compress 32 | # value => id lookup per column. It is kept local so that calling 33 | # `compress` again starts from a clean mapping instead of reusing the 34 | # already inverted dictionary. Every column is initialized up front: a CSV 35 | # without data rows must produce an empty dictionary instead of failing. 36 | encodings = {} of String => Hash(String, String) 37 | @columns.each { |column| encodings[column] = {} of String => String } 38 | 39 | File.open(@input_file) do |infile| 40 | reader = CSV.new(infile, headers: true, strip: true, separator: @separator, quote_char: @quote_char) 41 | 42 | # Fail before touching the output file if a requested column is unknown 43 | missing = @columns - reader.headers 44 | unless missing.empty? 45 | raise KeyError.new("Columns not found in CSV headers: #{missing.join(", ")}") 46 | end 47 | 48 | File.open(@output_file, "w") do |outfile| 49 | CSV.build(outfile) do |writer| 50 | # Add output headers 51 | writer.row reader.headers 52 | 53 | while reader.next 54 | row = reader.row.to_h 55 | @columns.each do |column| 56 | encoding = encodings[column] 57 | key = row[column].strip 58 | 59 | # Set up an encoded value for the key if it doesn't exist 60 | encoding[key] ||= encoding.size.to_s 61 | 62 | # Update row with the encoded value 63 | row[column] = encoding[key] 64 | end 65 | writer.row row.values 66 | end 67 | end 68 | end 69 | end 70 | 71 | # The dictionary file maps encoded value => original value 72 | @columns.each do |column| 73 | @dictionary[@dictionary_key][column] = encodings[column].invert 74 | end 75 | 76 | File.open(@dictionary_file, "w") do |file| 77 | @dictionary.to_json(file) 78 | end 79 | end 80 | end 81 | end 82 | --------------------------------------------------------------------------------