├── .gitignore ├── README.md ├── converted-files └── .gitkeep ├── csv-split.rb ├── remove-sample.csv └── sample.csv /.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | #*/*.csv 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CSV Split 2 | 3 | Developed and tested on Ruby 2.4.0 4 | 5 | A Ruby script that splits a large csv file into smaller files and stores the smaller files into the ```split-files``` directory. 6 | 7 | This script has eight parameters: 8 | 9 | ``` 10 | Options: 11 | -f, --file-path= Path to csv file to be split 12 | -n, --new-file-name= Name of the new files. This will be appended with an incremented number (default: split) 13 | -i, --include-headers, --no-include-headers Include headers in new files (default: true) 14 | -l, --line-count= Number of lines per file (default: 1) 15 | -d, --delimiter= Character used for Col. Sep. (Default: ,) 16 | -r, --remove-columns Specify column names to be removed during processing in remove.csv 17 | -c, --include-remainders Include remainder rows in the split files (default: false). Example: if a csv file has 1030 rows and is split into files of 100 rows, the remaining 30 rows will be stored in a new file 18 | -h, --help Show this message 19 | ``` 20 | 21 | ## Required Gems 22 | 23 | - [optimist](https://github.com/ManageIQ/optimist) 24 | 25 | ## Installation 26 | 27 | 1. Clone repository 28 | 2. 
require 'optimist'
require 'fileutils' # Added for file management
require 'csv'

# csv-split.rb
#
# Splits a large CSV file into numbered smaller files and, optionally, writes a
# second copy of every split file with selected columns removed.
#
# Install the required gem:
#
#   gem install optimist
#
# Running the script:
#
#   ruby csv-split.rb --file-path path/to/csv/file/.csv --line-count 2500 --include-headers
#
# All output is written next to the input file:
#   <input dir>/split-files/<name>-<n>.csv           the split files
#   <input dir>/split-files-rmv-cols/<name>-<n>.csv  split files minus the columns
#                                                    listed in <input dir>/remove.csv
#                                                    (only with --remove-columns)
# Both directories are deleted and recreated on every run.
#
# I want to deeply thank https://github.com/imartingraham for providing this original work.
#
# I had to do a legal production and used the base project, adding features as needed.
#
# Key features are a custom delimiter, splitting the files, then removing columns
# as needed (in my case for redaction).
#
# Regards wb 07/27/2017

# Directory (created beside the input file) that receives the split files.
SPLIT_DIR_NAME = 'split-files'
# Directory (created beside the input file) that receives the column-stripped copies.
STRIPPED_DIR_NAME = 'split-files-rmv-cols'
# File (expected beside the input file) listing one column name per line.
REMOVE_LIST_NAME = 'remove.csv'

# Declares and parses the command-line options.
#
# @return [Hash] option values keyed by symbol; Optimist also adds a
#   `<name>_given` key for every option that was passed explicitly
def parse_options
  Optimist.options do
    opt :file_path, "Path to csv file to be split", type: :string, default: nil
    # The 'split' default is only shown in --help; when the option is not given
    # the files are named after the input file (see #output_base_name).
    opt :new_file_name, "Name of the new files. This will be appended with an incremented number", type: :string, default: 'split'
    opt :include_headers, "Include headers in new files", default: true, type: :boolean
    opt :line_count, "Number of lines per file", default: 1, type: :integer
    opt :delimiter, "Charcter used for Col. Sep.", default: ',', type: :string
    opt :remove_columns, "Specify column names to be removed during processing in remove.csv", default: false, type: :boolean
    opt :include_remainders, "Include remainder rows in the split files (default: false). Example: if there are 1030 rows in a csv file and will be split in 100 rows, the remaining 30 rows will be stored in a new file", default: false, type: :boolean
  end
end

# Deletes +dir+ (if present) together with its contents and creates it again,
# so results of a previous run never leak into the current one.
#
# @param dir [String] directory to reset
def recreate_directory(dir)
  FileUtils.rm_rf(dir)
  FileUtils.mkdir_p(dir)
end

# Picks the base name used for the generated files: the explicitly given
# --new-file-name, otherwise the input file's name. Any directory part and
# everything from the first '.' onwards is dropped.
#
# @param opts [Hash] parsed options
# @return [String, nil] base name without path or extension
def output_base_name(opts)
  chosen = opts[:new_file_name_given] ? opts[:new_file_name] : opts[:file_path]
  File.basename(chosen).split('.').first
end

# Streams +source+ once and writes every group of +line_count+ data rows to
# "<out_dir>/<base_name>-<n>.csv" (n starts at 0). A final, shorter group is
# written only when --include-remainders is set.
#
# The split files are always written comma-separated with every field quoted,
# regardless of the delimiter used to read the input.
#
# @param source [String] path of the CSV file to split
# @param out_dir [String] existing directory that receives the split files
# @param base_name [String] file name prefix
# @param opts [Hash] parsed options (:delimiter, :line_count, :include_headers,
#   :include_remainders)
# @return [Integer] number of files written
def split_csv(source, out_dir, base_name, opts)
  reader_options = { headers: true, encoding: "UTF-8", quote_char: '"', col_sep: opts[:delimiter] }
  files_written = 0

  CSV.foreach(source, **reader_options).each_slice(opts[:line_count]) do |rows|
    # Only the last slice can be short; it holds the remainder rows.
    next if rows.size < opts[:line_count] && !opts[:include_remainders]

    target = File.join(out_dir, "#{base_name}-#{files_written}.csv")
    CSV.open(target, "wb", force_quotes: true) do |csv|
      csv << rows.first.headers if opts[:include_headers]
      rows.each { |row| csv << row }
    end
    files_written += 1
  end

  files_written
end

# Reads the column names to remove: the first field of every line of
# +list_path+. The file has no header line (see remove-sample.csv).
#
# @param list_path [String] path to remove.csv
# @return [Array<String>] column names, blank lines skipped
def read_column_names(list_path)
  CSV.foreach(list_path, encoding: "UTF-8", quote_char: '"', col_sep: ",").map(&:first).compact
end

# Copies every split file from +split_dir+ into +stripped_dir+ with the named
# columns removed. The original split files are left untouched.
#
# Column names are matched against the first line of each split file, so
# removal only takes effect when the split files were written with headers.
#
# @param split_dir [String] directory holding the split files
# @param stripped_dir [String] existing directory that receives the copies
# @param column_names [Array<String>] headers of the columns to drop
def strip_columns(split_dir, stripped_dir, column_names)
  # Split files are written comma-separated by #split_csv, so they are read
  # back the same way rather than with the input delimiter.
  table_options = { headers: true, return_headers: true, encoding: "UTF-8", quote_char: '"', col_sep: "," }

  Dir.glob(File.join(split_dir, "*.csv")) do |split_file|
    table = CSV.read(split_file, **table_options)
    # Skip names the file does not contain instead of relying on how
    # CSV::Table#delete treats unknown headers.
    (column_names & table.headers).each { |name| table.delete(name) }

    CSV.open(File.join(stripped_dir, File.basename(split_file)), 'w') do |csv|
      table.each { |row| csv << row }
    end
  end
end

# Entry point: validates the options, then splits and (optionally) strips.
def run
  opts = parse_options

  # Remind users to provide arguments on the command line.
  if opts[:file_path].nil?
    abort "Must provide Path & Filename for processing {add} --{file-path path/to/csv/file}/{filename}.csv"
  end

  source = File.expand_path(opts[:file_path])
  Optimist.die :file_path, "must point to an existing file" unless File.file?(source)
  # A chunk size below 1 cannot be split on.
  Optimist.die :line_count, "must be at least 1" if opts[:line_count] < 1

  source_dir = File.dirname(opts[:file_path])
  remove_list = File.join(source_dir, REMOVE_LIST_NAME)

  # Stop before touching any directory if column removal cannot work.
  if opts[:remove_columns] && !File.exist?(remove_list)
    abort "remove.csv is missing or mis-formatted. Please check remove-sample.csv for format"
  end

  split_dir = File.join(source_dir, SPLIT_DIR_NAME)
  recreate_directory(split_dir)
  split_csv(source, split_dir, output_base_name(opts), opts)

  return unless opts[:remove_columns]

  stripped_dir = File.join(source_dir, STRIPPED_DIR_NAME)
  recreate_directory(stripped_dir)
  strip_columns(split_dir, stripped_dir, read_column_names(remove_list))
end

run if __FILE__ == $PROGRAM_NAME
-------------------------------------------------------------------------------- /remove-sample.csv: -------------------------------------------------------------------------------- 1 | Title 2 | Birthdate -------------------------------------------------------------------------------- /sample.csv: -------------------------------------------------------------------------------- 1 | FirstName,LastName,Title,ReportsTo.Email,Birthdate,Description 2 | Tom,Jones,Senior Director,buyer@salesforcesample.com,1940-06-07Z,"Self-described as ""the top"" branding guru on the West Coast" 3 | Ian,Dury,Chief Imagineer,cto@salesforcesample.com,,"World-renowned expert in fuzzy logic design. Influential in technology purchases." --------------------------------------------------------------------------------