├── .gitignore ├── tests ├── __init__.py ├── help_tests.py └── processor_tests.py ├── MANIFEST.in ├── requirements.txt ├── fixtures ├── sample.tsv ├── sample.csv ├── sample.psv └── au.csv ├── .travis.yml ├── makefile ├── setup.py ├── LICENSE ├── csvfilter └── __init__.py ├── README.rst └── bin └── csvfilter /.gitignore: -------------------------------------------------------------------------------- 1 | dist/ -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.rst 2 | include LICENSE 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | envoy==0.0.2 2 | nose==1.3.4 3 | -------------------------------------------------------------------------------- /fixtures/sample.tsv: -------------------------------------------------------------------------------- 1 | This is a test 2 | And so is this 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - 2.6 4 | - 2.7 5 | install: 6 | script: 7 | - pip install -r requirements.txt 8 | - ./setup.py develop 9 | - nosetests -------------------------------------------------------------------------------- /fixtures/sample.csv: -------------------------------------------------------------------------------- 1 | "1","Terry, Smith","Flat 1, Mansion House, London N12 4RT" 2 | "2","Doe, John","73 Woodside Road, Manchester M3 2ER" 3 | "3","Partridge, Alan","Lynton Travel Tavern, Norwich" -------------------------------------------------------------------------------- /fixtures/sample.psv: -------------------------------------------------------------------------------- 1 | "1"|"Terry, Smith"|"Flat 1, Mansion House, London N12 4RT" 2 | "2"|"Doe, John"|"73 Woodside Road, Manchester M3 2ER" 3 | "3"|"Partridge, Alan"|"Lynton Travel Tavern, Norwich" -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | install: clean 2 | pip install -r requirements.txt 3 | python setup.py develop 4 | 5 | clean: 6 | find . -name "*.pyc" -delete 7 | -rm -rf *.egg-info dist 8 | 9 | release: 10 | python setup.py sdist upload 11 | git push --tags 12 | -------------------------------------------------------------------------------- /tests/help_tests.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import envoy 3 | 4 | from csvfilter import VERSION 5 | 6 | 7 | class HelpOutputTests(unittest.TestCase): 8 | 9 | def setUp(self): 10 | self.r = envoy.run('csvfilter -h') 11 | 12 | def test_help_output(self): 13 | self.assertEqual(0, self.r.status_code) 14 | 15 | def test_version_is_present(self): 16 | self.assertTrue(VERSION in self.r.std_out) 17 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from setuptools import setup, find_packages 3 | from csvfilter import VERSION 4 | 5 | 6 | setup(name='csvfilter', 7 | version=VERSION, 8 | url='https://github.com/codeinthehole/csvfilter', 9 | author="David Winterbottom", 10 | author_email="david.winterbottom@gmail.com", 11 | description="A command-line utility and Python API for manipulating CSV data, eg plucking columns and reordering them. It's a bit like the unix utility 'cut'", 12 | license='MIT', 13 | long_description=open('README.rst').read(), 14 | packages=find_packages(exclude=["tests*"]), 15 | scripts=['bin/csvfilter']) 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (C) 2014 csvfilter David Winterbottom 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 7 | of the Software, and to permit persons to whom the Software is furnished to do 8 | so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /csvfilter/__init__.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | 4 | # Avoid issue with fields larger than max size 5 | csv.field_size_limit(sys.maxsize) 6 | 7 | VERSION = '0.3.2' 8 | 9 | 10 | class Processor(object): 11 | 12 | def __init__(self, fields=None, invert=False, delimiter=',', 13 | quotechar='"', skip=0): 14 | self.fields = fields 15 | self.invert = invert 16 | self.delimiter = delimiter 17 | self.quotechar = quotechar 18 | self.skip = skip 19 | self.validators = [] 20 | 21 | def add_validator(self, f): 22 | self.validators.append(f) 23 | 24 | def process(self, file_handle): 25 | reader = csv.reader(file_handle, delimiter=self.delimiter, 26 | quotechar=self.quotechar) 27 | for row in reader: 28 | output = None 29 | if reader.line_num <= self.skip: 30 | continue 31 | if self.fields: 32 | if not self.invert: 33 | output = [row[i] for i in self.fields if len(row) > i] 34 | else: 35 | output = [e for i,e in enumerate(row) if i not in self.fields] 36 | else: 37 | output = row 38 | if not self.is_valid(output): 39 | continue 40 | if output: 41 | yield output 42 | 43 | def is_valid(self, row): 44 | for validator in self.validators: 45 | if not validator(row): 46 | return False 47 | return True 48 | -------------------------------------------------------------------------------- /fixtures/au.csv: -------------------------------------------------------------------------------- 1 | "Pcode","Locality","State","Comments","DeliveryOffice","PresortIndicator","ParcelZone","BSPnumber","BSPname","Category","Lat","Long" 2 | "02,00","AUSTRALIAN NATIONAL UNIVERSITY","ACT","PO Boxes","AUSTRALIAN NATIONAL UNI LPO x","150","N2 ","019","CANBERRA","Post Office Boxes ","-35.277272","149.117136" 3 | "0221","BARTON","ACT","LVR Special Mailing",,"150","N2 ","019","CANBERRA","LVR ","-35.201372","149.095065" 4 | "0800","DARWIN","NT",,"DARWIN DELIVERY CENTRE ","085","NT1","001","DARWIN","Delivery Area ","-12.801028","130.955789" 5 | "0801","DARWIN","NT","GPO Boxes","DARWIN GPO DELIVERY ANNEXE ","085","NT1","001","DARWIN","Post Office Boxes ","-12.801028","130.955789" 6 | "0804","PARAP","NT","PO Boxes","PARAP LPO ","085","NT1","001","DARWIN","Post Office Boxes ","-12.432181","130.84331" 7 | "0810","ALAWA","NT",,"DARWIN DELIVERY CENTRE ","085","NT1","001","DARWIN","Delivery Area ","-12.378451","130.877014" 8 | "0810","BRINKIN","NT",,"DARWIN DELIVERY CENTRE ","085","NT1","001","DARWIN","Delivery Area ","-12.378451","130.877014" 9 | "0810","CASUARINA","NT",,"DARWIN DELIVERY CENTRE ","085","NT1","001","DARWIN","Delivery Area ","-12.378451","130.877014" 10 | "0810","COCONUT GROVE","NT",,"DARWIN DELIVERY CENTRE ","085","NT1","001","DARWIN","Delivery Area ","-12.378451","130.877014" 11 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ================================ 2 | csvfilter - Simple CSV filtering 3 | ================================ 4 | 5 | A simple wrapper around Python's CSV module to provide a command-line tool for 6 | filtering columns from a CSV file. This is useful as standard tools like awk 7 | can't easily handle the quoting and escaping used in CSV files. 8 | 9 | Basically, it's a bit like ``cut`` but for CSVs. 10 | 11 | Install 12 | ------- 13 | 14 | From PyPi:: 15 | 16 | pip install csvfilter 17 | 18 | Use 19 | --- 20 | 21 | Pluck fields 1, 3 and 5 from ``in.csv``:: 22 | 23 | csvfilter -f 1,3,5 in.csv > out.csv 24 | 25 | Pluck all fields apart from column 2 from STDIN:: 26 | 27 | cat in.csv | csvfilter -f 2 -i > out.csv 28 | 29 | Convert pipe-separated file to comma-separated (by default, output is 30 | comma-separated):: 31 | 32 | csvfilter -d"|" in.psv > out.csv 33 | 34 | Skip that pesky header row:: 35 | 36 | cat in.csv | csvfilter --skip=1 37 | 38 | As you can see, CSV data can be supplied through STDIN or by running ``csvfilter`` directly on a 39 | file. 40 | 41 | Help is in the usual place:: 42 | 43 | $ csvfilter --help 44 | 45 | Usage: csvfilter [options] [inputfile] 46 | 47 | Source: https://github.com/codeinthehole/csvfilter/ 48 | 49 | Options: 50 | -h, --help show this help message and exit 51 | -f FIELDS, --fields=FIELDS 52 | Specify which fields to pluck 53 | -s SKIP, --skip=SKIP Number of rows to skip 54 | -d DELIMITER, --delimiter=DELIMITER 55 | Delimiter of incoming CSV data 56 | -q QUOTECHAR, --quotechar=QUOTECHAR 57 | Quotechar of incoming CSV data 58 | 59 | -i, --inverse Invert the filter - ie drop the selected fields 60 | --out-delimiter=OUT_DELIMITER 61 | Delimiter to use for output 62 | --out-quotechar=OUT_QUOTECHAR 63 | Quote character to use for output 64 | 65 | Report issues 66 | ------------- 67 | 68 | Use the `Github issue tracker`_ or, better still... 69 | 70 | .. _`Github issue tracker`: https://github.com/codeinthehole/csvfilter/issues 71 | 72 | Contribute 73 | ---------- 74 | 75 | After cloning, install the testing requirements:: 76 | 77 | make 78 | 79 | Run the tests with:: 80 | 81 | nosetests 82 | 83 | and, if it helps, use the fixture files to test your amendments:: 84 | 85 | cat fixtures/au.csv | csvfilter -f 3,1,2 -s 1 86 | csvfilter fixutres/au.csv -f 1,2 -i 87 | 88 | Have fun. 89 | -------------------------------------------------------------------------------- /bin/csvfilter: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | from optparse import OptionParser 4 | import csv 5 | 6 | from csvfilter import Processor, VERSION 7 | 8 | 9 | def main(options, args): 10 | infile, outfile = get_files(options, args) 11 | processor = get_processor(options, args) 12 | writer = get_writer(outfile, options) 13 | pump(processor, infile, writer) 14 | 15 | 16 | def get_files(options, args): 17 | infile = open(args[0], 'r') if len(args) > 0 else sys.stdin 18 | return infile, sys.stdout 19 | 20 | 21 | def get_processor(options, args): 22 | fields = map(int, options.fields.split(',')) if options.fields else None 23 | return Processor(fields=fields, 24 | skip=options.skip, 25 | delimiter=options.delimiter, 26 | quotechar=options.quotechar, 27 | invert=options.inverse) 28 | 29 | 30 | def get_writer(outfile, options): 31 | return csv.writer(outfile, delimiter=options.out_delimiter, 32 | quotechar=options.out_quotechar) 33 | 34 | 35 | def pump(processor, infile, writer): 36 | for output in processor.process(infile): 37 | writer.writerow(output) 38 | 39 | 40 | def process_delimiter(option, opt, value, parser): 41 | if value == '\\t': 42 | value = '\t' 43 | parser.values.delimiter = value 44 | 45 | 46 | if __name__ == '__main__': 47 | usage = "Usage: %prog [options] [inputfile]" +\ 48 | "\n\nVersion: %s\nSource: %s" % (VERSION, 'https://github.com/codeinthehole/csvfilter/') 49 | parser = OptionParser(usage=usage) 50 | parser.add_option('-f', '--fields', dest='fields', default=None, 51 | help="Specify which fields to pluck") 52 | parser.add_option('-s', '--skip', dest='skip', default=0, 53 | type='int', help="Number of rows to skip") 54 | parser.add_option('-d', '--delimiter', type='string', default=',', 55 | action='callback', callback=process_delimiter, 56 | help="Delimiter of incoming CSV data") 57 | parser.add_option('-q', '--quotechar', dest='quotechar', default='"', 58 | help="Quotechar of incoming CSV data") 59 | parser.add_option('-i', '--inverse', dest='inverse', default=False, 60 | action='store_true', help="Invert the filter - ie drop the selected fields") 61 | parser.add_option('--out-delimiter', dest='out_delimiter', default=',', 62 | help="Delimiter to use for output") 63 | parser.add_option('--out-quotechar', dest='out_quotechar', default='"', 64 | help="Quote character to use for output") 65 | (options, args) = parser.parse_args() 66 | main(options, args) 67 | -------------------------------------------------------------------------------- /tests/processor_tests.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from csvfilter import Processor 4 | 5 | SAMPLE_CSV = ['a,b,c', 'd,e,f', 'g,h,i'] 6 | SAMPLE_PSV = ['a|b|c', 'd|e|f', 'g|h|i'] 7 | SAMPLE_TSV = ['a b c', 'd e f', 'g h i'] 8 | 9 | SAMPLE_QUOTED_CSV = [ 10 | '"Pcode","Locality","State","Comments","DeliveryOffice","PresortIndicator","ParcelZone","BSPnumber","BSPname","Category","Lat","Long"', 11 | '"0200","AUSTRALIAN NATIONAL UNIVERSITY","ACT","PO Boxes","AUSTRALIAN NATIONAL UNI LPO x","150","N2 ","019","CANBERRA","Post Office Boxes ","-35.277272","149.117136"', 12 | '"0221","BARTON","ACT","LVR Special Mailing",,"150","N2 ","019","CANBERRA","LVR ","-35.201372","149.095065"' 13 | ] 14 | 15 | SAMPLE_CUSTOM_QUOTED_CSV = [ 16 | '|Pcode|,|Locality|,|State|,|Comments|,|DeliveryOffice|,|PresortIndicator|,|ParcelZone|,|BSPnumber|,|BSPname|,|Category|,|Lat|,|Long|', 17 | '|0200|,|AUSTRALIAN NATIONAL UNIVERSITY|,|ACT|,|PO Boxes|,|AUSTRALIAN NATIONAL UNI LPO x|,|150|,|N2 |,|019|,|CANBERRA|,|Post Office Boxes |,|-35.277272|,|149.117136|', 18 | '|0221|,|BARTON|,|ACT|,|LVR Special Mailing|,,|150|,|N2 |,|019|,|CANBERRA|,|LVR |,|-35.201372|,|149.095065|' 19 | ] 20 | 21 | 22 | class ProcessorTests(unittest.TestCase): 23 | 24 | def test_no_config_does_no_processing(self): 25 | p = Processor() 26 | output = [row for row in p.process(SAMPLE_CSV)] 27 | self.assertEqual([['a', 'b', 'c'], ['d', 'e', 'f'], ['g', 'h', 'i']], output) 28 | 29 | def test_single_col_plucking(self): 30 | p = Processor(fields=[0]) 31 | output = [row for row in p.process(SAMPLE_CSV)] 32 | self.assertEqual([['a'], ['d'], ['g']], output) 33 | 34 | def test_validator(self): 35 | p = Processor() 36 | p.add_validator(lambda row: row[0] == 'a') 37 | output = [row for row in p.process(SAMPLE_CSV)] 38 | self.assertEqual([['a', 'b', 'c']], output) 39 | 40 | def test_multiple_col_plucking(self): 41 | p = Processor(fields=[0, 2]) 42 | output = [row for row in p.process(SAMPLE_CSV)] 43 | self.assertEqual([['a', 'c'], ['d', 'f'], ['g', 'i']], output) 44 | 45 | def test_multiple_col_plucking_with_reordering(self): 46 | p = Processor(fields=[2, 1]) 47 | output = [row for row in p.process(SAMPLE_CSV)] 48 | self.assertEqual([['c', 'b'], ['f', 'e'], ['i', 'h']], output) 49 | 50 | def test_single_col_dropping(self): 51 | p = Processor(fields=[1], invert=True) 52 | output = [row for row in p.process(SAMPLE_CSV)] 53 | self.assertEqual([['a', 'c'], ['d', 'f'], ['g', 'i']], output) 54 | 55 | def test_multiple_col_dropping(self): 56 | p = Processor(fields=[0,2], invert=True) 57 | output = [row for row in p.process(SAMPLE_CSV)] 58 | self.assertEqual([['b'], ['e'], ['h']], output) 59 | 60 | def test_single_col_plucking_with_skip(self): 61 | p = Processor(fields=[0], skip=1) 62 | output = [row for row in p.process(SAMPLE_CSV)] 63 | self.assertEqual([['d'], ['g']], output) 64 | 65 | def test_pluck_with_pipes(self): 66 | p = Processor(fields=[0], delimiter='|') 67 | output = [row for row in p.process(SAMPLE_PSV)] 68 | self.assertEqual([['a'], ['d'], ['g']], output) 69 | 70 | def test_quoted_pluck(self): 71 | p = Processor(fields=[0, 10, 11], skip=1) 72 | output = [row for row in p.process(SAMPLE_QUOTED_CSV)] 73 | expected = [['0200', '-35.277272', '149.117136'], ['0221', '-35.201372', '149.095065']] 74 | self.assertEqual(expected, output) 75 | 76 | def test_custom_quoted_pluck(self): 77 | p = Processor(fields=[0, 10, 11], quotechar='|', skip=1) 78 | output = [row for row in p.process(SAMPLE_CUSTOM_QUOTED_CSV)] 79 | expected = [['0200', '-35.277272', '149.117136'], ['0221', '-35.201372', '149.095065']] 80 | self.assertEqual(expected, output) 81 | 82 | def test_tab_delimited_input(self): 83 | p = Processor(delimiter="\t") 84 | output = [row for row in p.process(SAMPLE_TSV)] 85 | self.assertEqual([['a', 'b', 'c'], ['d', 'e', 'f'], ['g', 'h', 'i']], 86 | output) 87 | --------------------------------------------------------------------------------