├── README.md ├── LICENSE └── labeldata.py /README.md: -------------------------------------------------------------------------------- 1 | # textutil-assign-labels 2 | Simple utility script for assigning labels to data in multiple files. All rows in a single file are assigned the same label. 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 floydhub 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /labeldata.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function, unicode_literals 2 | import argparse 3 | import random 4 | 5 | def str2bool(val): 6 | """ 7 | Helper method to convert string to bool 8 | """ 9 | if val is None: 10 | return False 11 | val = val.lower().strip() 12 | if val in ['true', 't', 'yes', 'y', '1', 'on']: 13 | return True 14 | elif val in ['false', 'f', 'no', 'n', '0', 'off']: 15 | return False 16 | 17 | def main(): 18 | """ 19 | Simple helper script to assign labels to data in multiple files. All the rows in 20 | a single file will be assigned the same label. 21 | 22 | This reads all the data into memory first. It is fast, but unsuitable for large files 23 | """ 24 | 25 | # Parse command line args 26 | parser = argparse.ArgumentParser(description='Assign labels to input data') 27 | 28 | fileargs = ['input1', 'input2', 'input3', 'input4', 'input5'] 29 | labelargs = ['label1', 'label2', 'label3', 'label4', 'label5'] 30 | 31 | for i in range(len(fileargs)): 32 | parser.add_argument('-i{}'.format(i+1), '--{}'.format(fileargs[i]), 33 | required=True if i==0 else False, 34 | help='Path to input file {}'.format(i)) 35 | parser.add_argument('-l{}'.format(i+1), '--{}'.format(labelargs[i]), 36 | required=True if i==0 else False, 37 | help='Label for input file {}'.format(i)) 38 | 39 | parser.add_argument('-o', '--output', required=True, help='Path to output file') 40 | parser.add_argument('-d', '--delimiter', required=True, default='\t', 41 | help='Column delimiter between row and label') 42 | parser.add_argument('-s', '--shuffle', required=False, type=str2bool, 43 | default='False', help='Shuffle rows in output?') 44 | parser.add_argument('-p', '--position', required=False, choices=['start', 'end'], 45 | default='end', help='Position label at start or end of row? (Default is end)') 46 | 47 | args = parser.parse_args() 48 | # Unescape the delimiter 49 | # args.delimiter = args.delimiter.decode('string_escape') # Python 2 50 | args.delimiter = bytes(args.delimiter, "utf-8").decode('unicode_escape') # Python 3 51 | # Convert args to dict 52 | vargs = vars(args) 53 | 54 | print("\nArguments:") 55 | for arg in vargs: 56 | print("{}={}".format(arg, getattr(args, arg))) 57 | 58 | # Load data from files 59 | output = [] 60 | for i in range(len(fileargs)): 61 | file = vargs[fileargs[i]] 62 | label = vargs[labelargs[i]] 63 | if file and label: 64 | print("\nProcessing Input{}".format(i)) 65 | rows = list(open(file, 'r').readlines()) 66 | if args.position == 'start': # append label to start of row 67 | rows = [label + args.delimiter + row.strip() for row in rows] 68 | else: # append label to end of row 69 | rows = [row.strip() + args.delimiter + label for row in rows] 70 | output = output + rows 71 | 72 | # Shuffle all rows? 73 | if args.shuffle == True: 74 | print("\nShuffling rows") 75 | random.shuffle(output) 76 | 77 | with open(args.output, "w") as f: 78 | print("\nDumping to output") 79 | for row in output: 80 | f.write("{}\n".format(row)) 81 | 82 | print("\nDone. Bye!") 83 | 84 | if __name__ == '__main__': 85 | main() --------------------------------------------------------------------------------