# -*- coding: utf-8 -*-
# --- repository-dump residue that shared the first source line with cobol.py ---
# ├── .gitignore ├── cobol.py ├── example.cbl ├── example.py └── readme.md
# /.gitignore:
# 1 | *.pyc
# /cobol.py:
"""Parse, clean up, denormalize and print COBOL copybooks.

The pipeline is ``clean_cobol`` -> ``parse_cobol`` -> ``denormalize_cobol``
-> ``clean_names``; ``process_cobol`` runs all four and ``print_cobol``
writes the result back out in copybook layout.
"""
import argparse
import os.path
import re
import sys


class CobolPatterns:
    """Regular expressions used to parse copybook rows and PIC strings."""

    # Wraps a clause pattern so that the whole clause becomes optional.
    opt_pattern_format = "({})?"

    # The group names are the keys parse_cobol() reads from groupdict():
    # level, name, occurs, indexed_by, redefines and pic.
    row_pattern_base = r'^(?P<level>\d{2})\s+(?P<name>\S+)'
    row_pattern_occurs = r'\s+OCCURS (?P<occurs>\d+) TIMES'
    row_pattern_indexed_by = r"\s+INDEXED BY\s(?P<indexed_by>\S+)"
    row_pattern_redefines = r"\s+REDEFINES\s(?P<redefines>\S+)"
    row_pattern_pic = r'\s+PIC\s+(?P<pic>\S+)'
    row_pattern_end = r'\.$'

    # Clause order accepted on a row: REDEFINES, OCCURS, INDEXED BY, PIC.
    row_pattern = re.compile(row_pattern_base +
                             opt_pattern_format.format(row_pattern_redefines) +
                             opt_pattern_format.format(row_pattern_occurs) +
                             opt_pattern_format.format(row_pattern_indexed_by) +
                             opt_pattern_format.format(row_pattern_pic) +
                             row_pattern_end)

    pic_pattern_repeats = re.compile(r'(.)\((\d+)\)')
    pic_pattern_float = re.compile(r'S?[9Z]*[.V][9Z]+')
    pic_pattern_integer = re.compile(r'S?[9Z]+')


def parse_pic_string(pic_str):
    """Parse a PIC string into its type, length and precision.

    Args:
        pic_str: the picture clause, e.g. ``"S9(3)V99"``.

    Returns:
        A dict with the keys ``type`` ('Float', 'Integer' or 'Char',
        prefixed with 'Signed ' when the picture starts with ``S``),
        ``length`` (number of picture characters once repeats are expanded
        and ``S``/``V`` are removed) and ``precision`` (number of
        characters after ``V``, 0 when there is no ``V``).
    """
    # Expand repeating chars, e.g. "9(3)" -> "999". Slicing is used instead
    # of re.sub so the expansion is never interpreted as a regex template.
    match = CobolPatterns.pic_pattern_repeats.search(pic_str)
    while match:
        expanded_str = match.group(1) * int(match.group(2))
        pic_str = pic_str[:match.start()] + expanded_str + pic_str[match.end():]
        match = CobolPatterns.pic_pattern_repeats.search(pic_str)

    # Match to types
    if CobolPatterns.pic_pattern_float.match(pic_str):
        data_type = 'Float'
    elif CobolPatterns.pic_pattern_integer.match(pic_str):
        data_type = 'Integer'
    else:
        data_type = 'Char'

    # Handle signed. startswith() also copes with a picture that expanded
    # to the empty string (e.g. "X(0)"), where indexing [0] would raise.
    if pic_str.startswith("S"):
        data_type = "Signed " + data_type
        pic_str = pic_str[1:]

    # Handle precision
    decimal_pos = 0

    if 'V' in pic_str:
        decimal_pos = len(pic_str[pic_str.index('V') + 1:])
        pic_str = pic_str.replace('V', '')

    return {'type': data_type, 'length': len(pic_str), 'precision': decimal_pos}


def clean_cobol(lines):
    """Convert raw copybook lines into one string per COBOL statement.

    Only characters 7-72 of every line are kept (``row[6:72]``). Empty
    lines and lines whose first kept character is ``*`` or ``/`` are
    skipped, and continuation lines are joined with a single space until a
    line ends with a period.

    Args:
        lines: iterable of raw copybook lines.

    Returns:
        A list of statements, each ending in ``"."``. An unterminated
        trailing statement is reported with a warning and left out.
    """
    holder = []
    output = []

    for row in lines:
        row = row[6:72].rstrip()

        if row == "" or row[0] in ('*', '/'):
            continue

        # The first fragment keeps its leading whitespace, later ones do not.
        holder.append(row if not holder else row.strip())

        if row.endswith("."):
            output.append(" ".join(holder))
            holder = []

    if holder:
        print("[WARNING] probably invalid COBOL - found unfinished line:  " + " ".join(holder))

    return output


def parse_cobol(lines):
    """Parse cleaned COBOL statements into a list of dictionaries.

    - converts each COBOL line into a dictionary containing the information
    - parses the pic information into type, length, precision
    - handles redefines: the redefined item and its subordinate items are
      removed from the result and replaced by the redefining item

    Args:
        lines: statements as returned by ``clean_cobol``.

    Returns:
        A list of dicts with the keys ``level``, ``name``, ``redefines``,
        ``occurs``, ``indexed_by`` and ``pic``, plus ``pic_info`` for rows
        that have a PIC clause. Rows that do not match the row pattern are
        reported and skipped.
    """
    output = []

    intify = ["level", "occurs"]

    # All in 1 line now, let's parse
    for row in lines:
        match = CobolPatterns.row_pattern.match(row.strip())

        if not match:
            print("Found unmatched row " + row.strip())
            continue

        match = match.groupdict()

        for key in intify:
            match[key] = int(match[key]) if match[key] is not None else None

        if match['pic'] is not None:
            match['pic_info'] = parse_pic_string(match['pic'])

        if match['redefines'] is not None:
            # Find the first item that is being redefined.
            redefined_index = next(
                (index for index, item in enumerate(output)
                 if item['name'] == match['redefines']),
                None)

            if redefined_index is None:
                print("Could not find the field to be redefined ({}) for row: {}".format(
                    match['redefines'], row.strip()))
            else:
                redefined_item = output[redefined_index]
                related_group = get_subgroup(redefined_item['level'],
                                             output[redefined_index + 1:])

                # Drop the redefined item together with its subordinate items.
                output = (output[:redefined_index] +
                          output[redefined_index + len(related_group) + 1:])

                match['redefines'] = None

        output.append(match)

    return output


def get_subgroup(parent_level, lines):
    """Return the leading rows of ``lines`` that belong to a parent item.

    Collects rows whose level is higher than ``parent_level`` and stops at
    the first row whose level is equal to or lower than ``parent_level``.
    """
    output = []

    for row in lines:
        if row["level"] > parent_level:
            output.append(row)
        else:
            return output

    return output


def denormalize_cobol(lines):
    """Flatten all OCCURS clauses in parsed COBOL lines."""
    return handle_occurs(lines, 1)


def handle_occurs(lines, occurs, level_diff=0, name_postfix=""):
    """Denormalize parsed COBOL lines by repeating everything that OCCURS.

    Because the OCCURS are removed, the INDEXED BY is removed as well.

    Args:
        lines: parsed rows to process.
        occurs: how many times ``lines`` has to be repeated.
        level_diff: value added to the level of every emitted row.
        name_postfix: suffix appended to every emitted name.

    Returns:
        A new list of row dicts; the input rows are copied, not modified.
        A group row that carries OCCURS is not emitted itself: its
        subordinate rows are repeated and moved up to the group's level.
    """
    output = []

    for i in range(1, occurs + 1):

        skip_till = 0
        # First time occurs is just 1, we don't want to add -1 after *every* field
        new_name_postfix = name_postfix if occurs == 1 else name_postfix + '-' + str(i)

        for index, row in enumerate(lines):
            if index < skip_till:
                continue

            new_row = row.copy()

            new_row['level'] += level_diff

            # Not needed when flattened
            new_row['indexed_by'] = None

            if row['occurs'] is None:
                new_row['name'] = row['name'] + new_name_postfix

                output.append(new_row)

            elif row["pic"] is not None:
                # If it has occurs and pic just repeat the same line multiple times
                new_row['occurs'] = None

                for j in range(1, row["occurs"] + 1):
                    row_to_add = new_row.copy()
                    row_to_add["name"] = row['name'] + new_name_postfix + '-' + str(j)

                    output.append(row_to_add)

            else:
                # Get all the lines that have to occur
                occur_lines = get_subgroup(row['level'], lines[index + 1:])

                if not occur_lines:
                    # A repeating group without subordinate items has nothing
                    # to repeat; report it instead of failing on occur_lines[0].
                    print("[WARNING] OCCURS group without subordinate items: " + row['name'])
                    continue

                # Calculate the new level difference that has to be applied
                new_level_diff = level_diff + row['level'] - occur_lines[0]['level']

                output += handle_occurs(occur_lines, row['occurs'], new_level_diff, new_name_postfix)

                # The subordinate rows were handled by the recursive call.
                skip_till = index + len(occur_lines) + 1

    return output


def clean_names(lines, ensure_unique_names=False, strip_prefix=False, make_database_safe=False):
    """Clean the names of parsed COBOL lines in place.

    Args:
        lines: parsed rows; their ``name`` (and ``indexed_by``) values are
            modified in place.
        ensure_unique_names: append ``-2``, ``-3``, ... to repeated names.
        strip_prefix: drop everything up to and including the first ``-``.
        make_database_safe: convert ``-`` to ``_`` in the names.

    Returns:
        The same ``lines`` list.
    """
    names = set()

    for row in lines:
        if strip_prefix:
            # find() returns -1 when there is no dash, which keeps the whole name.
            row['name'] = row['name'][row['name'].find('-') + 1:]

            if row['indexed_by'] is not None:
                row['indexed_by'] = row['indexed_by'][row['indexed_by'].find('-') + 1:]

        if ensure_unique_names:
            i = 1
            candidate = row['name']
            while candidate in names:
                i += 1
                candidate = row['name'] + "-" + str(i)

            names.add(candidate)
            row['name'] = candidate

        if make_database_safe:
            row['name'] = row['name'].replace("-", "_")

    return lines


def process_cobol(lines):
    """Clean, parse and denormalize raw copybook lines and clean all names."""
    return clean_names(denormalize_cobol(parse_cobol(clean_cobol(lines))), True, True, True)


def format_cobol(lines):
    """Format parsed COBOL lines as 80-character copybook rows.

    OCCURS is written before INDEXED BY, which is the order the row
    pattern of this module expects, so the output can be parsed again.

    NOTE(review): this code was recovered from a text dump that collapsed
    runs of spaces; the indent and separator literals below may have been
    wider in the upstream file - confirm against the original.

    Args:
        lines: parsed rows (dicts with ``level``, ``name``, ``occurs``,
            ``indexed_by`` and ``pic``).

    Returns:
        A list of strings, each padded with spaces to 80 characters.
    """
    output = []

    default_padding = ' ' * 7
    max_data_length = 66

    # Stack of the levels that enclose the current row; 0 is a sentinel.
    levels = [0]

    for row in lines:
        row_output = []

        # Unwind to the enclosing group, then open a new nesting step when
        # the row is deeper than what is left on the stack. The second test
        # also runs after unwinding so that a skipped level (01, 10, 05)
        # keeps its own indentation.
        while row['level'] < levels[-1]:
            levels.pop()
        if row['level'] > levels[-1]:
            levels.append(row['level'])

        row_output.append((len(levels) - 1) * ' ')
        row_output.append("{0:02d} ".format(row['level']))
        row_output.append(row['name'])

        if row['occurs'] is not None:
            row_output.append(" OCCURS {0:04d} TIMES".format(row['occurs']))

        if row['indexed_by'] is not None:
            row_output.append(" INDEXED BY " + row['indexed_by'])

        if row['pic'] is not None:
            row_output.append(" PIC " + row['pic'])

        row_output.append(".")

        outp = default_padding

        for data in row_output:

            if len(outp) + len(data) + 1 > max_data_length:
                # Makes rows 80 chars
                outp += (80 - len(outp)) * ' '

                output.append(outp)

                # Start the following line with an extra padding
                outp = default_padding + (len(levels) - 1) * ' ' + ' '

            outp += data

        outp += (80 - len(outp)) * ' '
        output.append(outp)

    return output


def print_cobol(lines):
    """Print parsed COBOL lines as a Copybook compatible file."""
    print("\n".join(format_cobol(lines)))


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: argument list without the program name; ``None`` makes
            argparse read ``sys.argv``.

    Exits with status 1 when the copybook file does not exist.
    """
    parser = argparse.ArgumentParser(description="Parse COBOL Copybooks")
    parser.add_argument("filename", help="The filename of the copybook.")
    parser.add_argument("--skip-all-processing", help="Only processes the redefines.", default=False, action="store_true")
    parser.add_argument("--skip-unique-names", help="Skips making all names unique.", default=False, action="store_true")
    parser.add_argument("--skip-denormalize", help="Skips denormalizing the COBOL.", default=False, action="store_true")
    parser.add_argument("--skip-strip-prefix", help="Skips stripping the prefix from the names.", default=False, action="store_true")

    args = parser.parse_args(argv)

    if not os.path.isfile(args.filename):
        print("Could not find " + args.filename)
        sys.exit(1)

    with open(args.filename, 'r') as f:
        lines = parse_cobol(clean_cobol(f.readlines()))

    if not args.skip_all_processing:
        if not args.skip_denormalize:
            lines = denormalize_cobol(lines)

        if not args.skip_strip_prefix or not args.skip_unique_names:
            lines = clean_names(lines, not args.skip_unique_names, not args.skip_strip_prefix)

    print_cobol(lines)


if __name__ == '__main__':
    main()

# --- repository-dump residue that shared the last source line with cobol.py ---
# /example.cbl:
# 1 | 00000 * Example COBOL Copybook file AAAAAAAA
# 2 | 00000 01 PAULUS-EXAMPLE-GROUP.
AAAAAAAA 3 | 00000 05 PAULUS-ANOTHER-GROUP OCCURS 0003 TIMES. AAAAAAAA 4 | 00000 10 PAULUS-FIELD-1 PIC X(3). AAAAAAAA 5 | 00000 10 PAULUS-FIELD-2 REDEFINES PAULUS-FIELD-1 PIC 9(3). AAAAAAAA 6 | 00000 10 PAULUS-FIELD-3 OCCURS 0002 TIMES AAAAAAAA 7 | 00000 PIC S9(3)V99. AAAAAAAA 8 | 00000 05 PAULUS-THIS-IS-ANOTHER-GROUP. AAAAAAAA 9 | 00000 10 PAULUS-YES PIC X(5). AAAAAAAA 10 | -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | import cobol 2 | 3 | with open("example.cbl",'r') as f: 4 | for row in cobol.process_cobol(f.readlines()): 5 | print row['name'] 6 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # COBOL Copybook parser in Python 2 | This is a COBOL Copybook parser in Python featuring the following options: 3 | - Parse the Copybook into a usable format to use in Python 4 | - Clean up the Copybook by processing REDEFINES statements and removing unused definitions 5 | - Denormalize the Copybook 6 | - Write the cleaned Copybook in COBOL 7 | - Strip prefixes of field names and ensure that the Copybook only contains unique names 8 | - Can be used from the command-line or included in your own Python projects 9 | 10 | Because I couldn't find a COBOL Copybook parser that fit all my needs, I wrote my own. It doesn't support all functions found in the Copybook, just the ones that I met on my path: REDEFINES, INDEXED BY, OCCURS. 11 | 12 | On a day-to-day basis I use it so that Informatica PowerCenter creates only 1 table of my COBOL data instead of multiple. 13 | 14 | The code uses the pic parser code from [pyCOBOL](http://www.travelingfrontiers.com/projects/doku.php?id=pycobol). 15 | 16 | This code is licensed under GPLv3. 17 | 18 | ## Example output 19 | Below is an example Copybook file before and after being processed.
20 | 21 | Before: 22 | 23 | 00000 * Example COBOL Copybook file AAAAAAAA 24 | 00000 01 PAULUS-EXAMPLE-GROUP. AAAAAAAA 25 | 00000 05 PAULUS-ANOTHER-GROUP OCCURS 0003 TIMES. AAAAAAAA 26 | 00000 10 PAULUS-FIELD-1 PIC X(3). AAAAAAAA 27 | 00000 10 PAULUS-FIELD-2 REDEFINES PAULUS-FIELD-1 PIC 9(3). AAAAAAAA 28 | 00000 10 PAULUS-FIELD-3 OCCURS 0002 TIMES AAAAAAAA 29 | 00000 PIC S9(3)V99. AAAAAAAA 30 | 00000 05 PAULUS-THIS-IS-ANOTHER-GROUP. AAAAAAAA 31 | 00000 10 PAULUS-YES PIC X(5). AAAAAAAA 32 | 33 | After: 34 | 35 | 01 EXAMPLE-GROUP. 36 | 05 FIELD-2-1 PIC 9(3). 37 | 05 FIELD-3-1-1 PIC S9(3)V99. 38 | 05 FIELD-3-1-2 PIC S9(3)V99. 39 | 05 FIELD-2-2 PIC 9(3). 40 | 05 FIELD-3-2-1 PIC S9(3)V99. 41 | 05 FIELD-3-2-2 PIC S9(3)V99. 42 | 05 FIELD-2-3 PIC 9(3). 43 | 05 FIELD-3-3-1 PIC S9(3)V99. 44 | 05 FIELD-3-3-2 PIC S9(3)V99. 45 | 05 THIS-IS-ANOTHER-GROUP. 46 | 10 YES PIC X(5). 47 | 48 | 49 | ## How to use 50 | You can use it in two ways: inside your own Python code or as a stand-alone command-line utility. 51 | 52 | ### Command-line 53 | Do a git clone from the repository and inside your brand new python-cobol folder run: 54 | 55 | python cobol.py example.cbl 56 | 57 | This will process the redefines, denormalize the file, strip the prefixes and ensure all names are unique. 58 | 59 | The utility allows for some command-line switches to disable some processing steps. 60 | 61 | $ python cobol.py --help 62 | usage: cobol.py [-h] [--skip-all-processing] [--skip-unique-names] 63 | [--skip-denormalize] [--skip-strip-prefix] 64 | filename 65 | 66 | Parse COBOL Copybooks 67 | 68 | positional arguments: 69 | filename The filename of the copybook. 70 | 71 | optional arguments: 72 | -h, --help show this help message and exit 73 | --skip-all-processing 74 | Only processes the redefines. 75 | --skip-unique-names Skips making all names unique. 76 | --skip-denormalize Skips denormalizing the COBOL. 77 | --skip-strip-prefix Skips stripping the prefix from the names.
78 | 79 | ### From within your Python code 80 | The parser can also be called from your Python code. All you need is a list of lines in COBOL Copybook format. See example.py for how to do it: 81 | 82 | ```python 83 | import cobol 84 | 85 | with open("example.cbl",'r') as f: 86 | for row in cobol.process_cobol(f.readlines()): 87 | print row['name'] 88 | ``` 89 | 90 | It is also possible to call one of the more specialized functions within cobol.py: 91 | 92 | * **clean_cobol(lines)** 93 | 94 | Cleans the COBOL by converting the COBOL information to single lines 95 | 96 | 97 | * **parse_cobol(lines)** 98 | 99 | Parses a list of COBOL field definitions into a list of dictionaries containing the parsed information. 100 | 101 | 102 | * **denormalize_cobol(lines)** 103 | 104 | Denormalizes parsed COBOL lines 105 | 106 | 107 | * **clean_names(lines, ensure_unique_names=False, strip_prefix=False, make_database_safe=False)** 108 | 109 | Cleans names of the fields in a list of parsed COBOL lines. Options to strip prefixes, enforce unique names and make the names database safe by converting dashes (-) to underscores (_) 110 | 111 | 112 | * **print_cobol(lines)** 113 | 114 | Prints parsed COBOL lines in the Copybook format 115 | --------------------------------------------------------------------------------