├── .gitignore
├── cobol.py
├── example.cbl
├── example.py
└── readme.md


/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc


--------------------------------------------------------------------------------
/cobol.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | 
  3 | class CobolPatterns:
  4 |     opt_pattern_format = "({})?"
  5 | 
  6 |     row_pattern_base = r'^(?P<level>\d{2})\s+(?P<name>\S+)'
  7 |     row_pattern_occurs = r'\s+OCCURS (?P<occurs>\d+) TIMES'
  8 |     row_pattern_indexed_by = r"\s+INDEXED BY\s(?P<indexed_by>\S+)"
  9 |     row_pattern_redefines = r"\s+REDEFINES\s(?P<redefines>\S+)"
 10 |     row_pattern_pic = r'\s+PIC\s+(?P<pic>\S+)'
 11 |     row_pattern_end = r'\.$'
 12 | 
 13 |     row_pattern = re.compile(row_pattern_base + 
 14 |                              opt_pattern_format.format(row_pattern_redefines) + 
 15 |                              opt_pattern_format.format(row_pattern_occurs) + 
 16 |                              opt_pattern_format.format(row_pattern_indexed_by) + 
 17 |                              opt_pattern_format.format(row_pattern_pic) + 
 18 |                              row_pattern_end)
 19 | 
 20 |     pic_pattern_repeats = re.compile(r'(.)\((\d+)\)')
 21 |     pic_pattern_float = re.compile(r'S?[9Z]*[.V][9Z]+')
 22 |     pic_pattern_integer = re.compile(r'S?[9Z]+')
 23 | 
 24 | 
 25 | # Parse the pic string
 26 | def parse_pic_string(pic_str):
 27 |     # Expand repeating chars
 28 |     while True:
 29 |         match = CobolPatterns.pic_pattern_repeats.search(pic_str)
 30 | 
 31 |         if not match:
 32 |             break
 33 |         
 34 |         expanded_str = match.group(1) * int(match.group(2))
 35 |         
 36 |         pic_str = CobolPatterns.pic_pattern_repeats.sub(expanded_str, pic_str, 1)
 37 | 
 38 |     # Match to types
 39 |     if CobolPatterns.pic_pattern_float.match(pic_str):
 40 |         data_type = 'Float'
 41 |     elif CobolPatterns.pic_pattern_integer.match(pic_str):
 42 |         data_type = 'Integer'
 43 |     else:
 44 |         data_type = 'Char'
 45 | 
 46 |     # Handle signed
 47 |     if pic_str[0] == "S":
 48 |         data_type = "Signed " + data_type
 49 |         pic_str = pic_str[1:]
 50 | 
 51 |     # Handle precision
 52 |     decimal_pos = 0
 53 | 
 54 |     if 'V' in pic_str:
 55 |         decimal_pos = len(pic_str[pic_str.index('V') + 1 :])
 56 |         pic_str = pic_str.replace('V', '')
 57 | 
 58 |     return {'type':data_type, 'length':len(pic_str), 'precision':decimal_pos}
 59 | 
 60 | # Cleans the COBOL by converting the cobol informaton to single lines
 61 | def clean_cobol(lines):
 62 |     holder = []
 63 | 
 64 |     output = []
 65 | 
 66 |     for row in lines:            
 67 |         row = row[6:72].rstrip()
 68 | 
 69 |         if row == "" or row[0] in ('*','/'):
 70 |             continue
 71 | 
 72 |         holder.append(row if len(holder) == 0 else row.strip())
 73 | 
 74 |         if row[-1] == ".":
 75 |             output.append(" ".join(holder))
 76 | 
 77 |             holder = []
 78 |             
 79 | 
 80 |     if len(holder) > 0:
 81 |         print "[WARNING] probably invalid COBOL - found unfinished line: ", " ".join(holder)
 82 | 
 83 |     return output
 84 | 
 85 | """
 86 | Parses the COBOL
 87 |  - converts the COBOL line into a dictionarty containing the information
 88 |  - parses the pic information into type, length, precision 
 89 |  - handles redefines
 90 | """
 91 | def parse_cobol(lines):
 92 |     output = []
 93 | 
 94 |     intify = ["level","occurs"]
 95 | 
 96 |     # All in 1 line now, let's parse
 97 |     for row in lines:
 98 |         match = CobolPatterns.row_pattern.match(row.strip())
 99 | 
100 |         if not match:
101 |             print "Found unmatched row", row.strip()
102 |             continue
103 | 
104 |         match = match.groupdict()
105 | 
106 |         for i in intify:
107 |             match[i] = int(match[i] ) if match[i] is not None else None
108 | 
109 |         if match['pic'] is not None:
110 |             match['pic_info'] = parse_pic_string(match['pic'])
111 | 
112 |         if match['redefines'] is not None:
113 |             # Find item that is being redefined.
114 |             try:
115 |                 redefinedItemIndex, redefinedItem = [(index, item) for index, item in enumerate(output) if item['name'] == match['redefines']][0]
116 | 
117 |                 related_group = get_subgroup( redefinedItem['level'] , output[ redefinedItemIndex+1 : ] )
118 | 
119 |                 output = output[:redefinedItemIndex] + output[ redefinedItemIndex + len(related_group) + 1 : ]
120 | 
121 |                 match['redefines'] = None
122 |             except IndexError:
123 |                 print "Could not find the field to be redefined ({}) for row: {}".format(match['redefines'], row.strip())
124 | 
125 |         output.append(match)
126 | 
127 |     return output
128 | 
def get_subgroup(parent_level, lines):
    """Return the leading rows of ``lines`` that are nested below a parent.

    Rows are taken from the start of ``lines`` for as long as their
    ``level`` is strictly greater than ``parent_level``; collection stops at
    the first row with an equal or lower level. The rows themselves are not
    copied, only gathered in a new list.
    """
    children = []

    for candidate in lines:
        if not candidate["level"] > parent_level:
            break

        children.append(candidate)

    return children
142 | 
def denormalize_cobol(lines):
    """Denormalize parsed COBOL rows.

    Hands the rows to ``handle_occurs`` with an occurrence count of 1 for
    the copybook as a whole and returns its result.
    """
    top_level_occurrences = 1

    return handle_occurs(lines, top_level_occurrences)
145 | 
def handle_occurs(lines, occurs, level_diff=0, name_postfix=""):
    """Denormalize parsed rows by writing out every OCCURS repetition.

    ``lines`` is emitted ``occurs`` times. When ``occurs`` is greater than 1
    the repetition number is appended to ``name_postfix`` as ``-<i>``. Every
    emitted row is a shallow copy of the input row with ``level_diff`` added
    to its level and ``indexed_by`` set to ``None`` (indexes are meaningless
    once the OCCURS are gone).

    Per row:
      - no OCCURS: emitted once, name extended with the current postfix
      - OCCURS with PIC: emitted ``row['occurs']`` times with ``-<j>``
        appended after the postfix, and ``occurs`` cleared
      - OCCURS without PIC (a group): the group row itself is dropped and
        its subgroup is expanded recursively, shifted so that the first
        child ends up on the group's level; a group without any child rows
        contributes nothing

    Returns the new list of rows; the input rows are not modified.
    """
    output = []

    for i in range(1, occurs + 1):

        skip_till = 0
        # First time occurs is just 1, we don't want to add -1 after *every* field
        new_name_postfix = name_postfix if occurs == 1 else name_postfix + '-' + str(i)

        for index, row in enumerate(lines):
            # Rows already emitted as part of an expanded group.
            if index < skip_till:
                continue

            new_row = row.copy()

            new_row['level'] += level_diff

            # Not needed when flattened
            new_row['indexed_by'] = None

            if row['occurs'] is None:
                new_row['name'] = row['name'] + new_name_postfix

                output.append(new_row)

            elif row['pic'] is not None:
                # If it has occurs and pic just repeat the same line multiple times
                new_row['occurs'] = None

                for j in range(1, row['occurs'] + 1):
                    row_to_add = new_row.copy()

                    row_to_add['name'] = row['name'] + new_name_postfix + '-' + str(j)

                    output.append(row_to_add)

            else:
                # Get all the lines that have to occur
                occur_lines = get_subgroup(row['level'], lines[index + 1:])

                # A group without children has nothing to repeat; indexing
                # occur_lines[0] below would raise IndexError.
                if not occur_lines:
                    continue

                # Calculate the new level difference that has to be applied
                new_level_diff = level_diff + row['level'] - occur_lines[0]['level']

                output += handle_occurs(occur_lines, row['occurs'], new_level_diff, new_name_postfix)

                skip_till = index + len(occur_lines) + 1

    return output
201 | 
def clean_names(lines, ensure_unique_names=False, strip_prefix=False, make_database_safe=False):
    """Clean the names of parsed COBOL rows in place and return ``lines``.

    Options:
      - ``strip_prefix``: drop everything up to and including the first
        ``-`` of ``name`` and, when set, of ``indexed_by``
      - ``make_database_safe``: replace every ``-`` in ``name`` with ``_``
      - ``ensure_unique_names``: when a name was already used, append a
        counter (``-2``, ``-3``, ... or ``_2``, ``_3``, ... when
        ``make_database_safe`` is set) until it is unused

    Uniqueness is enforced on the final name, after the database-safe
    conversion, so that e.g. ``A-B`` and ``A_B`` cannot both end up as
    ``A_B``.
    """
    seen = set()
    separator = "_" if make_database_safe else "-"

    for row in lines:
        name = row['name']

        if strip_prefix:
            name = name[name.find('-') + 1:]

            if row['indexed_by'] is not None:
                row['indexed_by'] = row['indexed_by'][row['indexed_by'].find('-') + 1:]

        if make_database_safe:
            name = name.replace("-", "_")

        if ensure_unique_names:
            candidate = name
            counter = 1

            while candidate in seen:
                counter += 1
                candidate = name + separator + str(counter)

            seen.add(candidate)
            name = candidate

        row['name'] = name

    return lines
235 | 
def process_cobol(lines):
    """Run the complete pipeline on raw copybook lines.

    The lines are cleaned, parsed and denormalized; afterwards the names are
    cleaned with unique names, prefix stripping and database-safe names all
    switched on. Returns the resulting list of row dictionaries.
    """
    parsed = parse_cobol(clean_cobol(lines))
    flattened = denormalize_cobol(parsed)

    return clean_names(flattened, True, True, True)
238 | 
def print_cobol(lines):
    """Print parsed COBOL rows as a Copybook compatible file.

    Every row is written as ``<level>  <name>`` followed by its INDEXED BY,
    OCCURS and PIC clauses (when set) and a closing period. Rows start after
    7 spaces of padding and are indented by two spaces per nesting depth.
    When adding the next clause would make the line longer than 66
    characters, the line is closed and the clause continues on a new line
    with extra indentation. All printed lines are padded to 80 characters.
    """
    output = []

    default_padding = ' ' * 7
    max_data_length = 66

    # Stack of the levels that are currently "open"; its depth determines
    # the indentation. The 0 is a sentinel that is never popped.
    levels = [0]

    for row in lines:
        # Close every group that is deeper than this row ...
        while row['level'] < levels[-1]:
            levels.pop()

        # ... and open a new depth when the row's level is not on the stack
        # yet. Checking this after the pops as well keeps a row like 05 that
        # follows "01, 10" from being printed at the depth of 01.
        if row['level'] > levels[-1]:
            levels.append(row['level'])

        indent = (len(levels) - 1) * '  '

        row_output = [indent, "{0:02d}  ".format(row['level']), row['name']]

        if row['indexed_by'] is not None:
            row_output.append(" INDEXED BY " + row['indexed_by'])

        if row['occurs'] is not None:
            row_output.append(" OCCURS {0:04d} TIMES".format(row['occurs']))

        if row['pic'] is not None:
            row_output.append(" PIC " + row['pic'])

        row_output.append(".")

        outp = default_padding

        for data in row_output:

            if len(outp) + len(data) + 1 > max_data_length:
                # Makes rows 80 chars
                output.append(outp.ljust(80))

                # Start the following line with an extra padding
                outp = default_padding + indent + '    '

            outp += data

        output.append(outp.ljust(80))

    # Single-argument print() behaves the same on Python 2 and 3.
    print("\n".join(output))
292 | 
# Command-line entry point: parse a copybook file and print the result.
if __name__ == '__main__':
    import argparse
    import os.path
    import sys

    parser = argparse.ArgumentParser(description="Parse COBOL Copybooks")
    parser.add_argument("filename", help="The filename of the copybook.")
    parser.add_argument("--skip-all-processing", help="Only processes the redefines.", default=False, action="store_true")
    parser.add_argument("--skip-unique-names", help="Skips making all names unique.", default=False, action="store_true")
    parser.add_argument("--skip-denormalize", help="Skips denormalizing the COBOL.", default=False, action="store_true")
    parser.add_argument("--skip-strip-prefix", help="Skips stripping the prefix from the names.", default=False, action="store_true")

    args = parser.parse_args()

    if not os.path.isfile(args.filename):
        print("Could not find {}".format(args.filename))
        # sys.exit is always available (the bare exit() helper comes from the
        # site module) and the non-zero status signals the failure.
        sys.exit(1)

    # Only keep the file open for as long as it takes to read it.
    with open(args.filename, 'r') as f:
        raw_lines = f.readlines()

    # Redefines are always processed as part of parsing.
    lines = parse_cobol(clean_cobol(raw_lines))

    if not args.skip_all_processing:
        if not args.skip_denormalize:
            lines = denormalize_cobol(lines)

        if not args.skip_strip_prefix or not args.skip_unique_names:
            lines = clean_names(lines, not args.skip_unique_names, not args.skip_strip_prefix)

    print_cobol(lines)
321 | 


--------------------------------------------------------------------------------
/example.cbl:
--------------------------------------------------------------------------------
 1 | 00000 * Example COBOL Copybook file                                     AAAAAAAA
 2 | 00000  01  PAULUS-EXAMPLE-GROUP.                                        AAAAAAAA
 3 | 00000       05  PAULUS-ANOTHER-GROUP OCCURS 0003 TIMES.                 AAAAAAAA
 4 | 00000           10  PAULUS-FIELD-1 PIC X(3).                            AAAAAAAA
 5 | 00000           10  PAULUS-FIELD-2 REDEFINES PAULUS-FIELD-1 PIC 9(3).   AAAAAAAA
 6 | 00000           10  PAULUS-FIELD-3 OCCURS 0002 TIMES                    AAAAAAAA
 7 | 00000                           PIC S9(3)V99.                           AAAAAAAA
 8 | 00000       05  PAULUS-THIS-IS-ANOTHER-GROUP.                           AAAAAAAA
 9 | 00000           10  PAULUS-YES PIC X(5).                                AAAAAAAA
10 | 


--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------
import cobol

# Process the example copybook and print the name of every resulting field.
with open("example.cbl", 'r') as f:
    for row in cobol.process_cobol(f.readlines()):
        # Single-argument print() behaves the same on Python 2 and 3.
        print(row['name'])
6 | 


--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
  1 | # COBOL Copybook parser in Python
  2 | This is a COBOL Copybook parser in Python featuring the following options:
  3 |  - Parse the Copybook into a usable format to use in Python
  4 |  - Clean up the Copybook by processing REDEFINES statements and remove unused definitions
  5 |  - Denormalize the Copybook
  6 |  - Write the cleaned Copybook in COBOL
  7 |  - Strip prefixes of field names and ensure that the Copybook only contains unique names
  8 |  - Can be used from the command-line or included in own Python projects
  9 | 
 10 | Because I couldn't find a COBOL Copybook parser that fit all my needs, I wrote my own. It doesn't support every feature found in Copybooks, only the ones I have come across so far: REDEFINES, INDEXED BY and OCCURS.
 11 | 
 12 | On a day-to-day basis I use it so that Informatica PowerCenter creates only one table from my COBOL data instead of multiple tables.
 13 | 
 14 | The code uses the pic parser code from [pyCOBOL](http://www.travelingfrontiers.com/projects/doku.php?id=pycobol).
 15 | 
 16 | This code is licensed under GPLv3.
 17 | 
 18 | ## Example output
 19 | Below is an example Copybook file before and after being processed.
 20 | 
 21 | Before:
 22 | 
 23 | 	00000 * Example COBOL Copybook file                                     AAAAAAAA
 24 | 	00000  01  PAULUS-EXAMPLE-GROUP.                                        AAAAAAAA
 25 | 	00000       05  PAULUS-ANOTHER-GROUP OCCURS 0003 TIMES.                 AAAAAAAA
 26 | 	00000           10  PAULUS-FIELD-1 PIC X(3).                            AAAAAAAA
 27 | 	00000           10  PAULUS-FIELD-2 REDEFINES PAULUS-FIELD-1 PIC 9(3).   AAAAAAAA
 28 | 	00000           10  PAULUS-FIELD-3 OCCURS 0002 TIMES                    AAAAAAAA
 29 | 	00000                           PIC S9(3)V99.                           AAAAAAAA
 30 | 	00000       05  PAULUS-THIS-IS-ANOTHER-GROUP.                           AAAAAAAA
 31 | 	00000           10  PAULUS-YES PIC X(5).                                AAAAAAAA
 32 | 
 33 | After:
 34 | 
 35 | 	         01  EXAMPLE-GROUP.                                                     
 36 | 	           05  FIELD-2-1 PIC 9(3).                                              
 37 | 	           05  FIELD-3-1-1 PIC S9(3)V99.                                        
 38 | 	           05  FIELD-3-1-2 PIC S9(3)V99.                                        
 39 | 	           05  FIELD-2-2 PIC 9(3).                                              
 40 | 	           05  FIELD-3-2-1 PIC S9(3)V99.                                        
 41 | 	           05  FIELD-3-2-2 PIC S9(3)V99.                                        
 42 | 	           05  FIELD-2-3 PIC 9(3).                                              
 43 | 	           05  FIELD-3-3-1 PIC S9(3)V99.                                        
 44 | 	           05  FIELD-3-3-2 PIC S9(3)V99.                                        
 45 | 	           05  THIS-IS-ANOTHER-GROUP.                                           
 46 | 	             10  YES PIC X(5).                                                 
 47 | 
 48 | 
 49 | ## How to use
 50 | You can use it in two ways: inside your own python code or as a stand-alone command-line utility.
 51 | 
 52 | ### Command-line
 53 | Do a git clone from the repository and inside your brand new python-cobol folder run:
 54 | 
 55 | 	python cobol.py example.cbl
 56 | 
 57 | This will process the redefines, denormalize the file, strip the prefixes and ensure all names are unique. 
 58 | 
 59 | The utility allows for some command-line switches to disable some processing steps.
 60 | 
 61 | 	$ python cobol.py --help
 62 | 	usage: cobol.py [-h] [--skip-all-processing] [--skip-unique-names]
 63 | 	                      [--skip-denormalize] [--skip-strip-prefix]
 64 | 	                      filename
 65 | 
 66 | 	Parse COBOL Copybooks
 67 | 
 68 | 	positional arguments:
 69 | 	  filename              The filename of the copybook.
 70 | 
 71 | 	optional arguments:
 72 | 	  -h, --help            show this help message and exit
 73 | 	  --skip-all-processing
 74 | 	                        Only processes the redefines.
 75 | 	  --skip-unique-names   Skips making all names unique.
 76 | 	  --skip-denormalize    Skips denormalizing the COBOL.
 77 | 	  --skip-strip-prefix   Skips stripping the prefix from the names.	
 78 | 
 79 | ### From within your Python code
 80 | The parser can also be called from your Python code. All you need is a list of lines in COBOL Copybook format. See example.py for how to do it:
 81 | 
 82 | ```python
 83 | import cobol
 84 | 
 85 | with open("example.cbl",'r') as f:
 86 |     for row in cobol.process_cobol(f.readlines()):
 87 |     	print row['name']
 88 | ```
 89 | 
 90 | It is also possible to call one of the more specialized functions within cobol.py:
 91 | 
 92 | *    **clean_cobol(lines)**
 93 |     
 94 |      Cleans the COBOL by converting the COBOL information to single lines
 95 | 
 96 | 
 97 | *    **parse_cobol(lines)**
 98 | 
 99 |      Parses a list of COBOL field definitions into a list of dictionaries containing the parsed information.
100 | 
101 | 
102 | *    **denormalize_cobol(lines)**
103 | 
104 |      Denormalizes parsed COBOL lines
105 | 
106 | 
107 | *    **clean_names(lines, ensure_unique_names=False, strip_prefix=False, make_database_safe=False)**
108 |      
109 |      Cleans names of the fields in a list of parsed COBOL lines. Options to strip prefixes, enforce unique names and make the names database safe by converting dashes (-) to underscores (_)
110 | 
111 | 
112 | *    **print_cobol(lines)**
113 |      
114 |      Prints parsed COBOL lines in the Copybook format
115 | 


--------------------------------------------------------------------------------