├── README.md └── sql_tools.py /README.md: -------------------------------------------------------------------------------- 1 | # The Tracab Cookbook 2 | Here is a collection of code snippets and functions that I have used to work with Tracab data. I thought I would share them so that I have a public record for my own later use and so that others can use them too. 3 | 4 | ###### Caveats 5 | This is based on parsing the raw .dat file into a pandas object; I call this object 'tdat'. Pandas isn't the most memory-efficient or the fastest data structure for tracking data, but whatever. All functions use the 'tdat' object name for the parsed tracking data; this is a default you can override. 6 | 7 | ###### Contributions 8 | If you want to contribute snippets to store here they will be warmly welcomed. 9 | 10 | ### 1. Parse Tracab Metadata 11 | Each Tracab file comes with an XML file holding all the metadata for the period starts and ends as well as the pitch dimensions. This function parses it into a dict for easier access in later analysis. 
def parse_tracking_metadata(filename):
    """Parse a Tracab metadata XML file into a flat dict.

    The first ``<match>`` child of the XML root is read. Every ``<period>``
    node below it contributes two integer entries, ``period<N>_start`` and
    ``period<N>_end`` (N counted from 1 in document order), taken from its
    ``iStartFrame`` / ``iEndFrame`` attributes. The pitch dimensions are read
    from ``fPitchXSizeMeters`` / ``fPitchYSizeMeters`` and truncated to whole
    metres.

    :param filename: path (or open file object) of the metadata XML
    :return: dict with ``period<N>_start``, ``period<N>_end`` for each period
             found, plus ``pitch_x`` and ``pitch_y``
    :raises IndexError: if the root has no ``<match>`` child
    """
    # Local import so the snippet works when pasted on its own.
    import xml.etree.ElementTree as ET

    root = ET.parse(filename).getroot()
    match = root.findall('match')[0]

    game_info = dict()

    # One start/end pair per period node, however many the file declares
    # (the previous version indexed exactly four and crashed on fewer).
    for number, period in enumerate(match.iter('period'), start=1):
        game_info['period{}_start'.format(number)] = int(period.get('iStartFrame'))
        game_info['period{}_end'.format(number)] = int(period.get('iEndFrame'))

    # Attribute values look like "105.00": go through float before truncating.
    game_info['pitch_x'] = int(float(match.get('fPitchXSizeMeters')))
    game_info['pitch_y'] = int(float(match.get('fPitchYSizeMeters')))

    return game_info


# ### 2. Get One Frame of Tracking Data
# Select one frame in time of tracking data, with options to select just one
# team (1 = home, 0 = away, 10 = ball).
def get_frame(frame_select, trackingdata=None, a_team=False, team_select=1):
    """Return the rows of a single frame, optionally for one team only.

    :param frame_select: value of ``frameID`` to select
    :param trackingdata: parsed tracking DataFrame; defaults to the global
                         ``tdat`` looked up at call time
    :param a_team: when True, keep only rows whose ``team`` equals ``team_select``
    :param team_select: team code (1 = home, 0 = away, 10 = ball)
    :return: new DataFrame with a fresh 0..n-1 index
    """
    if trackingdata is None:
        # Resolved per call, so a later re-assignment of `tdat` is honoured
        # and the function can be defined before `tdat` exists.
        trackingdata = tdat

    keep = trackingdata['frameID'] == frame_select
    if a_team:
        keep = keep & (trackingdata['team'] == team_select)
    return trackingdata[keep].reset_index(drop=True)


# ### 3. Get a Segment of Tracking Data
# Select a period of time within the tracking data, with options to select
# just one team (1 = home, 0 = away, 10 = ball).
def get_segment(frame_select_start, frame_select_end, trackingdata=None, a_team=False, team_select=1):
    """Return all rows whose ``frameID`` lies in an inclusive frame range.

    :param frame_select_start: first frame of the segment (inclusive)
    :param frame_select_end: last frame of the segment (inclusive)
    :param trackingdata: parsed tracking DataFrame; defaults to the global
                         ``tdat`` looked up at call time
    :param a_team: when True, keep only rows whose ``team`` equals ``team_select``
    :param team_select: team code (1 = home, 0 = away, 10 = ball)
    :return: new DataFrame with a fresh 0..n-1 index
    """
    if trackingdata is None:
        trackingdata = tdat

    keep = trackingdata['frameID'].between(frame_select_start, frame_select_end)
    if a_team:
        keep = keep & (trackingdata['team'] == team_select)
    return trackingdata[keep].reset_index(drop=True)


# ### 4. Add False/True Column if a Player's Team is in Possession
# Although Tracab data says which team is in possession in a particular frame,
# there isn't an easy way of determining whether a given player's team is the
# one in possession of the ball.
def add_team_in_possession(trackingdata=None):
    """Add a boolean ``team_in_possession`` column.

    A row is True when it belongs to the home team (``team == 1``) while
    ``ball_owning_team`` is "H", or to the away team (``team == 0``) while it
    is "A". Every other row (including the ball) is False.

    The column is written onto the DataFrame that was passed in, which is
    also returned.

    :param trackingdata: parsed tracking DataFrame; defaults to the global
                         ``tdat`` looked up at call time
    :return: the same DataFrame with the extra column
    """
    if trackingdata is None:
        trackingdata = tdat

    home_has_ball = (trackingdata['team'] == 1) & (trackingdata['ball_owning_team'] == "H")
    away_has_ball = (trackingdata['team'] == 0) & (trackingdata['ball_owning_team'] == "A")
    trackingdata['team_in_possession'] = home_has_ball | away_has_ball
    return trackingdata


# ### 5. Create a Pitch
# A flexible pitch function that's customisable but with fixed defaults.
def create_pitch(fig_width=9,
                 fig_height=6,
                 x=5600,
                 y=3600,
                 border=400,
                 line_colour='black',
                 jp_alpha=0.2,
                 jp_colour='r',
                 middle_thirds=False,
                 basic_features=False,
                 JdeP=False,
                 add_axis=False):
    """Draw a football pitch on a new matplotlib figure.

    The pitch is centred on (0, 0) and spans ``-x..x`` by ``-y..y``. The
    fixed marking sizes used below (2015, 1660, 1015, 550, 366, 915, 1100)
    are in the same units as ``x`` and ``y`` and are not scaled with them.

    :param fig_width: figure width passed to ``plt.figure``
    :param fig_height: figure height passed to ``plt.figure``
    :param x: half-length of the pitch
    :param y: half-width of the pitch
    :param border: margin added around the pitch on every side of the axes
    :param line_colour: colour of all pitch markings
    :param jp_alpha: alpha of the Juego de Posicion guide lines
    :param jp_colour: colour of the Juego de Posicion guide lines
    :param middle_thirds: shade the central band of the pitch
    :param basic_features: when True, skip centre circle, spots and 6-yard boxes
    :param JdeP: draw the Juego de Posicion guide lines
    :param add_axis: keep the axes visible (they are hidden by default)
    :return: the ``matplotlib.pyplot`` module, with the new figure current
    """
    plt.figure(figsize=(fig_width, fig_height))
    plt.axis([-x - border, x + border, -y - border, y + border])

    def mirrored_box(inner_edge, half_height):
        # Rectangle from the goal line (+/-x) to +/-inner_edge, drawn on both ends.
        for side in (-1, 1):
            plt.plot([side * x, side * x, side * inner_edge, side * inner_edge, side * x],
                     [-half_height, half_height, half_height, -half_height, -half_height],
                     color=line_colour,
                     linewidth=1)

    def guide_line(xs, ys):
        plt.plot(xs, ys, color=jp_colour, linewidth=1, alpha=jp_alpha)

    ## create each half (a box reaching from the goal line to the halfway line)
    mirrored_box(0, y)

    ## create the 18 yard boxes
    w_of_18 = 2015
    h_of_18 = x - 1660
    mirrored_box(h_of_18, w_of_18)

    ## create the goals
    for side in (-1, 1):
        plt.plot([side * x, side * x], [-366, 366], color=line_colour, linewidth=3)

    middle_third_w = (x / 2) - (x / 6)

    ## Add middle thirds
    if middle_thirds:
        plt.fill([-middle_third_w, middle_third_w, middle_third_w, -middle_third_w, -middle_third_w],
                 [y, y, -y, -y, y],
                 "black",
                 alpha=0.05)

    if not basic_features:
        ## add centre circle
        centre_circle = plt.Circle((0, 0), 915, edgecolor=line_colour, fill=False)
        plt.gcf().gca().add_artist(centre_circle)

        ## add spots
        plt.scatter([x - 1100, -x + 1100, 0], [0, 0, 0], color=line_colour, s=(x / 500))

        ## create the 6 yard boxes
        w_of_6 = 1015
        h_of_6 = x - 550
        mirrored_box(h_of_6, w_of_6)

    ## add Juego de Posicion
    if JdeP:
        # horizontal channels between the two penalty-area edges
        for level in (w_of_18, -w_of_18, 915, -915):
            guide_line([-h_of_18, h_of_18], [level, level])

        # full-width verticals level with the edge of each penalty area
        for edge in (h_of_18, -h_of_18):
            guide_line([edge, edge], [-y, y])

        # short verticals in the wide channels at the middle-third boundaries
        for boundary in (-middle_third_w, middle_third_w):
            guide_line([boundary, boundary], [w_of_18, y])
            guide_line([boundary, boundary], [-w_of_18, -y])

    ## add axis
    if not add_axis:
        plt.axis('off')

    return plt


# ### 6. Add the Ball x & y Coordinates
# Create two new columns for the ball's x and y coordinates at that frame in time.
def add_ball_xy(trackingdata=None):
    """Attach the ball position of each frame to every row of that frame.

    Rows with ``team == 10`` are taken as the ball. Their ``x`` / ``y`` are
    merged back on ``frameID`` as ``ball_x`` / ``ball_y``. The merge is an
    inner join, so frames without a ball row are dropped from the result.

    :param trackingdata: parsed tracking DataFrame; defaults to the global
                         ``tdat`` looked up at call time
    :return: new DataFrame with ``ball_x`` and ``ball_y`` columns
    """
    if trackingdata is None:
        trackingdata = tdat

    ball_df = trackingdata.loc[trackingdata['team'] == 10, ['frameID', 'x', 'y']]
    ball_df = ball_df.rename(columns={'x': 'ball_x', 'y': 'ball_y'})

    return trackingdata.merge(ball_df, on="frameID")


# ### 7. Calculate the Distance to the Ball
# Creates a new column of the distance the player is from the ball.
def add_distance_to_ball(trackingdata=None):
    """Add ``distance_to_ball`` (Euclidean, rounded to 2 dp) to the DataFrame.

    Requires the ``ball_x`` / ``ball_y`` columns created by ``add_ball_xy``.
    If they are missing, a message is printed and ``None`` is returned.
    The column is written onto the DataFrame that was passed in.

    Example: ``tdat2 = add_distance_to_ball(add_ball_xy(tdat))``

    :param trackingdata: tracking DataFrame; defaults to the global ``tdat``
                         looked up at call time
    :return: the same DataFrame with the extra column, or None on failure
    """
    if trackingdata is None:
        trackingdata = tdat

    if 'ball_x' not in trackingdata.columns or 'ball_y' not in trackingdata.columns:
        print("x||----------------")
        print("Ball x and y coordinates missing - 'add_distance_to_ball' function failed")
        print("Use 'add_ball_xy' to add the missing coordinates")
        print("----------------||x")
        return None

    trackingdata['distance_to_ball'] = np.hypot(
        trackingdata['x'] - trackingdata['ball_x'],
        trackingdata['y'] - trackingdata['ball_y']).round(2)
    return trackingdata


# ### 8. Calculate the Distance to the Goals
# Creates two new columns for the distance from a player to each goal,
# goal1 (-x) and goal2 (x)
def add_distance_to_goals(trackingdata=None, x=5250):
    """Add the distance from every row to the centre of each goal.

    ``distance_to_goal1`` is measured to ``(-x, 0)`` and ``distance_to_goal2``
    to ``(x, 0)``. Both are rounded to 2 dp and written onto the DataFrame
    that was passed in.

    :param trackingdata: tracking DataFrame; defaults to the global ``tdat``
                         looked up at call time
    :param x: distance of each goal line from the centre of the pitch
    :return: the same DataFrame with the two extra columns
    """
    if trackingdata is None:
        trackingdata = tdat

    # The earlier np.array(-x, 0) passed 0 as the dtype argument and raised;
    # the goal centres are simply (-x, 0) and (x, 0).
    trackingdata['distance_to_goal1'] = np.hypot(trackingdata['x'] + x, trackingdata['y']).round(2)
    trackingdata['distance_to_goal2'] = np.hypot(trackingdata['x'] - x, trackingdata['y']).round(2)

    return trackingdata


# ### 9.
# Calculate Distance Between 2 Points
# Calculate the distance between 2 points
def calc_distance(x, y):
    """Return the Euclidean distance between two points.

    Accepts numpy arrays or any array-like (tuples, lists) of equal length.

    >>> float(calc_distance((0, 100), (0, 400)))
    300.0
    """
    return np.sqrt(np.sum((np.asarray(x) - np.asarray(y)) ** 2))


# ### 10. Add Attacking Direction
# For many analyses with tracking data you need to know the direction of play.
# Adding an attacking direction column helps with this. An attacking direction
# of 1 means the team is defending the goal -x and attacking the goal +x. An
# attacking direction of -1 means the team is defending the goal +x and
# attacking the goal -x.
def add_attacking_direction(trackingdata=None, metadata=None):
    """Add an ``attacking_direction`` column (1 or -1) for both teams.

    For every ``period<N>_start`` entry in ``metadata`` the rows of that
    frame are inspected: the team whose mean ``x`` is lower is taken to be
    defending the -x goal (direction 1), the other team gets -1. Periods
    whose start frame has no rows for both teams are skipped.

    The result is an inner merge on ``team`` and ``period_id``, so rows that
    belong to neither team (e.g. the ball, team 10) and rows of skipped
    periods are not part of the returned DataFrame.

    :param trackingdata: tracking DataFrame with ``frameID``, ``team``,
                         ``period_id`` and ``x``; defaults to the global
                         ``tdat`` looked up at call time
    :param metadata: dict with ``period<N>_start`` keys; defaults to the
                     global ``meta`` looked up at call time
    :return: new DataFrame with the extra ``attacking_direction`` column
    :raises ValueError: if no period start frame is found in the data
    """
    if trackingdata is None:
        trackingdata = tdat
    if metadata is None:
        metadata = meta

    periods_list = []
    teams_list = []
    direction_list = []

    period = 1
    while 'period{}_start'.format(period) in metadata:
        start_frame = trackingdata[trackingdata['frameID'] == metadata['period{}_start'.format(period)]]
        avg_x_home = start_frame[start_frame['team'] == 1]['x'].mean()
        avg_x_away = start_frame[start_frame['team'] == 0]['x'].mean()

        # mean() of an empty selection is NaN: the period was not played
        # (or its start frame is not in this data), so there is nothing to map.
        if not (pd.isna(avg_x_home) or pd.isna(avg_x_away)):
            home_direction = 1 if avg_x_home < avg_x_away else -1
            periods_list.extend([period, period])
            teams_list.extend([1, 0])
            direction_list.extend([home_direction, -home_direction])

        period += 1

    if not periods_list:
        raise ValueError("no period start frame from the metadata was found in the tracking data")

    attacking_direction_ref = pd.DataFrame(
        {'period_id': periods_list,
         'attacking_direction': direction_list,
         'team': teams_list})

    return pd.merge(trackingdata, attacking_direction_ref, on=["team", "period_id"])


# ### 11. Switch the Pitch
# For some analysis there is a need to orientate the pitch towards a
# standardised direction from -x -> x.
# This function switches the x,y coordinates to face -x -> x if the team in
# possession has an attacking direction of -1. Returns the tracking data
# segment and whether the switch occurred as a False/True.
def switch_the_pitch(frame_seg):
    """Mirror a tracking segment so the team in possession attacks towards +x.

    Possession is read from ``ball_owning_team`` of the first row ("H" means
    home, anything else is treated as away). When the segment has a ``team``
    column, the attacking direction is taken from the first row of the team
    in possession. Otherwise the first row is assumed to be a home-team row
    (the earlier behaviour). If the team in possession is not attacking
    towards +x, ``x`` and ``y`` are both multiplied by -1.

    The flip is applied to the DataFrame that was passed in.

    :param frame_seg: tracking DataFrame with ``attacking_direction``,
                      ``ball_owning_team``, ``x`` and ``y``
    :return: ``[frame_seg, switch]`` where ``switch`` is True if the
             coordinates were mirrored
    :raises ValueError: if ``frame_seg`` has no rows
    """
    if len(frame_seg) == 0:
        raise ValueError("frame_seg is empty")

    # Positional access: the segment may keep its original (non-zero) index.
    home_in_possession = frame_seg['ball_owning_team'].iloc[0] == "H"

    owner_directions = None
    if 'team' in frame_seg.columns:
        owner_code = 1 if home_in_possession else 0
        owner_directions = frame_seg.loc[frame_seg['team'] == owner_code, 'attacking_direction']

    if owner_directions is not None and len(owner_directions) > 0:
        switch = bool(owner_directions.iloc[0] != 1)
    else:
        # Fallback: treat row 0 as a home-team row.
        first_row_attacks_right = frame_seg['attacking_direction'].iloc[0] == 1
        switch = bool(first_row_attacks_right != home_in_possession)

    if switch:
        frame_seg['x'] = frame_seg['x'] * -1
        frame_seg['y'] = frame_seg['y'] * -1

    return [frame_seg, switch]


# ### 12. Calculate the Slope between 2 Points
# For some analysis it is helpful to calculate the slope between two points.
def slope(x1, y1, x2, y2):
    """Return the slope of the line through ``(x1, y1)`` and ``(x2, y2)``.

    :raises ZeroDivisionError: for plain Python numbers when ``x1 == x2``
    """
    return (y2 - y1) / (x2 - x1)


# ### 13.
# Convert a Tracab Location into Opta Coordinates
# There will be the need to convert a location in Tracab coordinate space
# (-x:x and -y:y) into the equivalent Opta coordinates (0-100 for both x and y)
def to_opta_coords(att_dir, X, Y, pitch_x=None, pitch_y=None):
    """Convert a Tracab location to Opta coordinates (whole percentages).

    The pitch sizes are multiplied by 100 to match the Tracab coordinate
    units. When ``att_dir`` is not 1 the location is mirrored through the
    centre spot first, so the result is always expressed for a team
    attacking towards +x.

    :param att_dir: attacking direction of the team (1 or -1)
    :param X: Tracab x coordinate
    :param Y: Tracab y coordinate
    :param pitch_x: pitch length; defaults to ``meta['pitch_x']`` at call time
    :param pitch_y: pitch width; defaults to ``meta['pitch_y']`` at call time
    :return: ``[opta_x, opta_y]`` as ints
    """
    if pitch_x is None:
        pitch_x = meta['pitch_x']
    if pitch_y is None:
        pitch_y = meta['pitch_y']

    sign = 1 if att_dir == 1 else -1

    def to_percent(value, pitch_size):
        half_extent = (pitch_size / 2) * 100
        fraction = 0.5 + (sign * value / half_extent) / 2
        # Round once, on the percentage itself: int(round(f, 2) * 100)
        # truncated values such as 0.29 * 100 == 28.999... down to 28.
        return int(round(fraction * 100))

    return [to_percent(X, pitch_x), to_percent(Y, pitch_y)]


# ### 14. Convert an Opta Location into Tracab Coordinates
# There will be the need to convert a location in Opta coordinate space (0-100
# for both x and y) into the equivalent Tracab coordinates (-x:x and -y:y).
def to_tracab_coords(att_dir, opta_x, opta_y, pitch_x=None, pitch_y=None):
    """Convert an Opta location (0-100 on both axes) to Tracab coordinates.

    When ``att_dir`` is not 1 the result is mirrored through the centre spot.

    :param att_dir: attacking direction of the team (1 or -1)
    :param opta_x: Opta x coordinate (0-100)
    :param opta_y: Opta y coordinate (0-100)
    :param pitch_x: pitch length; defaults to ``meta['pitch_x']`` at call time
    :param pitch_y: pitch width; defaults to ``meta['pitch_y']`` at call time
    :return: ``[tracab_x, tracab_y]``
    """
    if pitch_x is None:
        pitch_x = meta['pitch_x']
    if pitch_y is None:
        pitch_y = meta['pitch_y']

    sign = 1 if att_dir == 1 else -1

    tracab_x = ((opta_x - 50) * pitch_x) * sign
    tracab_y = ((opta_y - 50) * pitch_y) * sign

    return [tracab_x, tracab_y]


# ### 15.
# Parse f7 Files to Player Database
# Convert the f7 xml file into a player info dataframe
def parse_f7(file_name):
    """Parse an f7 XML file into one row per player in the match line-ups.

    Line-up data (``MatchPlayer``), names (``Player``), substitutions
    (``Substitution``) and the ``match_time`` ``Stat`` are read from the
    ``SoccerDocument`` node. The leading letter of every ``uID`` / ``*Ref``
    value is stripped.

    Minutes played follow these rules:
      * ``Status == "Start"``: from 0 until subbed off, else until match end
      * ``Status == "Sub"`` and brought on: from the sub-on minute until
        subbed off again, else until match end
      * otherwise 0 minutes

    Each player gets the ``team_id`` of the ``Team`` element that contains
    them. ``team`` is 1 for the first ``Team`` element and 0 for any other.
    NOTE(review): this keeps the earlier assumption that the first ``Team``
    element is the home side - confirm against the feed.

    :param file_name: path (or open file object) of the f7 XML
    :return: DataFrame with columns formation_place, player_id, position,
             jersey_no, status, first_name, last_name, player_name,
             start_min, end_min, mins_played, match_id, team_id, team
    """
    # Local import so the snippet works when pasted on its own.
    import xml.etree.ElementTree as ET

    def strip_prefix(ref):
        # "p12345" -> "12345"; missing attributes stay None
        return ref[1:] if ref is not None else None

    root = ET.parse(file_name).getroot()
    gameinfo = root.findall('SoccerDocument')[0]
    match_id = int(gameinfo.get('uID')[1:])

    # --- line-ups ---------------------------------------------------------
    lineup_columns = ['formation_place', 'player_id', 'position', 'jersey_no', 'status']
    players1 = pd.DataFrame(
        [[node.get('Formation_Place'),
          strip_prefix(node.get('PlayerRef')),
          node.get('Position'),
          node.get('ShirtNumber'),
          node.get('Status')] for node in gameinfo.iter('MatchPlayer')],
        columns=lineup_columns)

    # --- names ------------------------------------------------------------
    players2 = pd.DataFrame(
        [[node.findtext('PersonName/First', default=''),
          strip_prefix(node.get('uID')),
          node.findtext('PersonName/Last', default='')] for node in gameinfo.iter('Player')],
        columns=['first_name', 'player_id', 'last_name'])

    playersDB = players1.merge(players2, on='player_id', how='inner')
    playersDB["player_name"] = playersDB["first_name"].map(str) + " " + playersDB["last_name"]

    # --- substitutions ----------------------------------------------------
    # First event per player wins, as integers so minutes can be subtracted.
    on_minute = {}
    off_minute = {}
    for node in gameinfo.iter('Substitution'):
        raw_minute = node.get('Time')
        minute = int(raw_minute) if raw_minute is not None else 0
        on_minute.setdefault(strip_prefix(node.get('SubOn')), minute)
        off_minute.setdefault(strip_prefix(node.get('SubOff')), minute)

    match_length = 90
    for node in gameinfo.iter('Stat'):
        if node.get('Type') == "match_time":
            match_length = int(node.text)

    start_mins = []
    end_mins = []
    for player_id, status in zip(playersDB['player_id'], playersDB['status']):
        start_min = 0
        end_min = 0
        if status == "Start":
            end_min = match_length
        elif status == "Sub" and player_id in on_minute:
            start_min = on_minute[player_id]
            end_min = match_length
        if status in ("Start", "Sub") and player_id in off_minute:
            end_min = off_minute[player_id]
        start_mins.append(start_min)
        end_mins.append(end_min)

    playersDB['start_min'] = start_mins
    playersDB['end_min'] = end_mins
    playersDB['mins_played'] = playersDB["end_min"] - playersDB["start_min"]

    playersDB['match_id'] = match_id

    # --- teams ------------------------------------------------------------
    # Look the team up per player instead of assuming 18 players per side.
    team_of_player = {}
    for position, team in enumerate(gameinfo.findall('Team')):
        team_uid = team.get('uID')[1:]
        team_flag = 1 if position == 0 else 0
        for node in team.iter('Player'):
            team_of_player.setdefault(strip_prefix(node.get('uID')), (team_uid, team_flag))

    playersDB['team_id'] = [team_of_player.get(p, ("", ""))[0] for p in playersDB['player_id']]
    playersDB['team'] = [team_of_player.get(p, ("", ""))[1] for p in playersDB['player_id']]

    return playersDB


# ### 17. Add the Player Name and Opta player_id
# Useful in order to link tracking data to players
def add_player_id(f7_filename=None, tracking_data=None):
    """Attach ``player_id`` and ``player_name`` from an f7 file to tracking data.

    The player table from ``parse_f7`` is extended with one row for the ball
    (jersey 999, team 10, name "ball") and inner-merged onto the tracking
    data on ``jersey_no`` and ``team`` (both compared as floats).

    :param f7_filename: f7 XML path; defaults to the global ``f7_file``
                        looked up at call time
    :param tracking_data: tracking DataFrame; defaults to the global ``tdat``
                          looked up at call time
    :return: new DataFrame with ``player_id`` and ``player_name`` columns
    """
    if f7_filename is None:
        f7_filename = f7_file
    if tracking_data is None:
        tracking_data = tdat

    key_columns = ['jersey_no', 'player_id', 'team', 'player_name']
    playerDB_ = parse_f7(f7_filename)[key_columns]

    ballDB = pd.DataFrame([['999.0', '000000', '10.0', 'ball']], columns=key_columns)
    playerDB_ = pd.concat([playerDB_, ballDB], ignore_index=True)

    playerDB_['jersey_no'] = playerDB_['jersey_no'].astype(float)
    playerDB_['team'] = playerDB_['team'].astype(float)

    return tracking_data.merge(playerDB_, on=['jersey_no', 'team'])


# --------------------------------------------------------------------------
# /sql_tools.py
# --------------------------------------------------------------------------
import sqlite3
from sqlite3 import Error


def _quote_identifier(name):
    """Return ``name`` as a double-quoted SQL identifier (embedded quotes doubled)."""
    return '"' + str(name).replace('"', '""') + '"'


def create_connection(db_file):
    """ create a database connection to the SQLite database
        specified by db_file
    :param db_file: database file
    :return: Connection object or None
    """
    try:
        return sqlite3.connect(db_file)
    except Error as e:
        print(e)
        return None


def create_table(conn, create_table_sql):
    """ create a table from the create_table_sql statement
    :param conn: Connection object
    :param create_table_sql: a CREATE TABLE statement
    :return: True if the statement ran, False if sqlite3 reported an error
             (the error is printed)
    """
    try:
        c = conn.cursor()
        c.execute(create_table_sql)
        return True
    except Error as e:
        print(e)
        return False


def convert_datatype_to_sql_type(type_str):
    """ turns a pandas datatype into a SQL datatype
    :param type_str: pandas dtype name, e.g. "int64"
    :return: 'integer' for any int/uint width, 'real' for any float width,
             'text' for everything else
    """
    if type_str.startswith(('int', 'uint')):
        return 'integer'
    if type_str.startswith('float'):
        return 'real'
    return 'text'


def pandas_to_sql_table_creation(table_name, dat, connection, NOT_NULL_ = True):
    """ turns a pandas dataframe into a sql table
        automatically assigns correct data types to each column.
    :param table_name: decide on the name of the SQL table to be created
    :param dat: pandas dataframe ready for conversion
    :param connection: database connection created via create_connection(db_file);
                       nothing is executed when it is None
    :param NOT_NULL_: blanket assigns all columns to be NOT NULL or not
    :outcome: create the table within the database; a primary key column
              named <table_name>_id is added in front of the dataframe columns
    """
    try:
        null_clause = " NOT NULL" if NOT_NULL_ else ""

        # primary key first, then one definition per dataframe column
        column_defs = [_quote_identifier(str(table_name) + "_id") + " integer PRIMARY KEY"]
        for column, dtype in zip(dat.columns, dat.dtypes):
            column_defs.append(_quote_identifier(column) + " " +
                               convert_datatype_to_sql_type(str(dtype)) + null_clause)

        table_create_str = ("CREATE TABLE " + _quote_identifier(table_name) +
                            " (" + ", ".join(column_defs) + ");")
    except Exception as e:
        print(e)
        print(str(table_name) + " ** FAILURE ** when adding table to database")
        return

    if connection is not None:
        # create_table prints the sqlite error itself; only claim success
        # when the statement really ran.
        if create_table(connection, table_create_str):
            print(str(table_name) + " successfully added to database")
        else:
            print(str(table_name) + " ** FAILURE ** when adding table to database")


def create_sql_insert(table_name, column_names):
    """ Create the SQL INSERT base statement without values
    :param table_name: previously created SQL table
    :param column_names: columns names of the dat dataframe
    :return: SQL statement string with one "?" placeholder per column
    """
    columns_string = ",".join(_quote_identifier(name) for name in column_names)
    question_string = ",".join("?" for _ in column_names)

    return ("INSERT INTO " + _quote_identifier(table_name) +
            " (" + columns_string + ") VALUES (" + question_string + ")")


def insert_a_row(table_name, dat, connection, row_index, id_to_print):
    """ takes one row of the pandas dataframe and inserts it into the newly created SQL table
    :param table_name: previously created SQL table
    :param dat: pandas dataframe ready for insertion
    :param connection: database connection created via create_connection(db_file)
    :param row_index: positional row index of the dat dataframe
    :param id_to_print: column whose value identifies the row in the failure message
    :outcome: insert a row into the SQL database (committed on success);
              on any failure the error and a failure message are printed
    """
    # python type each SQL type is coerced to before binding; text is the fallback
    converters = {'integer': int, 'real': float}

    try:
        sql = create_sql_insert(table_name, dat.columns)

        values = []
        for idx, dtype in enumerate(dat.dtypes):
            convert = converters.get(convert_datatype_to_sql_type(str(dtype)), str)
            # iat reads the cell without up-casting the whole row to one dtype
            values.append(convert(dat.iat[row_index, idx]))

        # the connection context manager commits, or rolls back on error
        with connection:
            cur = connection.cursor()
            cur.execute(sql, tuple(values))

    except Exception as e:
        print(e)
        try:
            row_label = str(dat.iloc[row_index][id_to_print])
        except Exception:
            row_label = "row " + str(row_index)
        print(row_label + " *** FAILURE *** when adding to database into " + str(table_name))