├── usa_spent
│   ├── __init__.py
│   ├── usa_params.py
│   ├── psc_code_mapper.py
│   ├── extract.py
│   ├── download_txns_csv.py
│   ├── async_requests.py
│   ├── usaspending_api.py
│   └── transform.py
├── README.md
├── requirements.txt
└── usa_spent.ipynb

--------------------------------------------------------------------------------
/usa_spent/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# USA-SPENT

An asynchronous Python wrapper for the USAspending beta API.

Requires Python 3.5+.
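## Example

A minimal sketch of the intended workflow (the date range and output path below are illustrative, not fixed by the library):

```python
from usa_spent.extract import extract

# Pull every transaction whose action_date falls in the window into a CSV.
extract('2017-03-01', '2017-03-02', 'data/output.csv')
```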
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
aiohttp>=2.1.0
async-timeout>=1.2.1
chardet>=3.0.4
multidict>=2.1.6
numpy>=1.13.1
pandas>=0.20.2
python-dateutil>=2.6.0
pytz>=2017.2
requests>=2.14.2
six>=1.10.0
xlrd>=1.0.0
yarl>=0.10.3

--------------------------------------------------------------------------------
/usa_spent/usa_params.py:
--------------------------------------------------------------------------------
class PostParams:
    """Builds the POST body for USAspending API searches."""

    def __init__(self, **kwargs):
        self.params = dict(kwargs)

    @staticmethod
    def filter_(field, operation, value):
        """Return a single filter object in the shape the API expects."""
        return {'field': field, 'operation': operation, 'value': value}


if __name__ == '__main__':
    filter_ = PostParams.filter_

    params = PostParams(
        page=1,
        filters=[
            filter_('awarding_agency__toptier_agency__cgac_code',
                    'equals',
                    '070'),
            filter_('action_date',
                    'equals',
                    '2017-02-13')
        ],
    )

    print(params.params)
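A minimal sketch (not part of the module) of how these params become the JSON body of a POST request; the URL and headers mirror those used in usaspending_api.py and download_txns_csv.py:

import json

import requests

from usa_spent.usa_params import PostParams

# Build the same payload shape that UsaSpendingService.search() posts.
params = PostParams(
    page=1,
    filters=[PostParams.filter_('action_date', 'equals', '2017-02-13')],
)

response = requests.post(
    'https://api.usaspending.gov/api/v1/transactions/',
    headers={'content-type': 'application/json'},
    data=json.dumps(params.params),
)
print(response.status_code)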
--------------------------------------------------------------------------------
/usa_spent/psc_code_mapper.py:
--------------------------------------------------------------------------------
import csv


def main():
    # TODO create or open the mapping document - /data/psc_mapper.csv
    # TODO port the mapper between the pipe-delimited file and a dictionary
    #      with plain csv.reader/csv.writer, not DictReader
    fileloc = r'C:\Users\dshorstein\Python\Projects\usa-spent\data\mapper.csv'

    headers = ['psc', 'napcs_code']

    map_object = {'6515': '811', 'R499': '741010101'}

    save_mapper(fileloc, headers, map_object)


def save_mapper(fileloc, headers, mapper):
    # The header must be wrapped in its own list; extending with the bare
    # headers list would make csv.writer spell each header string out one
    # character per column.
    rows = [headers]
    rows.extend(mapper.items())
    with open(fileloc, 'w', newline='') as output_file:
        writer = csv.writer(output_file, delimiter='|')  # pipe-delimited
        writer.writerows(rows)


def load_mapper(fileloc):
    with open(fileloc, 'r', newline='') as input_file:
        reader = csv.reader(input_file, delimiter='|')
        headers = next(reader)
        mapper = dict(reader)

    return headers, mapper


if __name__ == '__main__':
    main()
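A quick round-trip check (using a hypothetical temp path rather than the hardcoded one above) showing that load_mapper recovers exactly what save_mapper wrote:

import os
import tempfile

from usa_spent.psc_code_mapper import save_mapper, load_mapper

path = os.path.join(tempfile.gettempdir(), 'psc_mapper_demo.csv')
save_mapper(path, ['psc', 'napcs_code'], {'6515': '811', 'R499': '741010101'})

headers, mapper = load_mapper(path)
assert headers == ['psc', 'napcs_code']
assert mapper == {'6515': '811', 'R499': '741010101'}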
--------------------------------------------------------------------------------
/usa_spent/extract.py:
--------------------------------------------------------------------------------
import os

from usa_spent.usa_params import PostParams
from usa_spent.usaspending_api import UsaSpendingService


def extract(start_date, ending_date, file_loc):
    """Pull every transaction with an action_date between start_date and
    ending_date (inclusive) into a CSV at file_loc."""

    # fields = ['contract_data', 'federal_action_obligation']

    endpoint = '/api/v1/transactions/'

    usa = UsaSpendingService()

    filter_ = PostParams.filter_

    params = PostParams(
        # fields=fields,
        filters=[
            # filter_('awarding_agency__toptier_agency__cgac_code',
            #         'equals',
            #         '070'),
            filter_('action_date',
                    'less_than_or_equal',
                    ending_date),
            filter_('action_date',
                    'greater_than_or_equal',
                    start_date)
        ],
    )

    usa.search(endpoint=endpoint, fileloc=file_loc, params=params.params)


if __name__ == '__main__':
    data_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'data'))

    if not os.path.exists(data_path):
        os.mkdir(data_path)

    file_loc = os.path.join(data_path, 'output.csv')

    extract('2017-03-30', '2017-03-31', file_loc)

--------------------------------------------------------------------------------
/usa_spent/download_txns_csv.py:
--------------------------------------------------------------------------------
import json
import time

import requests


endpoints = {
    'transactions': '/api/v1/download/transactions/'
}

baseurl = 'https://api.usaspending.gov'


def post_params(**parameters):
    return parameters


def filter_(**filter_params):
    return filter_params


def request_full_csv(endpoint, payload=None):
    """Kick off a bulk CSV export; the API answers 202 while it builds the file."""
    headers = {'content-type': 'application/json'}

    url = baseurl + endpoint

    r = requests.post(url, headers=headers, data=payload)
    return r


if __name__ == '__main__':
    params = post_params(
        verbose=True,
        filters=[
            filter_(field='awarding_agency__toptier_agency__cgac_code',
                    operation='equals',
                    value='070'),
            filter_(field='action_date',
                    operation='equals',
                    value='2017-02-13')
        ]
    )

    endpoint = endpoints['transactions']

    r = request_full_csv(endpoint, json.dumps(params))

    print('Requested file for {} endpoint with parameters = {}'.format(endpoint, params))

    retry_url = r.json()['retry_url']

    elapsed = 0

    # Poll the retry URL once a second until the export is ready.
    while r.status_code == 202:
        elapsed += 1
        time.sleep(1)
        r = requests.get(retry_url)
        print('Time elapsed = {} seconds. Status is "{}". Trying {} again.'.format(
            elapsed, r.json()['status'], r.url))

    if r.status_code == 200:
        print('Success! CSV file is located at {}'.format(r.json()['location']))
    else:
        print(r.status_code)
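A hedged variant of the polling loop above: it reads the same response fields ('retry_url', 'status', 'location') the script already uses, but swaps the fixed one-second sleep for exponential backoff with a hard timeout. The timeout and delay values are illustrative:

import json
import time

import requests

from usa_spent.download_txns_csv import endpoints, filter_, post_params, request_full_csv


def wait_for_csv(response, timeout=300, delay=1, max_delay=30):
    """Poll the export's retry_url until it leaves 202 or the timeout elapses,
    doubling the sleep between attempts (capped at max_delay)."""
    if response.status_code != 202:
        return response
    retry_url = response.json()['retry_url']
    deadline = time.time() + timeout
    while response.status_code == 202 and time.time() < deadline:
        time.sleep(delay)
        delay = min(delay * 2, max_delay)  # exponential backoff, capped
        response = requests.get(retry_url)
    return response


if __name__ == '__main__':
    params = post_params(filters=[
        filter_(field='action_date', operation='equals', value='2017-02-13')])
    r = wait_for_csv(request_full_csv(endpoints['transactions'], json.dumps(params)))
    print(r.status_code, r.json().get('location'))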
--------------------------------------------------------------------------------
/usa_spent/async_requests.py:
--------------------------------------------------------------------------------
import asyncio

from aiohttp import ClientSession


class AsyncGet:
    def __init__(self, url):
        self.url = url
        self.request_type = 'GET'


class AsyncPost:
    def __init__(self, url, payload, headers):
        self.url = url
        self.payload = payload
        self.headers = headers
        self.request_type = 'POST'


class AsyncResponses:
    """Fires a batch of GET/POST requests concurrently and stores the bodies."""

    def __init__(self, request_items):
        self.response = None
        self.request_items = list(request_items)
        self.print_status = False

    def _reset(self):
        self.response = None

    async def _fetch(self, request_obj, session):
        if self.print_status:
            print('Request for {}'.format(request_obj.url))

        if request_obj.request_type == 'GET':
            async with session.get(request_obj.url) as response:
                return await response.text()
        else:
            async with session.post(url=request_obj.url, data=request_obj.payload,
                                    headers=request_obj.headers) as response:
                return await response.text()

    async def _run(self):
        tasks = []
        async with ClientSession() as session:
            for request_obj in self.request_items:
                task = asyncio.ensure_future(self._fetch(request_obj, session))
                tasks.append(task)

            request_resp = await asyncio.gather(*tasks)
            self.response = request_resp

    def async_run(self):
        loop = asyncio.get_event_loop()
        future = asyncio.ensure_future(self._run())
        loop.run_until_complete(future)


if __name__ == '__main__':
    urls = ['https://api.usaspending.gov/api/v1/accounts/awards/?page={}',
            'https://api.usaspending.gov/api/v1/federal_accounts/?page={}',
            'https://api.usaspending.gov/api/v1/transactions/?page={}',
            'https://api.usaspending.gov/api/v1/tas/balances/?page={}',
            'https://api.usaspending.gov/api/v1/tas/categories/?page={}']

    urls = [url.format(n) for n in range(1, 5) for url in urls]

    urls_async = [AsyncGet(url) for url in urls]

    async_request = AsyncResponses(urls_async)
    async_request.async_run()

    results = zip(urls, async_request.response)

    for url, body in results:
        print(url)
        print(body[:500])
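The __main__ block above only exercises AsyncGet; here is a matching sketch for AsyncPost, posting the page-numbered JSON payload shape that usaspending_api.py builds (the bare {'page': ...} body is illustrative):

import json

from usa_spent.async_requests import AsyncPost, AsyncResponses

headers = {'content-type': 'application/json'}
url = 'https://api.usaspending.gov/api/v1/transactions/'

# Fire pages 1-3 of the same search concurrently.
posts = [AsyncPost(url, payload=json.dumps({'page': page}), headers=headers)
         for page in range(1, 4)]

responses = AsyncResponses(posts)
responses.async_run()

for body in responses.response:
    print(body[:200])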
--------------------------------------------------------------------------------
/usa_spent/usaspending_api.py:
--------------------------------------------------------------------------------
import json
import csv

from usa_spent.async_requests import AsyncPost, AsyncResponses


# TODO - write outputs in chunks to file, to reduce memory impact
# TODO - add error checks

class UsaSpendingService:
    def __init__(self):
        self.endpoints = None
        self.base_url = 'https://api.usaspending.gov'
        self.endpoint_fields = {}
        self.results = []
        self.record_count = 0
        self.result_keys = []
        self.first_line = True

    def search(self, endpoint, fileloc, params=None):
        url = self.base_url + endpoint
        params = dict(params or {})  # copy; also avoids a mutable default argument

        chunk = 10  # number of pages requested concurrently per batch
        start_page = 1

        headers = {'content-type': 'application/json'}

        print('Searching params: {}'.format(params))

        while True:
            # Build one POST per page in this batch, each with its own
            # serialized copy of the params.
            url_searches = []
            for page in range(start_page, start_page + chunk):
                params['page'] = page
                url_searches.append(
                    AsyncPost(url, payload=json.dumps(params), headers=headers))

            resp = AsyncResponses(url_searches)
            resp.async_run()

            for item in resp.response:
                message = json.loads(item).get('message')
                if message:
                    # TODO - log this instead of just printing, to see what went wrong
                    print(message)

            # print('Page {} | {}'.format(item['page_metadata']['page'], item['page_metadata']['current']))

            self.results = [self._flatten_json(json.loads(item)) for item in resp.response]

            # Drop empty pages and merge the rest into one flat list of records.
            self.results = [result for page in self.results if page for result in page]

            # self.save_raw_json(fileloc)

            self.save_dict_to_csv(fileloc)

            self.record_count += len(self.results)

            try:
                if json.loads(resp.response[-1])['page_metadata']['has_next_page']:
                    start_page += chunk
                else:
                    break
            except (ValueError, KeyError, TypeError):
                print('error!', resp.response[-1])
                start_page += chunk

        print('Completed data pull! {} records found'.format(self.record_count))

    def _flatten_json(self, json_data):
        """Flatten each result record up to two levels deep, joining nested keys
        with '__', and track every key seen so the CSV header can cover them."""
        data = []
        results = json_data.get('results')
        if not results:
            return None

        for record in results:
            record_data = {}
            for key, val in record.items():
                if not isinstance(val, dict):
                    record_data[key] = val
                else:
                    for subkey, subval in val.items():
                        if not isinstance(subval, dict):
                            record_data['{}__{}'.format(key, subkey)] = subval
                        else:
                            for subsubkey, subsubval in subval.items():
                                record_data['{}__{}__{}'.format(key, subkey, subsubkey)] = subsubval
            data.append(record_data)
            for key in record_data:
                if key not in self.result_keys:
                    self.result_keys.append(key)

        return data

    # TODO - write the header only once (or not at all), tack newly-discovered
    #        fields onto the end using a set, and/or write a separate csv file
    def save_dict_to_csv(self, fileloc):
        keys = self.result_keys
        with open(fileloc, 'a+', newline='', encoding='utf-8', errors='replace') as output_file:
            dict_writer = csv.DictWriter(output_file, keys)
            if self.first_line:
                dict_writer.writeheader()
                self.first_line = False
            dict_writer.writerows(self.results)

    # def save_raw_json(self, fileloc):
    #     # A conventional way to append to JSON; network operations almost
    #     # always dominate runtime, so a hackier approach rarely pays off.
    #     try:
    #         with open(fileloc) as input_file:
    #             data = json.load(input_file)
    #     except (IOError, ValueError):
    #         data = {}
    #
    #     data.update(self.results)
    #
    #     with open(fileloc, 'w') as output_file:
    #         json.dump(data, output_file)
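_flatten_json above handles exactly two levels of nesting; a hedged sketch (not part of the service) generalizing it to any depth with the same '__' separator, leaving lists untouched just as the original does:

def flatten_record(record, parent_key='', sep='__'):
    """Recursively flatten one result record into a single-level dict."""
    flat = {}
    for key, val in record.items():
        new_key = parent_key + sep + key if parent_key else key
        if isinstance(val, dict):
            flat.update(flatten_record(val, new_key, sep=sep))
        else:
            flat[new_key] = val
    return flat


if __name__ == '__main__':
    record = {'id': 1,
              'awarding_agency': {'toptier_agency': {'cgac_code': '070'}}}
    print(flatten_record(record))
    # {'id': 1, 'awarding_agency__toptier_agency__cgac_code': '070'}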
--------------------------------------------------------------------------------
/usa_spent/transform.py:
--------------------------------------------------------------------------------
import pandas as pd


# Crosswalk from lower-cased state/territory name to USPS abbreviation.
STATE_ABBREVS = {
    'alabama': 'AL', 'alaska': 'AK', 'arizona': 'AZ', 'arkansas': 'AR',
    'california': 'CA', 'colorado': 'CO', 'connecticut': 'CT', 'delaware': 'DE',
    'florida': 'FL', 'georgia': 'GA', 'hawaii': 'HI', 'idaho': 'ID',
    'illinois': 'IL', 'indiana': 'IN', 'iowa': 'IA', 'kansas': 'KS',
    'kentucky': 'KY', 'louisiana': 'LA', 'maine': 'ME', 'maryland': 'MD',
    'massachusetts': 'MA', 'michigan': 'MI', 'minnesota': 'MN',
    'mississippi': 'MS', 'missouri': 'MO', 'montana': 'MT', 'nebraska': 'NE',
    'nevada': 'NV', 'new hampshire': 'NH', 'new jersey': 'NJ',
    'new mexico': 'NM', 'new york': 'NY', 'north carolina': 'NC',
    'north dakota': 'ND', 'ohio': 'OH', 'oklahoma': 'OK', 'oregon': 'OR',
    'pennsylvania': 'PA', 'rhode island': 'RI', 'south carolina': 'SC',
    'south dakota': 'SD', 'tennessee': 'TN', 'texas': 'TX', 'utah': 'UT',
    'vermont': 'VT', 'virginia': 'VA', 'washington': 'WA',
    'west virginia': 'WV', 'wisconsin': 'WI', 'wyoming': 'WY',
    'guam': 'GU', 'puerto rico': 'PR', 'virgin islands': 'VI',
}


def main():
    fileloc = r'C:\Users\dshorstein\Python\Projects\usa-spent\data\output.csv'

    df = pd.read_csv(fileloc)

    df['TRANSACTION_TYPE'] = df.apply(get_transaction_type, axis=1)

    # grouped = df[['type_description', 'TRANSACTION_TYPE', 'federal_action_obligation']].groupby(
    #     ['TRANSACTION_TYPE', 'type_description']).sum()
    #
    # print(grouped)
    #
    # small_cols = ['federal_action_obligation', 'TRANSACTION_TYPE', 'contract_data__piid',
    #               'assistance_data__fain', 'type', 'type_description',
    #               'awarding_agency__toptier_agency__abbreviation',
    #               'awarding_agency__subtier_agency__name',
    #               'assistance_data__cfda__website_address',
    #               'place_of_performance__state_name', 'place_of_performance__zip5',
    #               'place_of_performance__state_code', 'place_of_performance__city_name',
    #               'action_date', 'action_type', 'action_type_description',
    #               'period_of_performance_current_end_date',
    #               'period_of_performance_start_date', 'contract_data__naics',
    #               'contract_data__naics_description',
    #               'contract_data__product_or_service_code', 'description']
    #
    # small_df = df[small_cols]
    #
    # tp = small_df[df_search('toilet paper', small_df)]
    # grant_dollars = small_df[df_search('grant', small_df)].federal_action_obligation.sum()
    #
    # print(tp)
    # print(grant_dollars)


def get_transaction_type(row):  # TODO - add one / two loan types
    """Classify a transaction as a loan, grant, or contract."""
    if is_loan(row['type_description']):
        return 'loan'
    elif row['type'].isalpha():
        return 'grant'
    elif row['type'].isdigit():
        return 'contract'


def is_loan(description):
    return 'loan' in description.lower()


def state_code(row):
    """Prefer the state code; fall back to crosswalking the state name.
    Exact-name matching avoids the substring pitfall where 'west virginia'
    also contains 'virginia'. Unrecognized names are returned unchanged."""
    code = row['place_of_performance__state_code']
    if code != '':
        return code
    name = row['place_of_performance__state_name']
    return STATE_ABBREVS.get(name.lower().strip(), name)


def df_search(term, df):
    """Boolean mask of rows whose description contains term (case-insensitive)."""
    return df.description.apply(lambda x: term in str(x).lower())


if __name__ == '__main__':
    main()
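A tiny self-contained demo of the two row-level transforms above; the two sample rows are fabricated for illustration only:

import pandas as pd

from usa_spent.transform import get_transaction_type, state_code

df = pd.DataFrame([
    {'type': '07', 'type_description': 'Guaranteed/Insured Loan',
     'place_of_performance__state_code': 'TX',
     'place_of_performance__state_name': 'Texas'},
    {'type': 'A', 'type_description': 'Blanket Purchase Agreement',
     'place_of_performance__state_code': '',
     'place_of_performance__state_name': 'West Virginia'},
])

df['TRANSACTION_TYPE'] = df.apply(get_transaction_type, axis=1)
df['POP_STATE'] = df.apply(state_code, axis=1)

print(df[['TRANSACTION_TYPE', 'POP_STATE']])
# row 0 -> loan / TX; row 1 -> grant / WV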
--------------------------------------------------------------------------------
/usa_spent.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "from usa_spent.extract import extract\n",
    "from usa_spent.transform import get_transaction_type\n",
    "from usa_spent.transform import state_code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data_path = os.path.join(os.path.realpath('.'), 'data')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "if not os.path.exists(data_path):\n",
    "    os.mkdir(data_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "file_loc = os.path.join(data_path, 'output_0301_02.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Extract"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Searching params: {'filters': [{'field': 'action_date', 'operation': 'less_than_or_equal', 'value': '2017-03-02'}, {'field': 'action_date', 'operation': 'greater_than_or_equal', 'value': '2017-03-01'}]}\n",
      "Completed data pull! 21327 records found\n"
     ]
    }
   ],
   "source": [
    "extract('2017-03-01', '2017-03-02', file_loc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Transform"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df = pd.read_csv(file_loc, low_memory=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df = df[sorted(list(df.columns))].copy()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Set transaction type as loan, grant, or contract"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###### Loan: \"type_description\" contains \"loan\". Grant: the \"type\" field is alphabetic. Contract: the \"type\" field is numeric."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df['TRANSACTION_TYPE'] = df.apply(lambda row: get_transaction_type(row), axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Set place of performance zip and state"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df['POP_ZIP'] = df['place_of_performance__zip5']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###### Uses the state code, falling back to the state name when the code is missing; the name is then crosswalked back to a state code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df['POP_STATE'] = df.apply(lambda row: state_code(row), axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Set top tier funding & awarding agency"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df['FUND_AGENCY'] = df['funding_agency__toptier_agency__name']\n",
    "df['AWARD_AGENCY'] = df['awarding_agency__toptier_agency__name']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Set sub tier funding & awarding agency"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df['FUND_AGENCY_SUB'] = df['funding_agency__subtier_agency__name']\n",
    "df['AWARD_AGENCY_SUB'] = df['awarding_agency__subtier_agency__name']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Set product or service code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df['PSC'] = df['contract_data__product_or_service_code']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create NAICS field"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###### NAICS number and NAICS description fields"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df['NAICS'] = df['contract_data__naics']\n",
    "df['NAICS_DESC'] = df['contract_data__naics_description']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create Date Fields"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df['ACTION_DT'] = df['action_date']\n",
    "df['START_DT'] = df['period_of_performance_start_date']\n",
    "df['CURR_END_DT'] = df['period_of_performance_current_end_date']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Obligation Dollar Amount"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df['OBLIGATION_AMT'] = df['federal_action_obligation']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "### Create output csv file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "output_columns = ['TRANSACTION_TYPE','ACTION_DT','START_DT','CURR_END_DT','POP_ZIP','POP_STATE','OBLIGATION_AMT',\n",
    "                  'PSC','NAICS','NAICS_DESC','FUND_AGENCY','AWARD_AGENCY','FUND_AGENCY_SUB','AWARD_AGENCY_SUB']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df[output_columns].to_csv(os.path.join(data_path, '0301 clean output.csv'), index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
--------------------------------------------------------------------------------