├── README.md
├── aetna.py
├── aetna_data.db
├── bcbs.py
├── bcbs_data.db
└── bcbs_manual.py

/README.md:
--------------------------------------------------------------------------------
# transparency-in-coverage-filesizes

Scripts that estimate the total size of the Transparency in Coverage
machine-readable files (MRFs) published by Aetna and Blue Cross Blue Shield of
North Carolina. Each script collects in-network-rate file URLs, issues HEAD
requests, and records each file's content-length in a SQLite database.

--------------------------------------------------------------------------------
/aetna.py:
--------------------------------------------------------------------------------
import asyncio
import sqlite3

import aiohttp
import requests
from tqdm import tqdm

con = sqlite3.connect("aetna_data.db")
cur = con.cursor()
cur.execute("CREATE TABLE IF NOT EXISTS in_network_files (url PRIMARY KEY UNIQUE, size)")

# The following values were inferred by inspecting the network requests on the
# pages linked from here:
# https://www.aetna.com/individuals-families/member-rights-resources/rights/disclosure-information.html
brand_codes = ['ASH', 'AETNACVS', 'ALICUNDER100', 'ALICFI']

def resolve_urls(file_paths, brand_code):
    return [
        f'https://mrf.healthsparq.com/aetnacvs-egress.nophi.kyruushsq.com/prd/mrf/AETNACVS_I/{brand_code}/2022-08-05/inNetworkRates/{file_path}'
        for file_path in file_paths]

# Collect the in-network-rates file URLs listed in each brand's metadata file
urls = []
for brand_code in tqdm(brand_codes):
    url = f'https://mrf.healthsparq.com/aetnacvs-egress.nophi.kyruushsq.com/prd/mrf/AETNACVS_I/{brand_code}/latest_metadata.json'
    resp = requests.get(url)

    file_list = resp.json()['files']
    file_paths = {file['fileName'] for file in file_list if file['fileSchema'] == 'IN_NETWORK_RATES'}
    urls.extend(resolve_urls(file_paths, brand_code))


async def fetch_url_sizes(table, urls):
    # HEAD each URL concurrently and record its content-length
    # (-1 if the server doesn't report one)
    async with aiohttp.ClientSession() as session:
        fs = [session.head(url) for url in urls]
        for f in asyncio.as_completed(fs):
            resp = await f
            url = str(resp.url)
            size = int(resp.headers.get('content-length', -1))
            cur.execute(f"INSERT OR IGNORE INTO {table} VALUES (?, ?)", (url, size))
            con.commit()

asyncio.run(fetch_url_sizes('in_network_files', urls))

total_gb = cur.execute("SELECT SUM(size) FROM in_network_files").fetchone()[0] / 1_000_000_000
print(f"Total size of in-network files: {total_gb:.1f} GB")
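# A usage sketch (not part of the original script): with the table populated,
# the database answers quick follow-up questions directly. Column names match
# the schema created above; size = -1 marks responses without a content-length.
n_files, n_unknown = cur.execute(
    "SELECT COUNT(*), SUM(size = -1) FROM in_network_files").fetchone()
print(f"{n_files} files indexed, {n_unknown} with unknown size")
for url, size in cur.execute(
        "SELECT url, size FROM in_network_files ORDER BY size DESC LIMIT 5"):
    print(f"{size / 1_000_000_000:6.1f} GB  {url}")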
--------------------------------------------------------------------------------
/aetna_data.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alecstein/transparency-in-coverage-filesizes/73d5f3dbb48e89897ed7eef950786e7c483651fd/aetna_data.db
--------------------------------------------------------------------------------
/bcbs.py:
--------------------------------------------------------------------------------
import asyncio
import sqlite3

import aiohttp
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

con = sqlite3.connect("bcbs_data.db")
cur = con.cursor()
cur.execute("CREATE TABLE IF NOT EXISTS index_files(url PRIMARY KEY UNIQUE, size)")
cur.execute("CREATE TABLE IF NOT EXISTS in_network_files(url PRIMARY KEY UNIQUE, size)")
cur.execute("CREATE TABLE IF NOT EXISTS fetched_index_files(url PRIMARY KEY UNIQUE)")

mrfs_url = 'https://www.bluecrossnc.com/about-us/policies-and-best-practices/transparency-coverage-mrf'

# Scrape the index-file (table-of-contents) links from the BCBS NC page
r = requests.get(mrfs_url)
soup = BeautifulSoup(r.content, features="lxml")

urls = []
for link in soup.find_all('a'):
    if (url := link.get('href')) is not None and 'index.json' in url:
        urls.append(url)


async def fetch_url_sizes(table, urls):
    # HEAD each URL concurrently and record its content-length
    # (-1 if the server doesn't report one)
    async with aiohttp.ClientSession() as session:
        fs = [session.head(url) for url in urls]
        for f in asyncio.as_completed(fs):
            resp = await f
            url = str(resp.url)
            size = int(resp.headers.get('content-length', -1))
            cur.execute(f"INSERT OR IGNORE INTO {table} VALUES (?, ?)", (url, size))
            con.commit()

# Get the sizes of all the index files linked from the main BCBS page
asyncio.run(fetch_url_sizes('index_files', urls))

# Work from smallest to largest (some are multiple GB)
index_file_urls = cur.execute("SELECT url FROM index_files ORDER BY size").fetchall()

for (url,) in tqdm(index_file_urls):

    # Skip index files already processed on a previous run
    if cur.execute("SELECT url FROM fetched_index_files WHERE url = ?", (url,)).fetchone() is not None:
        continue

    # stream=True defers the body download until .json() is called, so the
    # size check below only costs a header read
    resp = requests.get(url, stream=True)
    size_mb = int(resp.headers['Content-Length']) / 1_000_000

    if size_mb > 1_000:
        print(f"\nCannot download file of size: {size_mb} MB. Do this manually.")
        print(url)
        continue

    # Note: only the first reporting_structure entry is read here
    urls = [file['location'] for file in resp.json()['reporting_structure'][0]['in_network_files']]

    asyncio.run(fetch_url_sizes('in_network_files', urls))

    cur.execute("INSERT OR IGNORE INTO fetched_index_files VALUES (?)", (url,))
    con.commit()
--------------------------------------------------------------------------------
/bcbs_data.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alecstein/transparency-in-coverage-filesizes/73d5f3dbb48e89897ed7eef950786e7c483651fd/bcbs_data.db
--------------------------------------------------------------------------------
/bcbs_manual.py:
--------------------------------------------------------------------------------
import asyncio
import sqlite3

import aiohttp
import ijson

con = sqlite3.connect("bcbs_data.db")
cur = con.cursor()


async def fetch_url_sizes(table):

    # Index file too large for bcbs.py to fetch automatically; downloaded by hand
    filename = '/Users/alecstein/dolthub/bounties/transparency-in-coverage/bcbs/2022-07-27_blue-cross-and-blue-shield-of-north-carolina_index.json'

    # Stream-parse the multi-GB index file with ijson instead of json.load
    urls = []
    with open(filename) as f:
        url_objs = ijson.items(f, 'reporting_structure.item.in_network_files.item.location', multiple_values=True)
        try:
            for url in url_objs:
                urls.append(url)
        except ijson.common.IncompleteJSONError as e:
            # The file may be truncated; keep whatever parsed cleanly
            print(e)

    async with aiohttp.ClientSession() as session:
        fs = [session.head(url) for url in urls]

        for f in asyncio.as_completed(fs):
            resp = await f
            url = str(resp.url)
            size = int(resp.headers.get('content-length', -1))
            print(url, size)
            cur.execute(f"INSERT OR IGNORE INTO {table} VALUES (?, ?)", (url, size))
            con.commit()

# Record the sizes in the same table that bcbs.py populates
asyncio.run(fetch_url_sizes('in_network_files'))
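# A closing sketch (not part of the original scripts): once both databases are
# populated, the per-payer totals can be compared side by side. The paths
# assume the scripts were run from the repo root.
for db in ("aetna_data.db", "bcbs_data.db"):
    total = sqlite3.connect(db).execute(
        "SELECT SUM(size) FROM in_network_files").fetchone()[0] or 0
    print(f"{db}: {total / 1_000_000_000:.1f} GB of in-network files")
--------------------------------------------------------------------------------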