├── README.md
├── aetna.py
├── aetna_data.db
├── bcbs.py
├── bcbs_data.db
└── bcbs_manual.py

/README.md:
--------------------------------------------------------------------------------
# transparency-in-coverage-filesizes

Scripts that estimate the total size of the Transparency in Coverage
machine-readable files (MRFs) published by Aetna and Blue Cross Blue Shield of
North Carolina. Each script collects in-network-rate file URLs, issues HEAD
requests, and records each file's content-length in a SQLite database.

--------------------------------------------------------------------------------
/aetna.py:
--------------------------------------------------------------------------------
import asyncio
import sqlite3

import aiohttp
import requests
from tqdm import tqdm

con = sqlite3.connect("aetna_data.db")
cur = con.cursor()
cur.execute("CREATE TABLE IF NOT EXISTS in_network_files (url PRIMARY KEY UNIQUE, size)")

# The following values were inferred by inspecting the network requests on the
# pages linked from here:
# https://www.aetna.com/individuals-families/member-rights-resources/rights/disclosure-information.html
brand_codes = ['ASH', 'AETNACVS', 'ALICUNDER100', 'ALICFI']

def resolve_urls(file_paths, brand_code):
    return [
        f'https://mrf.healthsparq.com/aetnacvs-egress.nophi.kyruushsq.com/prd/mrf/AETNACVS_I/{brand_code}/2022-08-05/inNetworkRates/{file_path}'
        for file_path in file_paths]

# Collect the in-network-rates file URLs listed in each brand's metadata file
urls = []
for brand_code in tqdm(brand_codes):
    url = f'https://mrf.healthsparq.com/aetnacvs-egress.nophi.kyruushsq.com/prd/mrf/AETNACVS_I/{brand_code}/latest_metadata.json'
    resp = requests.get(url)

    file_list = resp.json()['files']
    file_paths = {file['fileName'] for file in file_list if file['fileSchema'] == 'IN_NETWORK_RATES'}
    urls.extend(resolve_urls(file_paths, brand_code))


async def fetch_url_sizes(table, urls):
    # HEAD each URL concurrently and record its content-length
    # (-1 if the server doesn't report one)
    async with aiohttp.ClientSession() as session:
        fs = [session.head(url) for url in urls]
        for f in asyncio.as_completed(fs):
            resp = await f
            url = str(resp.url)
            size = int(resp.headers.get('content-length', -1))
            cur.execute(f"INSERT OR IGNORE INTO {table} VALUES (?, ?)", (url, size))
            con.commit()

asyncio.run(fetch_url_sizes('in_network_files', urls))

total_gb = cur.execute("SELECT SUM(size) FROM in_network_files").fetchone()[0] / 1_000_000_000
print(f"Total size of in-network files: {total_gb:.1f} GB")
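# A usage sketch (not part of the original script): with the table populated,
# the database answers quick follow-up questions directly. Column names match
# the schema created above; size = -1 marks responses without a content-length.
n_files, n_unknown = cur.execute(
    "SELECT COUNT(*), SUM(size = -1) FROM in_network_files").fetchone()
print(f"{n_files} files indexed, {n_unknown} with unknown size")
for url, size in cur.execute(
        "SELECT url, size FROM in_network_files ORDER BY size DESC LIMIT 5"):
    print(f"{size / 1_000_000_000:6.1f} GB  {url}")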
--------------------------------------------------------------------------------
/aetna_data.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alecstein/transparency-in-coverage-filesizes/73d5f3dbb48e89897ed7eef950786e7c483651fd/aetna_data.db
--------------------------------------------------------------------------------
/bcbs.py:
--------------------------------------------------------------------------------
import asyncio
import sqlite3

import aiohttp
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

con = sqlite3.connect("bcbs_data.db")
cur = con.cursor()
cur.execute("CREATE TABLE IF NOT EXISTS index_files(url PRIMARY KEY UNIQUE, size)")
cur.execute("CREATE TABLE IF NOT EXISTS in_network_files(url PRIMARY KEY UNIQUE, size)")
cur.execute("CREATE TABLE IF NOT EXISTS fetched_index_files(url PRIMARY KEY UNIQUE)")

mrfs_url = 'https://www.bluecrossnc.com/about-us/policies-and-best-practices/transparency-coverage-mrf'

# Scrape the index-file (table-of-contents) links from the BCBS NC page
r = requests.get(mrfs_url)
soup = BeautifulSoup(r.content, features="lxml")

urls = []
for link in soup.find_all('a'):
    if (url := link.get('href')) is not None and 'index.json' in url:
        urls.append(url)


async def fetch_url_sizes(table, urls):
    # HEAD each URL concurrently and record its content-length
    # (-1 if the server doesn't report one)
    async with aiohttp.ClientSession() as session:
        fs = [session.head(url) for url in urls]
        for f in asyncio.as_completed(fs):
            resp = await f
            url = str(resp.url)
            size = int(resp.headers.get('content-length', -1))
            cur.execute(f"INSERT OR IGNORE INTO {table} VALUES (?, ?)", (url, size))
            con.commit()

# Get the sizes of all the index files linked from the main BCBS page
asyncio.run(fetch_url_sizes('index_files', urls))

# Work from smallest to largest (some are multiple GB)
index_file_urls = cur.execute("SELECT url FROM index_files ORDER BY size").fetchall()

for (url,) in tqdm(index_file_urls):

    # Skip index files already processed on a previous run
    if cur.execute("SELECT url FROM fetched_index_files WHERE url = ?", (url,)).fetchone() is not None:
        continue

    # stream=True defers the body download until .json() is called, so the
    # size check below only costs a header read
    resp = requests.get(url, stream=True)
    size_mb = int(resp.headers['Content-Length']) / 1_000_000

    if size_mb > 1_000:
        print(f"\nCannot download file of size: {size_mb} MB. Do this manually.")
        print(url)
        continue

    # Note: only the first reporting_structure entry is read here
    urls = [file['location'] for file in resp.json()['reporting_structure'][0]['in_network_files']]

    asyncio.run(fetch_url_sizes('in_network_files', urls))

    cur.execute("INSERT OR IGNORE INTO fetched_index_files VALUES (?)", (url,))
    con.commit()
--------------------------------------------------------------------------------
/bcbs_data.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alecstein/transparency-in-coverage-filesizes/73d5f3dbb48e89897ed7eef950786e7c483651fd/bcbs_data.db
--------------------------------------------------------------------------------
/bcbs_manual.py:
--------------------------------------------------------------------------------
import asyncio
import sqlite3

import aiohttp
import ijson

con = sqlite3.connect("bcbs_data.db")
cur = con.cursor()


async def fetch_url_sizes(table):

    # Index file too large for bcbs.py to fetch automatically; downloaded by hand
    filename = '/Users/alecstein/dolthub/bounties/transparency-in-coverage/bcbs/2022-07-27_blue-cross-and-blue-shield-of-north-carolina_index.json'

    # Stream-parse the multi-GB index file with ijson instead of json.load
    urls = []
    with open(filename) as f:
        url_objs = ijson.items(f, 'reporting_structure.item.in_network_files.item.location', multiple_values=True)
        try:
            for url in url_objs:
                urls.append(url)
        except ijson.common.IncompleteJSONError as e:
            # The file may be truncated; keep whatever parsed cleanly
            print(e)

    async with aiohttp.ClientSession() as session:
        fs = [session.head(url) for url in urls]

        for f in asyncio.as_completed(fs):
            resp = await f
            url = str(resp.url)
            size = int(resp.headers.get('content-length', -1))
            print(url, size)
            cur.execute(f"INSERT OR IGNORE INTO {table} VALUES (?, ?)", (url, size))
            con.commit()

# Record the sizes in the same table that bcbs.py populates
asyncio.run(fetch_url_sizes('in_network_files'))
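# A closing sketch (not part of the original scripts): once both databases are
# populated, the per-payer totals can be compared side by side. The paths
# assume the scripts were run from the repo root.
for db in ("aetna_data.db", "bcbs_data.db"):
    total = sqlite3.connect(db).execute(
        "SELECT SUM(size) FROM in_network_files").fetchone()[0] or 0
    print(f"{db}: {total / 1_000_000_000:.1f} GB of in-network files")
--------------------------------------------------------------------------------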