"""Scrape the BLS average-energy-prices table into a dated CSV and download
one XLSX file per series id via the BLS form-POST endpoint.

Side-effect files: index.html (page cache), data_<MM_DD_YYYY>.csv,
ids.txt, and xlsx_files/<series_id>.xlsx.
"""
import csv
import os
from datetime import datetime

import requests
from bs4 import BeautifulSoup

from proxy_config import login, password, proxy

# download_xlsx() posts with verify=False, so silence urllib3's
# InsecureRequestWarning noise up front.
requests.packages.urllib3.disable_warnings()

# Browser-like headers for the initial page fetch.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:97.0) Gecko/20100101 Firefox/97.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'
}

# Authenticated HTTP proxy used for every request (credentials live in proxy_config.py).
proxies = {
    'https': f'http://{login}:{password}@{proxy}'
}


def get_data(url):
    """Fetch *url*, export its energy-prices table to data_<date>.csv and
    collect the BLS series ids (from cell links) into ids.txt.

    :param url: page containing the table with id 'ro5xgenergy'
    :return: 'Work done!' on completion
    """
    cur_date = datetime.now().strftime('%m_%d_%Y')
    response = requests.get(url=url, headers=headers, proxies=proxies, timeout=60)
    print(response)

    # Keep a local copy of the page for debugging / offline re-parsing.
    with open('index.html', mode='w', encoding='utf-8') as file:
        file.write(response.text)

    # Parse the in-memory response directly; no need to re-read index.html.
    soup = BeautifulSoup(response.text, 'lxml')
    table = soup.find('table', id='ro5xgenergy')

    # The last <tr> of <thead> carries the column names; 'Area' labels the
    # leading <th> of each body row.
    data_th = table.find('thead').find_all('tr')[-1].find_all('th')
    table_headers = ['Area'] + [th.text.strip() for th in data_th]

    ids = []
    rows = []
    for tr in table.find('tbody').find_all('tr'):
        row = [tr.find('th').text.strip()]  # area name leads the row
        for cell in tr.find_all('td'):
            link = cell.find('a')
            span = cell.find('span')
            if link:
                # Linked cells point at a series page; the CSV keeps the raw
                # href while the series id (path segment 4, query stripped)
                # goes to ids.txt for download_xlsx().
                value = link.get('href')
                ids.append(value.split('/')[4].split('?')[0])
            elif span:
                value = span.text.strip()
            else:
                value = 'None'
            row.append(value)
        rows.append(row)

    # newline='' is required by the csv module so rows aren't double-spaced
    # on Windows; write header + all rows in one pass instead of reopening
    # the file in append mode per row.
    with open(f'data_{cur_date}.csv', mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(table_headers)
        writer.writerows(rows)

    with open('ids.txt', mode='w', encoding='utf-8') as file:
        file.writelines(f'{series_id}\n' for series_id in ids)

    return 'Work done!'


def download_xlsx(file_path='ids.txt'):
    """POST each series id from *file_path* to the BLS SurveyOutputServlet
    and save the returned workbook as xlsx_files/<series_id>.xlsx.

    :param file_path: text file with one BLS series id per line
    """
    with open(file_path, encoding='utf-8') as file:
        ids = [line.strip() for line in file]

    # Ensure the output directory exists; the original crashed with
    # FileNotFoundError when xlsx_files/ was absent.
    os.makedirs('xlsx_files', exist_ok=True)

    # Browser-mimicking headers for the form POST — loop-invariant, so
    # built once instead of per iteration.
    post_headers = {
        'Host': 'data.bls.gov',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:97.0) Gecko/20100101 Firefox/97.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Origin': 'https://data.bls.gov',
        'Dnt': '1',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Te': 'trailers',
        'Connection': 'close',
    }

    total = len(ids)
    for i, series_id in enumerate(ids, start=1):
        # Pre-encoded form body expected by the servlet (only series_id varies).
        payload = (
            'request_action=get_data&reformat=true&from_results_page=true'
            '&years_option=specific_years&delimiter=comma&output_type=multi'
            '&periods_option=all_periods&output_view=data&output_format=excelTable'
            '&original_output_type=default&annualAveragesRequested=false'
            f'&series_id={series_id}'
        )

        response = requests.post(
            'https://data.bls.gov/pdq/SurveyOutputServlet',
            headers=post_headers,
            data=payload,
            verify=False,
            proxies=proxies,
            timeout=120,
        )

        with open(f'xlsx_files/{series_id}.xlsx', mode='wb') as file:
            file.write(response.content)

        print(f'{i}/{total}')


def main():
    # Run get_data() first to (re)build ids.txt, then download the workbooks.
    # print(get_data(url='https://www.bls.gov/regions/midwest/data/AverageEnergyPrices_SelectedAreas_Table.htm'))
    download_xlsx()


if __name__ == '__main__':
    main()
-------------------------------------------------------------------------------- /proxy_config.py: -------------------------------------------------------------------------------- 1 | login = 'login' 2 | password = 'password' 3 | proxy = 'proxy:port' 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.10.0 2 | lxml==4.8.0 3 | requests==2.27.1 --------------------------------------------------------------------------------