├── .gitignore
├── async
│   └── main.py
├── multiprocessing
│   └── main.py
├── requirements.txt
└── threading
    ├── .env-local
    ├── locking.py
    ├── main.py
    ├── pipelines
    │   └── wiki_yahoo_scraper_pipeline.yaml
    ├── workers
    │   ├── PostgresWorker.py
    │   ├── WikiWorker.py
    │   ├── YahooFinanceWorkers.py
    │   └── __init__.py
    └── yaml_reader.py

/.gitignore:
--------------------------------------------------------------------------------
.idea
venv

*.pyc
--------------------------------------------------------------------------------
/async/main.py:
--------------------------------------------------------------------------------
import asyncio
import time
import requests
import aiohttp


async def get_url_response(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            return await response.text()


async def main():
    urls = ['https://google.com',
            'https://wikipedia.org/wiki/Concurrency',
            'https://python.org',
            'https://pypi.org/project/requests/',
            'https://docs.python.org/3/library/asyncio-task.html',
            'https://www.apple.com/',
            'https://medium.com']

    # Fetch every URL one after the other with blocking requests calls.
    start = time.time()
    sync_text_response = []
    for url in urls:
        sync_text_response.append(requests.get(url).text)

    end_time = time.time()
    print('Requests time:', end_time - start)

    # Fetch the same URLs concurrently as aiohttp tasks.
    start = time.time()
    tasks = []
    for url in urls:
        tasks.append(asyncio.create_task(get_url_response(url)))

    async_text_response = await asyncio.gather(*tasks)

    end_time = time.time()
    print('Async requests time:', end_time - start)


if __name__ == '__main__':
    asyncio.run(main())
--------------------------------------------------------------------------------
/multiprocessing/main.py:
--------------------------------------------------------------------------------
from multiprocessing import Pool, cpu_count


def check_number_of_values_in_range(comp_list, lower, upper):
    # Count how many integers in [lower, upper) appear in comp_list.
    number_of_hits = 0
    for i in range(lower, upper):
        if i in comp_list:
            number_of_hits += 1
    return number_of_hits


if __name__ == '__main__':
    num_processes = 4
    comparison_list = [1, 2, 3]
    # Split 0..10**8 into four equally sized chunks of 25 million.
    lower_and_upper_bounds = [(0, 25*10**6), (25*10**6, 50*10**6),
                              (50*10**6, 75*10**6), (75*10**6, 10**8)]

    num_cpu_to_use = max(1, cpu_count() - 1)

    print('Number of cpus being used:', num_cpu_to_use)

    prepared_list = []
    for bounds in lower_and_upper_bounds:
        prepared_list.append((comparison_list, *bounds))

    print('List to use as input:', prepared_list)
    # starmap unpacks each (comp_list, lower, upper) tuple into the function arguments.
    # Creating the Pool under the __main__ guard keeps this safe on spawn-based
    # platforms (macOS, Windows).
    with Pool(num_cpu_to_use) as mp_pool:
        result = mp_pool.starmap(check_number_of_values_in_range, prepared_list)

    print(result)
--------------------------------------------------------------------------------
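For comparison, the same fan-out can also be written with the standard-library concurrent.futures API instead of multiprocessing.Pool. This is a minimal standalone sketch, not part of the repo; the counting function is inlined so the snippet runs on its own:

# Sketch: same work split as multiprocessing/main.py, using concurrent.futures.
from concurrent.futures import ProcessPoolExecutor


def count_values_in_range(comp_list, lower, upper):
    return sum(1 for i in range(lower, upper) if i in comp_list)


if __name__ == '__main__':
    comparison_list = [1, 2, 3]
    bounds = [(0, 25*10**6), (25*10**6, 50*10**6),
              (50*10**6, 75*10**6), (75*10**6, 10**8)]
    with ProcessPoolExecutor(max_workers=3) as executor:
        # submit() takes the arguments directly, so no tuple packing is needed.
        futures = [executor.submit(count_values_in_range, comparison_list, lower, upper)
                   for lower, upper in bounds]
        print([f.result() for f in futures])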
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
aiohttp==3.7.4.post0
async-timeout==3.0.1
attrs==21.2.0
beautifulsoup4==4.9.3
certifi==2021.5.30
chardet==4.0.0
greenlet==1.1.0
idna==2.10
importlib-metadata==4.6.0
lxml==4.6.3
multidict==5.1.0
psycopg2-binary==2.9.1
PyYAML==5.4.1
requests==2.25.1
soupsieve==2.2.1
SQLAlchemy==1.4.20
typing-extensions==3.10.0.0
urllib3==1.26.6
yarl==1.6.3
zipp==3.5.0
--------------------------------------------------------------------------------
/threading/.env-local:
--------------------------------------------------------------------------------
export PIPELINE_LOCATION='pipelines/wiki_yahoo_scraper_pipeline.yaml'
export PG_USER=''
export PG_PW=''
export PG_HOST='localhost'
export PG_DB='postgres'
--------------------------------------------------------------------------------
/threading/locking.py:
--------------------------------------------------------------------------------
import threading


counter = 0

lock = threading.Lock()

# Manual alternative to the "with lock:" context manager used below:
# lock.acquire()
# lock.release()


def increment():
    global counter
    for i in range(10**6):
        with lock:
            counter += 1
            # any other work that must happen under the lock goes here


threads = []
for i in range(4):
    x = threading.Thread(target=increment)
    threads.append(x)

for t in threads:
    t.start()

for t in threads:
    t.join()

print('Counter value:', counter)


# How the lock serialises the increments:
# counter = x
# thread 1 acquires the lock
# thread 1 reads counter = x
# thread 1 writes counter = counter + 1 -> counter = x + 1
# thread 1 releases the lock
# thread 2 acquires the lock
# thread 2 reads counter = x + 1
# thread 2 writes counter = counter + 1 -> counter = x + 1 + 1
# thread 2 releases the lock

# counter = x + 2
--------------------------------------------------------------------------------
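For contrast with locking.py above, here is a minimal sketch (not part of the repo) of the same four-thread increment without the lock. Because counter += 1 is a separate read and write, threads can interleave between the two steps and overwrite each other's updates, so the final value often comes out below 4,000,000:

import threading

counter = 0


def increment_unlocked():
    global counter
    for i in range(10**6):
        # Read-modify-write with no lock: updates from other threads can be lost.
        counter += 1


threads = [threading.Thread(target=increment_unlocked) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()

# Expected 4,000,000; lost updates frequently make it smaller.
print('Counter value without locking:', counter)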
--------------------------------------------------------------------------------
/threading/main.py:
--------------------------------------------------------------------------------
import time
import os

from yaml_reader import YamlPipelineExecutor


def main():
    pipeline_location = os.environ.get('PIPELINE_LOCATION')
    if pipeline_location is None:
        print('Pipeline location not defined')
        exit(1)
    scraper_start_time = time.time()

    yamlPipelineExecutor = YamlPipelineExecutor(pipeline_location=pipeline_location)
    yamlPipelineExecutor.start()
    yamlPipelineExecutor.join()
    print('Extraction took:', round(time.time() - scraper_start_time, 1), 'seconds')


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/threading/pipelines/wiki_yahoo_scraper_pipeline.yaml:
--------------------------------------------------------------------------------
queues:
  - name: SymbolQueue
    description: contains symbols to be scraped from Yahoo Finance

  - name: PostgresUploading
    description: contains data that needs to be uploaded to postgres

workers:
  - name: WikiWorker
    description: scrapes the raw Wikipedia page and pulls out symbols
    note: Only run one instance here, otherwise we scrape the same symbols multiple times
    location: workers.WikiWorker
    class: WikiWorkerMasterScheduler
    instances: 1  # Please don't change this, otherwise we do duplicate work, see note above
    input_values:
      - 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    output_queues:
      - SymbolQueue

  - name: YahooFinanceWorker
    description: pulls price data for a specific stock symbol from Yahoo Finance
    location: workers.YahooFinanceWorkers
    class: YahooFinancePriceScheduler
    instances: 2
    input_queue: SymbolQueue
    output_queues:
      - PostgresUploading

  - name: PostgresWorker
    description: takes stock data and saves it in postgres
    location: workers.PostgresWorker
    class: PostgresMasterScheduler
    instances: 6
    input_queue: PostgresUploading
--------------------------------------------------------------------------------
/threading/workers/PostgresWorker.py:
--------------------------------------------------------------------------------
import os
import threading

from queue import Empty

from sqlalchemy import create_engine
from sqlalchemy.sql import text


class PostgresMasterScheduler(threading.Thread):
    def __init__(self, input_queue, **kwargs):
        # This scheduler is a sink, so the output_queue the executor passes in is dropped.
        if 'output_queue' in kwargs:
            kwargs.pop('output_queue')
        super(PostgresMasterScheduler, self).__init__(**kwargs)
        self._input_queue = input_queue
        self.start()

    def run(self):
        while True:
            try:
                val = self._input_queue.get(timeout=10)
            except Empty:
                print('Timeout reached in postgres scheduler, stopping')
                break

            if val == 'DONE':
                break

            symbol, price, extracted_time = val
            postgresWorker = PostgresWorker(symbol, price, extracted_time)
            postgresWorker.insert_into_db()


class PostgresWorker():
    def __init__(self, symbol, price, extracted_time):
        self._symbol = symbol
        self._price = price
        self._extracted_time = extracted_time

        self._PG_USER = os.environ.get('PG_USER')
        self._PG_PW = os.environ.get('PG_PW')
        self._PG_HOST = os.environ.get('PG_HOST')
        self._PG_DB = os.environ.get('PG_DB')

        self._engine = create_engine(f'postgresql://{self._PG_USER}:{self._PG_PW}@{self._PG_HOST}/{self._PG_DB}')

    def _create_insert_query(self):
        SQL = """INSERT INTO prices (symbol, price, extracted_time) VALUES
                 (:symbol, :price, :extracted_time)"""
        return SQL

    def insert_into_db(self):
        insert_query = self._create_insert_query()

        with self._engine.connect() as conn:
            conn.execute(text(insert_query), {'symbol': self._symbol,
                                              'price': self._price,
                                              'extracted_time': str(self._extracted_time)})
--------------------------------------------------------------------------------
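The repo does not include a schema for the prices table that the INSERT above targets. The following is a minimal sketch of a compatible table created with the same SQLAlchemy setup; the column types are an assumption, not something defined anywhere in the repo. It would be run once, with the same environment variables as .env-local, before starting the pipeline:

# Sketch only: a guessed prices table matching PostgresWorker's INSERT columns.
import os

from sqlalchemy import create_engine
from sqlalchemy.sql import text

engine = create_engine(
    f"postgresql://{os.environ.get('PG_USER')}:{os.environ.get('PG_PW')}"
    f"@{os.environ.get('PG_HOST')}/{os.environ.get('PG_DB')}")

# engine.begin() opens a transaction and commits it when the block exits.
with engine.begin() as conn:
    conn.execute(text("""
        CREATE TABLE IF NOT EXISTS prices (
            symbol         TEXT,
            price          NUMERIC,
            extracted_time TIMESTAMP
        )
    """))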
--------------------------------------------------------------------------------
/threading/workers/WikiWorker.py:
--------------------------------------------------------------------------------
import threading

import requests
from bs4 import BeautifulSoup


class WikiWorkerMasterScheduler(threading.Thread):
    def __init__(self, output_queue, **kwargs):
        # This scheduler is a source, so the input_queue the executor passes in is dropped.
        if 'input_queue' in kwargs:
            kwargs.pop('input_queue')

        self._input_values = kwargs.pop('input_values')

        temp_queue = output_queue
        if not isinstance(temp_queue, list):
            temp_queue = [temp_queue]
        self._output_queues = temp_queue
        super(WikiWorkerMasterScheduler, self).__init__(**kwargs)
        self.start()

    def run(self):
        for entry in self._input_values:
            wikiWorker = WikiWorker(entry)
            symbol_counter = 0
            for symbol in wikiWorker.get_sp_500_companies():
                for output_queue in self._output_queues:
                    output_queue.put(symbol)
                symbol_counter += 1
                # Only push the first five symbols to keep the demo run short.
                if symbol_counter >= 5:
                    break


class WikiWorker():
    def __init__(self, url):
        self._url = url

    @staticmethod
    def _extract_company_symbols(page_html):
        soup = BeautifulSoup(page_html, 'lxml')
        table = soup.find(id='constituents')
        table_rows = table.find_all('tr')
        # Skip the header row, then yield the ticker from the first cell of each row.
        for table_row in table_rows[1:]:
            symbol = table_row.find('td').text.strip('\n')
            yield symbol

    def get_sp_500_companies(self):
        response = requests.get(self._url)
        if response.status_code != 200:
            print("Couldn't get entries")
            return

        yield from self._extract_company_symbols(response.text)
--------------------------------------------------------------------------------
/threading/workers/YahooFinanceWorkers.py:
--------------------------------------------------------------------------------
import threading
import datetime
import random
import time
from queue import Empty

import requests
from lxml import html


class YahooFinancePriceScheduler(threading.Thread):
    def __init__(self, input_queue, output_queue, **kwargs):
        super(YahooFinancePriceScheduler, self).__init__(**kwargs)
        self._input_queue = input_queue
        temp_queue = output_queue
        if not isinstance(temp_queue, list):
            temp_queue = [temp_queue]
        self._output_queues = temp_queue
        self.start()

    def run(self):
        while True:
            try:
                val = self._input_queue.get(timeout=10)
            except Empty:
                print('Yahoo scheduler queue is empty, stopping')
                break
            if val == 'DONE':
                break

            yahooFinancePriceWorker = YahooFinancePriceWorker(symbol=val)
            price = yahooFinancePriceWorker.get_price()
            for output_queue in self._output_queues:
                output_values = (val, price, datetime.datetime.utcnow())
                output_queue.put(output_values)
            # Small random pause between symbols so Yahoo Finance isn't hammered.
            time.sleep(random.random())


class YahooFinancePriceWorker():
    def __init__(self, symbol):
        self._symbol = symbol
        base_url = 'https://finance.yahoo.com/quote/'
        self._url = f'{base_url}{self._symbol}'

    def get_price(self):
        r = requests.get(self._url)
        if r.status_code != 200:
            return
        page_contents = html.fromstring(r.text)
        raw_price = page_contents.xpath('//*[@id="quote-header-info"]/div[3]/div[1]/div/span[1]')[0].text
        price = float(raw_price.replace(',', ''))
        return price
--------------------------------------------------------------------------------
/threading/workers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingwithmax/threading-tutorial/72ce51a9cfa68a292281fd6f53b54c6027b6411a/threading/workers/__init__.py
--------------------------------------------------------------------------------
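All three schedulers above follow the same contract: block on an input queue with a timeout, stop when a 'DONE' sentinel arrives, and push results to every output queue. Below is a minimal standalone sketch of that producer/consumer pattern, independent of the repo's classes; the yaml_reader that follows wires the real schedulers together this way from the pipeline file:

import threading
from queue import Queue, Empty


def consumer(input_queue, output_queues):
    while True:
        try:
            val = input_queue.get(timeout=10)
        except Empty:
            break                      # stop if nothing arrives for a while
        if val == 'DONE':
            break                      # upstream signalled there is no more work
        for output_queue in output_queues:
            output_queue.put(val * 2)  # stand-in for the real scraping/uploading work


in_q, out_q = Queue(), Queue()
worker = threading.Thread(target=consumer, args=(in_q, [out_q]))
worker.start()

for item in range(5):
    in_q.put(item)
in_q.put('DONE')  # one sentinel per consumer of the queue
worker.join()

print([out_q.get() for _ in range(out_q.qsize())])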
--------------------------------------------------------------------------------
/threading/yaml_reader.py:
--------------------------------------------------------------------------------
import importlib
import threading
import time

import yaml
from multiprocessing import Queue


class YamlPipelineExecutor(threading.Thread):
    def __init__(self, pipeline_location):
        super(YamlPipelineExecutor, self).__init__()
        self._pipeline_location = pipeline_location
        self._queues = {}
        self._workers = {}
        self._queue_consumers = {}
        self._downstream_queues = {}

    def _load_pipeline(self):
        with open(self._pipeline_location, 'r') as inFile:
            self._yaml_data = yaml.safe_load(inFile)

    def _initialize_queues(self):
        for queue in self._yaml_data['queues']:
            queue_name = queue['name']
            self._queues[queue_name] = Queue()

    def _initialize_workers(self):
        for worker in self._yaml_data['workers']:
            # Resolve the worker class from the module path and class name in the yaml.
            WorkerClass = getattr(importlib.import_module(worker['location']), worker['class'])
            input_queue = worker.get('input_queue')
            output_queues = worker.get('output_queues')
            worker_name = worker['name']
            num_instances = worker.get('instances', 1)

            self._downstream_queues[worker_name] = output_queues
            if input_queue is not None:
                self._queue_consumers[input_queue] = num_instances
            init_params = {
                'input_queue': self._queues[input_queue] if input_queue is not None else None,
                'output_queue': [self._queues[output_queue] for output_queue in output_queues]
                    if output_queues is not None else None
            }

            input_values = worker.get('input_values')
            if input_values is not None:
                init_params['input_values'] = input_values

            self._workers[worker_name] = []
            for i in range(num_instances):
                self._workers[worker_name].append(WorkerClass(**init_params))

    def _join_workers(self):
        for worker_name in self._workers:
            for worker_thread in self._workers[worker_name]:
                worker_thread.join()

    def process_pipeline(self):
        self._load_pipeline()
        self._initialize_queues()
        self._initialize_workers()
        # self._join_workers()

    def run(self):
        self.process_pipeline()

        while True:
            total_workers_alive = 0
            worker_stats = []
            to_del = []
            for worker_name in self._workers:
                total_worker_threads_alive = 0
                for worker_thread in self._workers[worker_name]:
                    if worker_thread.is_alive():
                        total_worker_threads_alive += 1
                total_workers_alive += total_worker_threads_alive
                if total_worker_threads_alive == 0:
                    # All instances of this worker are done, so tell each consumer of its
                    # downstream queues to stop by sending one 'DONE' per consumer.
                    if self._downstream_queues[worker_name] is not None:
                        for output_queue in self._downstream_queues[worker_name]:
                            number_of_consumers = self._queue_consumers[output_queue]
                            for i in range(number_of_consumers):
                                self._queues[output_queue].put('DONE')

                    to_del.append(worker_name)

                worker_stats.append([worker_name, total_worker_threads_alive])
            print(worker_stats)
            if total_workers_alive == 0:
                break

            # queue_stats = []
            # for queue in self._queues:
            #     queue_stats.append([queue, self._queues[queue].qsize()])
            #
            # print(queue_stats)

            for worker_name in to_del:
                del self._workers[worker_name]

            time.sleep(1)
--------------------------------------------------------------------------------
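To add a new stage to the pipeline, a worker only has to honour the constructor and queue contract that YamlPipelineExecutor expects: accept input_queue and output_queue keyword arguments, start itself, and stop on the 'DONE' sentinel. The following is a minimal sketch of such a worker; the class and its behaviour are hypothetical and are shown only to illustrate the contract, not something present in the repo:

# Sketch only: a hypothetical worker class showing the minimum the executor expects.
# It would be referenced from the pipeline yaml through the location / class /
# instances / input_queue keys, like the workers above.
import threading
from queue import Empty


class UpperCaseScheduler(threading.Thread):
    def __init__(self, input_queue, output_queue=None, **kwargs):
        super(UpperCaseScheduler, self).__init__(**kwargs)
        self._input_queue = input_queue
        # The executor passes either a list of queues or None.
        self._output_queues = output_queue if isinstance(output_queue, list) else [output_queue]
        # Workers start themselves; the executor only instantiates them.
        self.start()

    def run(self):
        while True:
            try:
                val = self._input_queue.get(timeout=10)
            except Empty:
                break  # nothing arrived for a while, shut down
            if val == 'DONE':
                break  # executor signalled that upstream workers are finished
            for output_queue in self._output_queues:
                if output_queue is not None:
                    output_queue.put(val.upper())  # stand-in for real work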